abp竞赛之文本文件词频查询优化报告Word文档格式.docx

资源描述

abp竞赛之文本文件词频查询优化报告Word文档格式.docx

《abp竞赛之文本文件词频查询优化报告Word文档格式.docx》由会员分享，可在线阅读，更多相关《abp竞赛之文本文件词频查询优化报告Word文档格式.docx（19页珍藏版）》请在冰豆网上搜索。

abp竞赛之文本文件词频查询优化报告Word文档格式.docx

评判标准：

正确性+速度

截止时间：

2003年10月11日前（含）

方法：

每个人可以多次提交。

每次提交完了，我会告诉你你的成绩和最快的人的成绩。

内容：

一个文件，仅由大小写字母，空格和换行符组成。

我们称一个词为连续的大小写字母序列，其两侧是空格、换行符或者文件头/尾。

词大小写敏感。

某个词的词频是这个词在这个文件里面出现的次数。

要求，输入一个文件（至少有一个词，并且最大词频的词只有一个），输出那个词频最大的词。

譬如，输入：

aaa

bbb

ccc

ddd

输出：

补充一句：

文件可能非常大。

（几百 MB，甚至几 GB）

还有就是，文件中不会出现TAB。

（测试程序的时候，我们将vc目录中的源代码文件合成了一个数据文件来作为测试数据）

一个“标准”C++实现版本：

（可以作为一个STL使用的实例）：

#ifdef _MSC_VER
#pragma warning(disable:4786)  // VC6: long STL identifiers truncated in debug info
#endif

#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <time.h>

using namespace std;

// Reference solution: counts every whitespace-separated word of the file named
// by argv[1] in a std::map and prints the word with the highest count, followed
// by the elapsed CPU time in seconds.
//
// Returns 0 on success, 1 when the argument is missing or the file cannot be
// opened.
int main(int argc, char* argv[])
{
    // The listing only carried a commented-out assert(argc==2); check for real
    // so that argv[1] is never read when it does not exist.
    if (argc != 2)
    {
        cerr << "usage: " << (argc > 0 ? argv[0] : "wordfreq") << " <file>" << endl;
        return 1;
    }

    clock_t start = clock();

    const char* file_name = argv[1];
    ifstream in_file(file_name);
    if (!in_file)
    {
        cerr << "cannot open " << file_name << endl;
        return 1;
    }

    map<string, int> word_table;
    string max_word;
    long max_count = 0;

    string word;
    while (in_file >> word)
    {
        // operator[] inserts a zero count on first sight, so the incremented
        // value is this word's running total.
        long old_count_inc = (++word_table[word]);
        if (old_count_inc > max_count)
        {
            max_word = word;
            max_count = old_count_inc;
        }
    }

    cout << "Word: " << max_word << " Count: " << max_count << endl;
    cout << "Seconds " << ((double)(clock() - start) / CLOCKS_PER_SEC) << endl;
    return 0;
}

我用的测试编译器是 VC 6.0，CPU 为赛扬 2.0 GHz。

下面的代码很多时候速度是上面版本的20倍，源代码如下（优化说明在代码之后）：

（我以前提交的代码使用了MFC库，为了容易编译和理解，我做了一些代码调整,去除MFC依赖，把一个复杂的代码循环展开删除了，可能慢了10%）

stdio.h>

vector>

algorithm>

class

CMyAllot

enum

chunk_size=1024*256

};

//块大小

_cur;

_end;

std:

vector<

char*>

_vector;

void*

_new_else（unsigned

size）;

public:

CMyAllot（）

_end（0）,_cur（0）

virtual

~CMyAllot（）

（!

_vector.empty（））

DelAll（）;

inline

_fastcall

New（unsigned

size）

size=（（size+3）>

2）;

//4字节边界对齐

（（int）size<

（_end-_cur））//够用

result=_cur;

_cur+=size;

result;

else

//不够用

_new_else（size）;

void

DelAll（）

for

（int

i=0;

（int）_vector.size（）;

++i）

delete

[]

（_vector[i]）;

_vector.clear（）;

CMyAllot:

（size>

（chunk_size>

2））//不够用，而且需要的空间较大

result=new

char[size];

old_back=_vector.back（）;

_vector[_vector.size（）-1]=result;

_vector.push_back（old_back）;

//不够用，开辟新的空间

char[chunk_size];

_cur=result+size;

_end=result+chunk_size;

_vector.push_back（result）;

// Node type used by the hash table (singly linked chain per bucket).
struct TNode
{
    TNode* pNext;        // next node in the same bucket, 0 at the end
    unsigned int count;  // number of occurrences of the word
    char str[1];         // NUL-terminated word; the node is allocated with
                         // extra room, so this is usually longer than 1 byte

    // Ordering used when results are returned: higher count first, ties
    // broken by ascending word.
    struct TComp
    {
        // const so the comparator also works through a const object or
        // reference.
        bool operator()(const TNode* l, const TNode* r) const
        {
            if ((l->count) == (r->count))
                return std::string(&l->str[0]) < std::string(&r->str[0]);
            return (l->count) > (r->count);
        }
    };
};

// Hash of the characters in [begin, end): result = 5*result + c per character.
// The multiplier 5 lets x86 compute each step with one
// `lea reg0,[reg1*4+reg1]`. An empty range hashes to 0; the original do/while
// read *begin before testing the range.
inline unsigned int hash_value(char* begin, char* end)
{
    unsigned int result = 0;
    for (; begin != end; ++begin)
        result = 5 * result + (*begin);
    return result;
}

// Same hash for a NUL-terminated string, so a stored word and the range it was
// created from land in the same bucket. An empty string hashes to 0; the
// original do/while stepped past the terminator in that case.
inline unsigned int hash_value(char* pstr)
{
    unsigned int result = 0;
    for (; *pstr; ++pstr)
        result = 5 * result + (*pstr);
    return result;
}

// Tests whether the characters in [begin, end) are exactly the NUL-terminated
// string `str`. For a case-insensitive table, change this function and the
// hash functions together.
//
// The stored string must also END where the range ends. Returning true as soon
// as the range is exhausted would make "ab" match a stored "abc" when both fall
// into the same bucket.
inline bool test_str_EQ(char* begin, char* end, char* str)
{
    for (; begin != end; ++begin, ++str)
    {
        // A shorter stored string fails here too: its terminator differs from
        // every character a word can contain.
        if ((*begin) != (*str))
            return false;
    }
    return (*str) == '\0';
}

CHashSet

typedef

TNode*>

base_t;

hash_index（char*

hash_value（begin,end）&

（_hash_mask）;

hash_value（pstr）&

resize（）;

move_insert（base_t&

v,TNode*

pOldNode）

const;

NewNode（char*

end）;

Sort（base_t&

v,unsigned

sortCount）;

_hash_power;

_hash_mask;

_node_count;

base_t

_vbase;

_allot;

else_insert（TNode*

pNode,char*

CHashSet（）;

~CHashSet（）;

size（）

sum（）;

insert（char*

GetStrList（std:

ostream&

cout,unsigned

CHashSet:

CHashSet（）

_hash_power

（2）,_vbase（（unsigned

int）（_hash_power）,（TNode*）0）//注意次序

_node_count=0;

_hash_mask=_hash_power-1;

//_hash_power=1<

~CHashSet（）

_allot.DelAll（）;

sum（）

sum=0;

if（_node_count>

0）

base_t:

iterator

end=_vbase.end（）;

（base_t:

i=_vbase.begin（）;

end;

pNode=（*i）;

while

（pNode!

=0）

sum+=pNode->

pNode=pNode->

sum;

index=hash_index（begin,end）;

pNode=_vbase[index];

pNode）//节点还没有使用

_vbase[index]=NewNode（begin,end）;

++_node_count;

（test_str_EQ（begin,end,pNode->

str））//累加

++（pNode->

else_insert（pNode,begin,end）;

（true）

if（!

（pNode->

pNext））

pNode->

pNext=NewNode（begin,end）;

=（_hash_power））

break;

pNext->

str））

TNode*&

pNode

v[hash_index（pOldNode->

str）];

pOldNode->

pNext=0;

pNode=pOldNode;

pNext）

pNext=pOldNode;

pListNode=pNode->

（pListNode->

pNext!

pListNode=pListNode->

pListNode->

pNode=（TNode*）（_allot.New（sizeof（TNode）+end-begin））;

count=1;

i=pNode->

str;

++i,++begin）

（*i）=（*begin）;

++i,++begin;

while（begin!

（*i）=char（0）;

pNode;

resize（）

_hash_power<

=2;

_hash_mask=（_hash_power）-1;

new_vbase（_hash_power,（TNode*）0）;

temp=pNode->

move_insert（new_vbase,pNode）;

pNode=temp;

_vbase.swap（new_vbase）;

////

sortCount）

（sortCount==1）

v.resize

（1）;

maxNode=_vbase[0];

TNode:

TComp

op;

（maxNode==0）||（op（pNode,maxNode））

maxNode=pNode;

v[0]=maxNode;

v.resize（_node_count）;

index=0;

TNode**

end=&

（TNode**

i=&

（_vbase[0]）;

v[index]=pNode;

++index;

partial_sort（v.begin（）,v.begin（）+sortCount,v.end（）,TNo

展开阅读全文

abp竞赛之文本文件词频查询 优化报告Word文档格式.docx

abp竞赛之文本文件词频查询优化报告Word文档格式.docx