《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx

上传人:b****5 文档编号:20470719 上传时间:2023-01-23 格式:DOCX 页数:23 大小:21.02KB
下载 相关 举报
《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx_第1页
第1页 / 共23页
《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx_第2页
第2页 / 共23页
《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx_第3页
第3页 / 共23页
《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx_第4页
第4页 / 共23页
《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx_第5页
第5页 / 共23页
点击查看更多>>
下载资源
资源描述

《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx

《《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx》由会员分享,可在线阅读,更多相关《《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx(23页珍藏版)》请在冰豆网上搜索。

《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx

System.out.println(uri);

}

9.6.2 Page类

packagecn.edu.bjfu.search.page;

/**
 * Mutable holder for the data kept about one extracted web page: its URL,
 * title, summary, text content and score.
 *
 * NOTE(review): the extracted listing had lost all inter-token whitespace and
 * every closing brace; both have been restored here.
 */
public class Page {

    private String url;

    private String title;

    private String summary;

    private String context;

    private int score;

    /** Creates a page with no URL, title, summary or content and a score of 10. */
    public Page() {
        url = null;
        title = null;
        summary = null;
        context = null;
        score = 10;
    }

    /** @return the page URL, or {@code null} if none has been set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the page title, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the page summary, or {@code null} if none has been set */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the page summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the page content, or {@code null} if none has been set */
    public String getContext() {
        return this.context;
    }

    /**
     * Sets the page content.
     *
     * The capitalised name breaks the setter naming used by the rest of this
     * class, but it is kept because Extractor calls it under this name.
     *
     * @param context the page content; may be {@code null}
     */
    public void SetContext(String context) {
        this.context = context;
    }

    /**
     * Conventionally named equivalent of {@link #SetContext(String)}.
     *
     * @param context the page content; may be {@code null}
     */
    public void setContext(String context) {
        this.context = context;
    }

    /** @return the page score */
    public int getScore() {
        return this.score;
    }

    /** @param score the page score */
    public void setScore(int score) {
        this.score = score;
    }
}

9.6.3 Extractor类

packagecn.edu.bjfu.search.extractor;

importorg.htmlparser.*;

importorg.htmlparser.util.*;

importorg.htmlparser.visitors.*;

importorg.htmlparser.nodes.*;

importorg.htmlparser.tags.*;

importcn.edu.bjfu.search.page.*;

importcn.edu.bjfu.search.util.*;

publicclassExtractorimplementsRunnable{

privateStringfilename;

privateParserparser;

privatePagepage;

privateStringencode;

publicvoidsetEncode(Stringencode){

this.encode=encode;

privateStringcombineNodeText(Node[]nodes){

StringBufferbuffer=newStringBuffer();

for(inti=0;

i<

nodes.length;

i++){

Nodeanode=(Node)nodes[i];

Stringline=null;

if(anodeinstanceofTextNode){

TextNodetextnode=(TextNode)anode;

line=textnode.getText();

}

elseif(anodeinstanceofLinkTag){

LinkTaglinknode=(LinkTag)anode;

line=linknode.getLinkText();

elseif(anodeinstanceofDiv){

if(anode.getChildren()!

=null){

line=combineNodeText(anode.getChildren().toNodeArray());

elseif(anodeinstanceofParagraphTag){

elseif(anodeinstanceofSpan){

elseif(anodeinstanceofTableTag){

elseif(anodeinstanceofTableRow){

elseif(anodeinstanceofTableColumn){

if(line!

buffer.append(line);

returnbuffer.toString();

privateStringgetUrl(Stringfilename){

Stringurl=filename;

url=url.replace(ProperConfig.getValue("

mirror.path"

),"

);

if(url.lastIndexOf("

/"

)==url.length()-1){

url=url.substring(0,url.length()-1);

url=url.substring

(1);

returnurl;

privateintgetScore(Stringurl,intscore){

String[]subStr=url.split("

score=score-(subStr.length-1);

returnscore;

privateStringgetSummary(Stringcontext){

if(context==null){

context="

;

returnMD5.MD5Encode(context);

publicvoidextract(Stringfilename){

System.out.println("

Message:

Nowextracting"

+filename);

this.filename=filename.replace("

\\"

"

run();

if(this.page!

PageLib.store(this.page);

publicvoidrun(){

try{

parser=newParser(this.filename);

parser.setEncoding(encode);

HtmlPagevisitor=newHtmlPage(parser);

parser.visitAllNodesWith(visitor);

page=newPage();

//获取网页的URL

this.page.setUrl(getUrl(this.filename));

//获取网页的标题

this.page.setTitle(visitor.getTitle());

//验证网页<

body>

标签内是否为空,如果是空则不用进行内容提取

if(visitor.getBody()==null){

this.page.SetContext(null);

else{

//如果不为空,则提取内容

this.page.SetContext(combineNodeText(visitor.getBody().toNodeArray()));

//计算网页的得分

this.page.setScore(getScore(this.page.getUrl(),this.page.getScore()));

//计算网页的摘要

this.page.setSummary(getSummary(this.page.getContext()));

catch(ParserExceptionpe){

this.page=null;

pe.printStackTrace();

System.out.println("

Continue..."

9.6.4 PageLib类

importjava.io.BufferedWriter;

importjava.io.FileWriter;

importjava.io.File;

importjava.io.IOException;

publicclassPageLib{

publicstaticvoidstore(Pagepage){

Stringstorepath=ProperConfig.getValue("

files.path"

)+"

+page.getSummary();

if(newFile(storepath).exists()==true){

"

+storepath+"

isexisted!

return;

BufferedWriterwriter=newBufferedWriter(newFileWriter(storepath));

//第一行为URL

writer.append(page.getUrl());

writer.newLine();

//第二行为标题

writer.append(page.getTitle());

//第三行为得分

writer.append(String.valueOf(page.getScore()));

//第四行为网页内容

writer.append(page.getContext());

//关闭输出流

writer.close();

catch(IOExceptionioe){

Error:

Processing"

+page.getUrl()+"

accurserror"

ioe.printStackTrace();

9.6.5 ProperConfig类

packagecn.edu.bjfu.search.util;

importjava.util.ResourceBundle;

importjava.util.MissingResourceException;

/**
 * Reads settings from the "config" resource bundle (config.properties on the
 * class path).
 *
 * NOTE(review): the extracted listing had lost inter-token whitespace, closing
 * braces, the try line and the println call in the static initialiser; those
 * are reconstructed here.
 */
public class ProperConfig {

    private static final String CONFIG_FILE = "config";

    private static ResourceBundle bundle;

    static {
        try {
            bundle = ResourceBundle.getBundle(CONFIG_FILE);
        } catch (MissingResourceException mre) {
            // Leave bundle null; getValue reports the problem on first use.
            System.out.println("Cannot find config file " + CONFIG_FILE + ".properties.");
        }
    }

    /**
     * Looks up a configuration value.
     *
     * @param key the property name
     * @return the configured value
     * @throws MissingResourceException if the bundle could not be loaded or has
     *         no entry for {@code key}
     */
    public static String getValue(String key) {
        if (bundle == null) {
            // Previously this surfaced as a bare NullPointerException.
            throw new MissingResourceException(
                    "Config file " + CONFIG_FILE + ".properties was not loaded",
                    ProperConfig.class.getName(),
                    key);
        }
        return bundle.getString(key);
    }
}

9.6.6 MD5类

importjava.security.MessageDigest;

importjava.security.NoSuchAlgorithmException;

/**
 * MD5 helper that renders digests as lower-case hexadecimal strings.
 *
 * NOTE(review): the extracted listing had lost inter-token whitespace, closing
 * braces, the commas of the hexDigits initialiser and the loop condition in
 * byteArrayToHexString; those are restored here.
 */
public class MD5 {

    private final static String[] hexDigits = {
        "0", "1", "2", "3", "4", "5", "6", "7",
        "8", "9", "a", "b", "c", "d", "e", "f"
    };

    /**
     * Converts a byte array to its hexadecimal representation.
     *
     * @param b the bytes to convert
     * @return two lower-case hex digits per input byte, in order
     */
    public static String byteArrayToHexString(byte[] b) {
        StringBuffer resultSb = new StringBuffer();
        for (int i = 0; i < b.length; i++) {
            resultSb.append(byteToHexString(b[i]));
        }
        return resultSb.toString();
    }

    /** Converts one byte, read as an unsigned value 0-255, to two hex digits. */
    private static String byteToHexString(byte b) {
        int n = b;
        if (n < 0) {
            n = 256 + n;
        }
        int d1 = n / 16;
        int d2 = n % 16;
        return hexDigits[d1] + hexDigits[d2];
    }

    /**
     * Returns the MD5 digest of a string as 32 hex digits.
     *
     * The string is converted to bytes with the platform default charset, as in
     * the original. NOTE(review): digests of non-ASCII text therefore depend on
     * the machine; an explicit charset would change existing digests.
     *
     * @param origin the text to hash; must not be {@code null}
     * @return the hex digest; if the MD5 algorithm is unavailable, a copy of
     *         {@code origin} is returned unhashed, as in the original
     */
    public static String MD5Encode(String origin) {
        String resultString = null;
        try {
            resultString = new String(origin);
            MessageDigest md = MessageDigest.getInstance("MD5");
            resultString = byteArrayToHexString(md.digest(resultString.getBytes()));
        } catch (NoSuchAlgorithmException nsae) {
            // NOTE(review): spacing inside this message is reconstructed.
            System.err.println("No such Algorithm called \"MD5\"!");
        }
        return resultString;
    }
}

9.6.7 IndexBuilder类

packagecn.edu.bjfu.search.index;

importorg.apache.lucene.document.*;

importorg.apache.lucene.index.*;

importjeasy.analysis.*;

importjava.io.FileReader;

importjava.io.BufferedReader;

publicclassIndexBuilder{

//IndexWriter

IndexWriterwriter;

publicIndexBuilder(Stringpath)throwsIOException{

writer=newIndexWriter(path,newMMAnalyzer());

publicvoidbuild(Stringpath)throwsIOException{

BufferedReaderreader=null;

File[]files=newFile(path).listFiles();

files.length;

System.out.print("

."

reader=newBufferedReader(newFileReader(files[i]));

Documentdoc=newDocument();

Field[]fields=newField[5];

fields[0]=newField("

id"

String.valueOf(i),Field.Store.YES,Field.Index.NO);

fields[1]=newField("

url"

reader.readLine(),Field.Store.YES,Field.Index.NO);

fields[2]=newField("

title"

reader.readLine(),Field.Store.YES,Field.Index.TOKENIZED);

fields[3]=newField("

score"

fields[4]=newField("

context"

getBodyFile(files[i].getAbsolutePath(),reader),Field.Store.YES,Field.Index.TOKENIZED);

//创建Document

for(intj=0;

j<

fields.length;

j++){

doc.add(fields[j]);

//将Document添加至IndexWriter中

writer.addDocument(doc);

writer.optimize();

writer.close();

reader.close();

privateStringgetBodyFile(Stringpath,BufferedReaderreader)throwsIOException{

Stringline=reader.readLine();

while(line!

buffer.append(line);

line=reader.readLine();

9.6.8 index.html

<

DOCTYPEHTMLPUBLIC"

-//W3C//DTDHTML4.01Transitional//EN"

>

html>

<

head>

title>

BJFUSearchEngine<

/title>

metahttp-equiv="

keywords"

content="

keyword1,keyword2,keyword3"

description"

this

展开阅读全文
相关资源
猜你喜欢
相关搜索

当前位置:首页 > 高等教育 > 院校资料

copyright@ 2008-2022 冰豆网网站版权所有

经营许可证编号:鄂ICP备2022015515号-1