《搜索引擎与信息检索教程》示例程序.docx

上传人:b****5 文档编号:7361411 上传时间:2023-01-23 格式:DOCX 页数:23 大小:21.02KB
下载 相关 举报
《搜索引擎与信息检索教程》示例程序.docx_第1页
第1页 / 共23页
《搜索引擎与信息检索教程》示例程序.docx_第2页
第2页 / 共23页
《搜索引擎与信息检索教程》示例程序.docx_第3页
第3页 / 共23页
《搜索引擎与信息检索教程》示例程序.docx_第4页
第4页 / 共23页
《搜索引擎与信息检索教程》示例程序.docx_第5页
第5页 / 共23页
点击查看更多>>
下载资源
资源描述

《搜索引擎与信息检索教程》示例程序.docx

《《搜索引擎与信息检索教程》示例程序.docx》由会员分享,可在线阅读,更多相关《《搜索引擎与信息检索教程》示例程序.docx(23页珍藏版)》请在冰豆网上搜索。

《搜索引擎与信息检索教程》示例程序.docx

《搜索引擎与信息检索教程》示例程序

9.6示例代码

9.6.1FrontierSchedulerForBjfu类

packageorg.archive.crawler.postprocessor;

importorg.archive.crawler.datamodel.CandidateURI;

publicclassFrontierSchedulerForBjfuextendsFrontierScheduler{

publicFrontierSchedulerForBjfu(Stringname){

super(name);

}

protectedvoidschedule(CandidateURIcaUri){

Stringuri=caUri.toString();

if(uri.indexOf("dns:

")!

=-1){

getController().getFrontier().schedule(caUri);

}

elseif(uri.indexOf("bjfu")!

=-1

&&(uri.indexOf(".html")!

=-1

||uri.indexOf(".htm")!

=-1

||uri.indexOf(".jsp")!

=-1

||uri.indexOf(".asp")!

=-1

||uri.indexOf(".aspx")!

=-1)){

System.out.println(uri);

getController().getFrontier().schedule(caUri);

}

}

}

9.6.2Page类

packagecn.edu.bjfu.search.page;

/**
 * Plain data holder for one extracted web page: URL, title, summary,
 * text content and a score.
 */
public class Page {

    private String url;
    private String title;
    private String summary;
    private String context;
    private int score;

    /** Creates an empty page: all text fields are null and the score starts at 10. */
    public Page() {
        url = null;
        title = null;
        summary = null;
        context = null;
        score = 10;
    }

    /** @return the page URL, or null if not set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the page title, or null if not set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the page summary, or null if not set */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the page summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the page text content, or null if not set */
    public String getContext() {
        return this.context;
    }

    /**
     * Sets the page text content.
     *
     * @param context the page text content; may be null
     */
    public void setContext(String context) {
        this.context = context;
    }

    /**
     * Sets the page text content. Kept with its original capitalised name so
     * existing callers keep compiling; it simply delegates to
     * {@link #setContext(String)}.
     *
     * @param context the page text content; may be null
     */
    public void SetContext(String context) {
        setContext(context);
    }

    /** @return the page score */
    public int getScore() {
        return this.score;
    }

    /** @param score the page score */
    public void setScore(int score) {
        this.score = score;
    }
}

9.6.3Extractor类

packagecn.edu.bjfu.search.extractor;

importorg.htmlparser.*;

importorg.htmlparser.util.*;

importorg.htmlparser.visitors.*;

importorg.htmlparser.nodes.*;

importorg.htmlparser.tags.*;

importcn.edu.bjfu.search.page.*;

importcn.edu.bjfu.search.util.*;

publicclassExtractorimplementsRunnable{

privateStringfilename;

privateParserparser;

privatePagepage;

privateStringencode;

publicvoidsetEncode(Stringencode){

this.encode=encode;

}

privateStringcombineNodeText(Node[]nodes){

StringBufferbuffer=newStringBuffer();

for(inti=0;i

Nodeanode=(Node)nodes[i];

Stringline=null;

if(anodeinstanceofTextNode){

TextNodetextnode=(TextNode)anode;

line=textnode.getText();

}

elseif(anodeinstanceofLinkTag){

LinkTaglinknode=(LinkTag)anode;

line=linknode.getLinkText();

}

elseif(anodeinstanceofDiv){

if(anode.getChildren()!

=null){

line=combineNodeText(anode.getChildren().toNodeArray());

}

}

elseif(anodeinstanceofParagraphTag){

if(anode.getChildren()!

=null){

line=combineNodeText(anode.getChildren().toNodeArray());

}

}

elseif(anodeinstanceofSpan){

if(anode.getChildren()!

=null){

line=combineNodeText(anode.getChildren().toNodeArray());

}

}

elseif(anodeinstanceofTableTag){

if(anode.getChildren()!

=null){

line=combineNodeText(anode.getChildren().toNodeArray());

}

}

elseif(anodeinstanceofTableRow){

if(anode.getChildren()!

=null){

line=combineNodeText(anode.getChildren().toNodeArray());

}

}

elseif(anodeinstanceofTableColumn){

if(anode.getChildren()!

=null){

line=combineNodeText(anode.getChildren().toNodeArray());

}

}

if(line!

=null){

buffer.append(line);

}

}

returnbuffer.toString();

}

privateStringgetUrl(Stringfilename){

Stringurl=filename;

url=url.replace(ProperConfig.getValue("mirror.path"),"");

if(url.lastIndexOf("/")==url.length()-1){

url=url.substring(0,url.length()-1);

}

url=url.substring

(1);

returnurl;

}

privateintgetScore(Stringurl,intscore){

String[]subStr=url.split("/");

score=score-(subStr.length-1);

returnscore;

}

privateStringgetSummary(Stringcontext){

if(context==null){

context="";

}

returnMD5.MD5Encode(context);

}

publicvoidextract(Stringfilename){

System.out.println("Message:

Nowextracting"+filename);

this.filename=filename.replace("\\","/");

run();

if(this.page!

=null){

PageLib.store(this.page);

}

}

publicvoidrun(){

try{

parser=newParser(this.filename);

parser.setEncoding(encode);

HtmlPagevisitor=newHtmlPage(parser);

parser.visitAllNodesWith(visitor);

page=newPage();

//获取网页的URL

this.page.setUrl(getUrl(this.filename));

//获取网页的标题

this.page.setTitle(visitor.getTitle());

//验证网页标签内是否为空,如果是空则不用进行内容提取

if(visitor.getBody()==null){

this.page.SetContext(null);

}

else{

//如果不为空,则提取内容

this.page.SetContext(combineNodeText(visitor.getBody().toNodeArray()));

}

//计算网页的得分

this.page.setScore(getScore(this.page.getUrl(),this.page.getScore()));

//计算网页的摘要

this.page.setSummary(getSummary(this.page.getContext()));

}

catch(ParserExceptionpe){

this.page=null;

pe.printStackTrace();

System.out.println("Continue...");

}

}

}

9.6.4PageLib类

packagecn.edu.bjfu.search.page;

importjava.io.BufferedWriter;

importjava.io.FileWriter;

importjava.io.File;

importjava.io.IOException;

importcn.edu.bjfu.search.util.*;

publicclassPageLib{

publicstaticvoidstore(Pagepage){

Stringstorepath=ProperConfig.getValue("files.path")+"/"+page.getSummary();

if(newFile(storepath).exists()==true){

System.out.println("Message:

"+storepath+"isexisted!

");

return;

}

try{

BufferedWriterwriter=newBufferedWriter(newFileWriter(storepath));

//第一行为URL

writer.append(page.getUrl());

writer.newLine();

//第二行为标题

writer.append(page.getTitle());

writer.newLine();

//第三行为得分

writer.append(String.valueOf(page.getScore()));

writer.newLine();

//第四行为网页内容

writer.append(page.getContext());

//关闭输出流

writer.close();

}

catch(IOExceptionioe){

System.out.println("Error:

Processing"+page.getUrl()+"accurserror");

ioe.printStackTrace();

}

}

}

9.6.5 ProperConfig类

packagecn.edu.bjfu.search.util;

importjava.util.ResourceBundle;

importjava.util.MissingResourceException;

/**
 * Read-only access to the settings in the "config" resource bundle
 * (config.properties on the classpath).
 */
public class ProperConfig {

    private static final String CONFIG_FILE = "config";

    /** Loaded once at class initialisation; stays null when the bundle is missing. */
    private static ResourceBundle bundle;

    static {
        try {
            bundle = ResourceBundle.getBundle(CONFIG_FILE);
        } catch (MissingResourceException mre) {
            System.out.println("Cannot find config file "
                    + CONFIG_FILE + ".properties.");
        }
    }

    /**
     * Looks up a configuration value.
     *
     * @param key property name
     * @return the configured value
     * @throws MissingResourceException if the bundle could not be loaded or
     *         does not contain {@code key}
     */
    public static String getValue(String key) {
        if (bundle == null) {
            // Report the missing bundle with the exception type a missing key
            // produces, instead of an unexplained NullPointerException.
            throw new MissingResourceException(
                    "Cannot find config file " + CONFIG_FILE + ".properties.",
                    ProperConfig.class.getName(), key);
        }
        return bundle.getString(key);
    }
}

9.6.6MD5类

packagecn.edu.bjfu.search.util;

importjava.security.MessageDigest;

importjava.security.NoSuchAlgorithmException;

/**
 * Helpers for producing lowercase hexadecimal MD5 digests.
 */
public class MD5 {

    private final static String[] hexDigits = {"0",
        "1", "2", "3", "4", "5", "6", "7", "8",
        "9", "a", "b", "c", "d", "e", "f"};

    /**
     * Converts bytes to a lowercase hex string, two characters per byte.
     *
     * @param b bytes to convert
     * @return hex representation; empty for an empty array
     */
    public static String byteArrayToHexString(byte[] b) {
        StringBuilder resultSb = new StringBuilder(b.length * 2);
        for (int i = 0; i < b.length; i++) {
            resultSb.append(byteToHexString(b[i]));
        }
        return resultSb.toString();
    }

    /** Converts one byte, read as unsigned, to two hex digits. */
    private static String byteToHexString(byte b) {
        int n = b & 0xFF;
        return hexDigits[n >>> 4] + hexDigits[n & 0x0F];
    }

    /**
     * Returns the MD5 digest of {@code origin} as a lowercase hex string.
     *
     * <p>NOTE(review): the string is converted with the platform default
     * charset, so digests of non-ASCII text can differ between machines. It is
     * left unchanged here because existing digests may be relied upon.
     *
     * @param origin text to hash; must not be null
     * @return 32-character hex digest
     * @throws IllegalStateException if the runtime provides no MD5
     *         implementation
     */
    public static String MD5Encode(String origin) {
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            return byteArrayToHexString(md.digest(origin.getBytes()));
        } catch (NoSuchAlgorithmException nsae) {
            // Fail loudly: handing back the unhashed input as if it were a
            // digest would corrupt everything keyed on the result.
            throw new IllegalStateException("No such Algorithm called \"MD5\"!", nsae);
        }
    }
}

9.6.7IndexBuilder类

packagecn.edu.bjfu.search.index;

importorg.apache.lucene.document.*;

importorg.apache.lucene.index.*;

importjeasy.analysis.*;

importjava.io.IOException;

importjava.io.FileReader;

importjava.io.BufferedReader;

importjava.io.File;

publicclassIndexBuilder{

//IndexWriter

IndexWriterwriter;

publicIndexBuilder(Stringpath)throwsIOException{

writer=newIndexWriter(path,newMMAnalyzer());

}

publicvoidbuild(Stringpath)throwsIOException{

BufferedReaderreader=null;

File[]files=newFile(path).listFiles();

for(inti=0;i

System.out.print(".");

reader=newBufferedReader(newFileReader(files[i]));

Documentdoc=newDocument();

Field[]fields=newField[5];

fields[0]=newField("id",String.valueOf(i),Field.Store.YES,Field.Index.NO);

fields[1]=newField("url",reader.readLine(),Field.Store.YES,Field.Index.NO);

fields[2]=newField("title",reader.readLine(),Field.Store.YES,Field.Index.TOKENIZED);

fields[3]=newField("score",reader.readLine(),Field.Store.YES,Field.Index.NO);

fields[4]=newField("context",getBodyFile(files[i].getAbsolutePath(),reader),Field.Store.YES,Field.Index.TOKENIZED);

//创建Document

for(intj=0;j

doc.add(fields[j]);

}

//将Document添加至IndexWriter中

writer.addDocument(doc);

}

writer.optimize();

writer.close();

reader.close();

}

privateStringgetBodyFile(Stringpath,BufferedReaderreader)throwsIOException{

StringBufferbuffer=newStringBuffer();

Stringline=reader.readLine();

while(line!

=null){

buffer.append(line);

line=reader.readLine();

}

returnbuffer.toString();

}

}

9.6.8index.html

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

BJFUSearchEngine

展开阅读全文
相关资源
猜你喜欢
相关搜索

当前位置:首页 > 农林牧渔 > 林学

copyright@ 2008-2022 冰豆网网站版权所有

经营许可证编号:鄂ICP备2022015515号-1