搜索引擎原理与实践源程序.docx
《搜索引擎原理与实践源程序.docx》由会员分享,可在线阅读,更多相关《搜索引擎原理与实践源程序.docx(27页珍藏版)》请在冰豆网上搜索。
搜索引擎原理与实践源程序
9.6源程序
9.6.1FrontierSchedulerForBjfu类
packageorg.archive.crawler.postprocessor;
importorg.archive.crawler.datamodel.CandidateURI;
publicclassFrontierSchedulerForBjfuextendsFrontierScheduler{
publicFrontierSchedulerForBjfu(Stringname){
super(name);
}
protectedvoidschedule(CandidateURIcaUri){
Stringuri=caUri.toString();
if(uri.indexOf("dns:
")!
=-1){
getController().getFrontier().schedule(caUri);
}
elseif(uri.indexOf("bjfu")!
=-1
&&(uri.indexOf(".html")!
=-1
||uri.indexOf(".htm")!
=-1
||uri.indexOf(".jsp")!
=-1
||uri.indexOf(".asp")!
=-1
||uri.indexOf(".aspx")!
=-1)){
System.out.println(uri);
getController().getFrontier().schedule(caUri);
}
}
}
9.6.2Page类
packagecn.edu.bjfu.search.page;
/**
 * Plain data holder describing one extracted web page: its URL, title,
 * summary, text content and score.
 *
 * <p>A freshly constructed page has all text fields {@code null} and a score
 * of 10.
 */
public class Page {

    private String url;
    private String title;
    private String summary;
    private String context;
    private int score;

    /** Creates an empty page with {@code null} text fields and a score of 10. */
    public Page() {
        url = null;
        title = null;
        summary = null;
        context = null;
        score = 10;
    }

    /** @return the page URL, or {@code null} if not set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the page title, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the page summary, or {@code null} if not set */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the page summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the page text content, or {@code null} if not set */
    public String getContext() {
        return this.context;
    }

    /**
     * Sets the page text content.
     *
     * @param context the text content; may be {@code null}
     */
    public void setContext(String context) {
        this.context = context;
    }

    /**
     * Same as {@link #setContext(String)}. The capitalised name is kept so that
     * existing callers continue to compile; new code should use
     * {@link #setContext(String)}, which matches the other setters.
     *
     * @param context the text content; may be {@code null}
     */
    public void SetContext(String context) {
        setContext(context);
    }

    /** @return the page score */
    public int getScore() {
        return this.score;
    }

    /** @param score the page score */
    public void setScore(int score) {
        this.score = score;
    }
}
9.6.3Extractor类
packagecn.edu.bjfu.search.extractor;
importorg.htmlparser.*;
importorg.htmlparser.util.*;
importorg.htmlparser.visitors.*;
importorg.htmlparser.nodes.*;
importorg.htmlparser.tags.*;
importcn.edu.bjfu.search.page.*;
importcn.edu.bjfu.search.util.*;
publicclassExtractorimplementsRunnable{
privateStringfilename;
privateParserparser;
privatePagepage;
privateStringencode;
publicvoidsetEncode(Stringencode){
this.encode=encode;
}
privateStringcombineNodeText(Node[]nodes){
StringBufferbuffer=newStringBuffer();
for(inti=0;iNodeanode=(Node)nodes[i];
Stringline=null;
if(anodeinstanceofTextNode){
TextNodetextnode=(TextNode)anode;
line=textnode.getText();
}
elseif(anodeinstanceofLinkTag){
LinkTaglinknode=(LinkTag)anode;
line=linknode.getLinkText();
}
elseif(anodeinstanceofDiv){
if(anode.getChildren()!
=null){
line=combineNodeText(anode.getChildren().toNodeArray());
}
}
elseif(anodeinstanceofParagraphTag){
if(anode.getChildren()!
=null){
line=combineNodeText(anode.getChildren().toNodeArray());
}
}
elseif(anodeinstanceofSpan){
if(anode.getChildren()!
=null){
line=combineNodeText(anode.getChildren().toNodeArray());
}
}
elseif(anodeinstanceofTableTag){
if(anode.getChildren()!
=null){
line=combineNodeText(anode.getChildren().toNodeArray());
}
}
elseif(anodeinstanceofTableRow){
if(anode.getChildren()!
=null){
line=combineNodeText(anode.getChildren().toNodeArray());
}
}
elseif(anodeinstanceofTableColumn){
if(anode.getChildren()!
=null){
line=combineNodeText(anode.getChildren().toNodeArray());
}
}
if(line!
=null){
buffer.append(line);
}
}
returnbuffer.toString();
}
privateStringgetUrl(Stringfilename){
Stringurl=filename;
url=url.replace(ProperConfig.getValue("mirror.path"),"");
if(url.lastIndexOf("/")==url.length()-1){
url=url.substring(0,url.length()-1);
}
url=url.substring
(1);
returnurl;
}
privateintgetScore(Stringurl,intscore){
String[]subStr=url.split("/");
score=score-(subStr.length-1);
returnscore;
}
privateStringgetSummary(Stringcontext){
if(context==null){
context="";
}
returnMD5.MD5Encode(context);
}
publicvoidextract(Stringfilename){
System.out.println("Message:
Nowextracting"+filename);
this.filename=filename.replace("\\","/");
run();
if(this.page!
=null){
PageLib.store(this.page);
}
}
publicvoidrun(){
try{
parser=newParser(this.filename);
parser.setEncoding(encode);
HtmlPagevisitor=newHtmlPage(parser);
parser.visitAllNodesWith(visitor);
page=newPage();
//获取网页的URL
this.page.setUrl(getUrl(this.filename));
//获取网页的标题
this.page.setTitle(visitor.getTitle());
//验证网页
标签内是否为空,如果是空则不用进行内容提取
if(visitor.getBody()==null){
this.page.SetContext(null);
}
else{
//如果不为空,则提取内容
this.page.SetContext(combineNodeText(visitor.getBody().toNodeArray()));
}
//计算网页的得分
this.page.setScore(getScore(this.page.getUrl(),this.page.getScore()));
//计算网页的摘要
this.page.setSummary(getSummary(this.page.getContext()));
}
catch(ParserExceptionpe){
this.page=null;
pe.printStackTrace();
System.out.println("Continue...");
}
}
}
9.6.4PageLib类
packagecn.edu.bjfu.search.page;
importjava.io.BufferedWriter;
importjava.io.FileWriter;
importjava.io.File;
importjava.io.IOException;
importcn.edu.bjfu.search.util.*;
publicclassPageLib{
publicstaticvoidstore(Pagepage){
Stringstorepath=ProperConfig.getValue("files.path")+"/"+page.getSummary();
if(newFile(storepath).exists()==true){
System.out.println("Message:
"+storepath+"isexisted!
");
return;
}
try{
BufferedWriterwriter=newBufferedWriter(newFileWriter(storepath));
//第一行为URL
writer.append(page.getUrl());
writer.newLine();
//第二行为标题
writer.append(page.getTitle());
writer.newLine();
//第三行为得分
writer.append(String.valueOf(page.getScore()));
writer.newLine();
//第四行为网页内容
writer.append(page.getContext());
//关闭输出流
writer.close();
}
catch(IOExceptionioe){
System.out.println("Error:
Processing"+page.getUrl()+"accurserror");
ioe.printStackTrace();
}
}
}
9.6.5 ProperConfig类
packagecn.edu.bjfu.search.util;
importjava.util.ResourceBundle;
importjava.util.MissingResourceException;
/**
 * Read-only access to the application settings stored in the
 * {@code config} resource bundle ({@code config.properties} on the class path).
 */
public class ProperConfig {

    /** Base name of the resource bundle holding the settings. */
    private static final String CONFIG_FILE = "config";

    /** Loaded bundle, or {@code null} when the bundle could not be found. */
    private static ResourceBundle bundle;

    static {
        try {
            bundle = ResourceBundle.getBundle(CONFIG_FILE);
        } catch (MissingResourceException mre) {
            System.out.println("Cannot find config file "
                    + CONFIG_FILE + ".properties.");
        }
    }

    /**
     * Looks up a setting.
     *
     * @param key name of the setting
     * @return the configured value
     * @throws IllegalStateException    if the configuration bundle was not found
     *                                  at class-load time
     * @throws MissingResourceException if the bundle has no entry for {@code key}
     */
    public static String getValue(String key) {
        if (bundle == null) {
            // Fail with a clear message instead of a bare NullPointerException.
            throw new IllegalStateException("Configuration bundle '" + CONFIG_FILE
                    + "' was not loaded; cannot read key '" + key + "'");
        }
        return bundle.getString(key);
    }
}
9.6.6MD5类
packagecn.edu.bjfu.search.util;
importjava.security.MessageDigest;
importjava.security.NoSuchAlgorithmException;
/**
 * Helper for producing lower-case hexadecimal MD5 digests of strings.
 */
public class MD5 {

    /** Hex digit for each nibble value 0..15. */
    private final static String[] hexDigits = {"0",
            "1", "2", "3", "4", "5", "6", "7", "8",
            "9", "a", "b", "c", "d", "e", "f"};

    /**
     * Converts a byte array to its lower-case hexadecimal representation,
     * two characters per byte.
     *
     * @param b bytes to convert
     * @return hex string of length {@code 2 * b.length}
     */
    public static String byteArrayToHexString(byte[] b) {
        StringBuffer resultSb = new StringBuffer();
        for (int i = 0; i < b.length; i++) {
            resultSb.append(byteToHexString(b[i]));
        }
        return resultSb.toString();
    }

    /** Converts one byte, treated as unsigned, to two hex digits. */
    private static String byteToHexString(byte b) {
        int n = b;
        if (n < 0) {
            n = 256 + n;
        }
        int d1 = n / 16;
        int d2 = n % 16;
        return hexDigits[d1] + hexDigits[d2];
    }

    /**
     * Returns the MD5 digest of {@code origin} as a 32-character hex string.
     * The string is converted to bytes with the platform default charset, so
     * non-ASCII input can hash differently on differently configured machines.
     *
     * @param origin text to hash; must not be {@code null}
     * @return the hex digest; if the MD5 algorithm is unavailable an error is
     *         printed and {@code origin} itself is returned unchanged
     */
    public static String MD5Encode(String origin) {
        String resultString = origin;
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            resultString = byteArrayToHexString(md.digest(origin.getBytes()));
        } catch (NoSuchAlgorithmException nsae) {
            System.err.println("No such Algorithm called \"MD5\"!");
        }
        return resultString;
    }
}
9.6.7IndexBuilder类
packagecn.edu.bjfu.search.index;
importorg.apache.lucene.document.*;
importorg.apache.lucene.index.*;
importjeasy.analysis.*;
importjava.io.IOException;
importjava.io.FileReader;
importjava.io.BufferedReader;
importjava.io.File;
publicclassIndexBuilder{
//IndexWriter
IndexWriterwriter;
publicIndexBuilder(Stringpath)throwsIOException{
writer=newIndexWriter(path,newMMAnalyzer());
}
publicvoidbuild(Stringpath)throwsIOException{
BufferedReaderreader=null;
File[]files=newFile(path).listFiles();
for(inti=0;iSystem.out.print(".");
reader=newBufferedReader(newFileReader(files[i]));
Documentdoc=newDocument();
Field[]fields=newField[5];
fields[0]=newField("id",String.valueOf(i),Field.Store.YES,Field.Index.NO);
fields[1]=newField("url",reader.readLine(),Field.Store.YES,Field.Index.NO);
fields[2]=newField("title",reader.readLine(),Field.Store.YES,Field.Index.TOKENIZED);
fields[3]=newField("score",reader.readLine(),Field.Store.YES,Field.Index.NO);
fields[4]=newField("context",getBodyFile(files[i].getAbsolutePath(),reader),Field.Store.YES,Field.Index.TOKENIZED);
//创建Document
for(intj=0;jdoc.add(fields[j]);
}
//将Document添加至IndexWriter中
writer.addDocument(doc);
}
writer.optimize();
writer.close();
reader.close();
}
privateStringgetBodyFile(Stringpath,BufferedReaderreader)throwsIOException{
StringBufferbuffer=newStringBuffer();
Stringline=reader.readLine();
while(line!
=null){
buffer.append(line);
line=reader.readLine();
}
returnbuffer.toString();
}
}
9.6.8index.html
DOCTYPEHTMLPUBLIC"-//W3C//DTDHTML4.01Transitional//EN">
BJFUSearchEngine