《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx
《《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx》由会员分享,可在线阅读,更多相关《《搜索引擎与信息检索教程》示例程序Word文档下载推荐.docx(23页珍藏版)》请在冰豆网上搜索。
System.out.println(uri);
}
9.6.2 Page类
package cn.edu.bjfu.search.page;
/**
 * Plain data holder describing one crawled web page: its URL, title,
 * summary, extracted text ("context") and a ranking score.
 *
 * <p>A freshly constructed page has all text fields set to {@code null}
 * and a score of 10.
 */
public class Page {

    private String url;
    private String title;
    private String summary;
    private String context;
    private int score;

    /** Creates an empty page with {@code null} text fields and a score of 10. */
    public Page() {
        url = null;
        title = null;
        summary = null;
        context = null;
        score = 10;
    }

    /** @return the page URL, or {@code null} if not set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the page title, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the page summary, or {@code null} if not set */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the page summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the extracted page text, or {@code null} if not set */
    public String getContext() {
        return this.context;
    }

    /**
     * Sets the extracted page text.
     *
     * <p>The capitalised name does not follow Java naming conventions but is
     * kept because existing callers use it; new code should prefer
     * {@link #setContext(String)}.
     *
     * @param context the extracted page text, may be {@code null}
     */
    public void SetContext(String context) {
        this.context = context;
    }

    /**
     * Conventionally named equivalent of {@link #SetContext(String)}.
     *
     * @param context the extracted page text, may be {@code null}
     */
    public void setContext(String context) {
        this.context = context;
    }

    /** @return the ranking score */
    public int getScore() {
        return this.score;
    }

    /** @param score the ranking score */
    public void setScore(int score) {
        this.score = score;
    }
}
9.6.3 Extractor类
package cn.edu.bjfu.search.extractor;
import org.htmlparser.*;
import org.htmlparser.util.*;
import org.htmlparser.visitors.*;
import org.htmlparser.nodes.*;
import org.htmlparser.tags.*;
import cn.edu.bjfu.search.page.*;
import cn.edu.bjfu.search.util.*;
publicclassExtractorimplementsRunnable{
privateStringfilename;
privateParserparser;
privatePagepage;
privateStringencode;
publicvoidsetEncode(Stringencode){
this.encode=encode;
privateStringcombineNodeText(Node[]nodes){
StringBufferbuffer=newStringBuffer();
for(inti=0;
i<
nodes.length;
i++){
Nodeanode=(Node)nodes[i];
Stringline=null;
if(anodeinstanceofTextNode){
TextNodetextnode=(TextNode)anode;
line=textnode.getText();
}
elseif(anodeinstanceofLinkTag){
LinkTaglinknode=(LinkTag)anode;
line=linknode.getLinkText();
elseif(anodeinstanceofDiv){
if(anode.getChildren()!
=null){
line=combineNodeText(anode.getChildren().toNodeArray());
elseif(anodeinstanceofParagraphTag){
elseif(anodeinstanceofSpan){
elseif(anodeinstanceofTableTag){
elseif(anodeinstanceofTableRow){
elseif(anodeinstanceofTableColumn){
if(line!
buffer.append(line);
returnbuffer.toString();
privateStringgetUrl(Stringfilename){
Stringurl=filename;
url=url.replace(ProperConfig.getValue("
mirror.path"
),"
);
if(url.lastIndexOf("
/"
)==url.length()-1){
url=url.substring(0,url.length()-1);
url=url.substring
(1);
returnurl;
privateintgetScore(Stringurl,intscore){
String[]subStr=url.split("
score=score-(subStr.length-1);
returnscore;
privateStringgetSummary(Stringcontext){
if(context==null){
context="
;
returnMD5.MD5Encode(context);
publicvoidextract(Stringfilename){
System.out.println("
Message:
Nowextracting"
+filename);
this.filename=filename.replace("
\\"
"
run();
if(this.page!
PageLib.store(this.page);
publicvoidrun(){
try{
parser=newParser(this.filename);
parser.setEncoding(encode);
HtmlPagevisitor=newHtmlPage(parser);
parser.visitAllNodesWith(visitor);
page=newPage();
//获取网页的URL
this.page.setUrl(getUrl(this.filename));
//获取网页的标题
this.page.setTitle(visitor.getTitle());
//验证网页<
body>
标签内是否为空,如果是空则不用进行内容提取
if(visitor.getBody()==null){
this.page.SetContext(null);
else{
//如果不为空,则提取内容
this.page.SetContext(combineNodeText(visitor.getBody().toNodeArray()));
//计算网页的得分
this.page.setScore(getScore(this.page.getUrl(),this.page.getScore()));
//计算网页的摘要
this.page.setSummary(getSummary(this.page.getContext()));
catch(ParserExceptionpe){
this.page=null;
pe.printStackTrace();
System.out.println("
Continue..."
9.6.4 PageLib类
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.File;
import java.io.IOException;
publicclassPageLib{
publicstaticvoidstore(Pagepage){
Stringstorepath=ProperConfig.getValue("
files.path"
)+"
+page.getSummary();
if(newFile(storepath).exists()==true){
"
+storepath+"
isexisted!
return;
BufferedWriterwriter=newBufferedWriter(newFileWriter(storepath));
//第一行为URL
writer.append(page.getUrl());
writer.newLine();
//第二行为标题
writer.append(page.getTitle());
//第三行为得分
writer.append(String.valueOf(page.getScore()));
//第四行为网页内容
writer.append(page.getContext());
//关闭输出流
writer.close();
catch(IOExceptionioe){
Error:
Processing"
+page.getUrl()+"
accurserror"
ioe.printStackTrace();
9.6.5 ProperConfig类
package cn.edu.bjfu.search.util;
import java.util.ResourceBundle;
import java.util.MissingResourceException;
/**
 * Read-only access to the settings in {@code config.properties}, loaded
 * once from the classpath when the class is initialised.
 */
public class ProperConfig {

    private static final String CONFIG_FILE = "config";
    private static ResourceBundle bundle;

    static {
        try {
            bundle = ResourceBundle.getBundle(CONFIG_FILE);
        } catch (MissingResourceException mre) {
            // Keep class loading alive; getValue reports the problem on first use.
            System.out.println("Cannot find config file " + CONFIG_FILE + ".properties.");
        }
    }

    /**
     * Looks up a configuration value.
     *
     * @param key property name
     * @return the configured value
     * @throws IllegalStateException    if the configuration file could not be loaded
     * @throws MissingResourceException if the key is not defined
     */
    public static String getValue(String key) {
        if (bundle == null) {
            throw new IllegalStateException(
                    "Configuration not loaded: cannot find " + CONFIG_FILE
                            + ".properties (requested key: " + key + ")");
        }
        return bundle.getString(key);
    }
}
9.6.6 MD5类
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
/**
 * Helper producing lower-case hexadecimal MD5 digests of strings.
 *
 * <p>Used here as a content fingerprint, not for security purposes.
 */
public class MD5 {

    private final static String[] hexDigits = {
        "0", "1", "2", "3", "4", "5", "6", "7",
        "8", "9", "a", "b", "c", "d", "e", "f"
    };

    /**
     * Converts bytes to a lower-case hex string, two digits per byte.
     *
     * @param b bytes to convert
     * @return hex representation, empty for an empty array
     */
    public static String byteArrayToHexString(byte[] b) {
        StringBuilder resultSb = new StringBuilder(b.length * 2);
        for (int i = 0; i < b.length; i++) {
            resultSb.append(byteToHexString(b[i]));
        }
        return resultSb.toString();
    }

    /** Converts one byte, read as an unsigned value 0..255, to two hex digits. */
    private static String byteToHexString(byte b) {
        int n = b & 0xFF;
        int d1 = n / 16;
        int d2 = n % 16;
        return hexDigits[d1] + hexDigits[d2];
    }

    /**
     * Computes the MD5 digest of a string.
     *
     * <p>If the MD5 algorithm is unavailable, an error is printed and the
     * input string itself is returned.
     *
     * @param origin text to hash, must not be {@code null}
     * @return 32-character lower-case hex digest
     * @throws NullPointerException if {@code origin} is {@code null}
     */
    public static String MD5Encode(String origin) {
        if (origin == null) {
            throw new NullPointerException("origin");
        }
        String resultString = origin;
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            // NOTE(review): getBytes() uses the platform default charset, so digests of
            // non-ASCII text differ between machines. Left unchanged because digests
            // already used as file names would otherwise stop matching.
            resultString = byteArrayToHexString(md.digest(origin.getBytes()));
        } catch (NoSuchAlgorithmException nsae) {
            System.err.println("No such Algorithm called \"MD5\"!");
        }
        return resultString;
    }
}
9.6.7 IndexBuilder类
package cn.edu.bjfu.search.index;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import jeasy.analysis.*;
import java.io.FileReader;
import java.io.BufferedReader;
publicclassIndexBuilder{
//IndexWriter
IndexWriterwriter;
publicIndexBuilder(Stringpath)throwsIOException{
writer=newIndexWriter(path,newMMAnalyzer());
publicvoidbuild(Stringpath)throwsIOException{
BufferedReaderreader=null;
File[]files=newFile(path).listFiles();
files.length;
System.out.print("
."
reader=newBufferedReader(newFileReader(files[i]));
Documentdoc=newDocument();
Field[]fields=newField[5];
fields[0]=newField("
id"
String.valueOf(i),Field.Store.YES,Field.Index.NO);
fields[1]=newField("
url"
reader.readLine(),Field.Store.YES,Field.Index.NO);
fields[2]=newField("
title"
reader.readLine(),Field.Store.YES,Field.Index.TOKENIZED);
fields[3]=newField("
score"
fields[4]=newField("
context"
getBodyFile(files[i].getAbsolutePath(),reader),Field.Store.YES,Field.Index.TOKENIZED);
//创建Document
for(intj=0;
j<
fields.length;
j++){
doc.add(fields[j]);
//将Document添加至IndexWriter中
writer.addDocument(doc);
writer.optimize();
writer.close();
reader.close();
privateStringgetBodyFile(Stringpath,BufferedReaderreader)throwsIOException{
Stringline=reader.readLine();
while(line!
buffer.append(line);
line=reader.readLine();
9.6.8 index.html
<
DOCTYPEHTMLPUBLIC"
-//W3C//DTDHTML4.01Transitional//EN"
>
html>
<
head>
title>
BJFUSearchEngine<
/title>
metahttp-equiv="
keywords"
content="
keyword1,keyword2,keyword3"
description"
this