1、搜索引擎原理与实践源程序9.6 源程序9.6.1 FrontierSchedulerForBjfu类package org.archive.crawler.postprocessor;import org.archive.crawler.datamodel.CandidateURI;public class FrontierSchedulerForBjfu extends FrontierScheduler public FrontierSchedulerForBjfu(String name) super(name); protected void schedule(CandidateURI
2、 caUri) String uri = caUri.toString(); if(uri.indexOf(dns:) != -1) getController().getFrontier().schedule(caUri); else if(uri.indexOf(bjfu) != -1 & (uri.indexOf(.html) != -1 | uri.indexOf(.htm) != -1 | uri.indexOf(.jsp) != -1 | uri.indexOf(.asp) != -1 | uri.indexOf(.aspx) != -1) System.out.println(u
3、ri); getController().getFrontier().schedule(caUri); 9.6.2 Page类package cn.edu.bjfu.search.page;public class Page private String url; private String title; private String summary; private String context; private int score; public Page() url = null; title = null; summary = null; context = null; score
4、= 10; public String getUrl() return this.url; public void setUrl(String url) this.url = url; public String getTitle() return this.title; public void setTitle(String title) this.title = title; public String getSummary() return this.summary; public void setSummary(String summary) this.summary = summar
5、y; public String getContext() return this.context; public void SetContext(String context) this.context = context; public int getScore() return this.score; public void setScore(int score) this.score = score; 9.6.3 Extractor类package cn.edu.bjfu.search.extractor;import org.htmlparser.*;import org.htmlp
6、arser.util.*;import org.htmlparser.visitors.*;import org.htmlparser.nodes.*;import org.htmlparser.tags.*;import cn.edu.bjfu.search.page.*;import cn.edu.bjfu.search.util.*;public class Extractor implements Runnable private String filename; private Parser parser; private Page page; private String enco
7、de; public void setEncode(String encode) this.encode = encode; private String combineNodeText(Node nodes) StringBuffer buffer = new StringBuffer(); for(int i = 0; i nodes.length; i+) Node anode = (Node)nodesi; String line = null; if(anode instanceof TextNode) TextNode textnode = (TextNode)anode; lin
8、e = textnode.getText(); else if (anode instanceof LinkTag) LinkTag linknode = (LinkTag) anode; line = linknode.getLinkText(); else if (anode instanceof Div) if(anode.getChildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); else if (anode instanceof ParagraphTag) if(anode.getC
9、hildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); else if (anode instanceof Span) if(anode.getChildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); else if (anode instanceof TableTag) if(anode.getChildren() != null) line = combineNodeText(anode.getC
10、hildren().toNodeArray(); else if (anode instanceof TableRow) if(anode.getChildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); else if (anode instanceof TableColumn) if(anode.getChildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); if(line != null) buf
11、fer.append(line); return buffer.toString(); private String getUrl(String filename) String url = filename; url = url.replace(ProperConfig.getValue(mirror.path), ); if(url.lastIndexOf(/) = url.length() - 1) url = url.substring(0, url.length() - 1); url = url.substring(1); return url; private int getSc
12、ore(String url, int score) String subStr = url.split(/); score = score - (subStr.length - 1); return score; private String getSummary(String context) if(context = null) context = ; return MD5.MD5Encode(context); public void extract(String filename) System.out.println(Message: Now extracting + filena
13、me); this.filename = filename.replace(, /); run(); if(this.page != null) PageLib.store(this.page); public void run() try parser = new Parser(this.filename); parser.setEncoding(encode); HtmlPage visitor = new HtmlPage(parser); parser.visitAllNodesWith(visitor); page = new Page(); / 获取网页的URL this.page
14、.setUrl(getUrl(this.filename); / 获取网页的标题 this.page.setTitle(visitor.getTitle(); / 验证网页标签内是否为空,如果是空则不用进行内容提取 if(visitor.getBody() = null) this.page.SetContext(null); else / 如果不为空,则提取内容 this.page.SetContext(combineNodeText(visitor.getBody().toNodeArray(); / 计算网页的得分 this.page.setScore(getScore(this.pag
15、e.getUrl(), this.page.getScore(); / 计算网页的摘要 this.page.setSummary(getSummary(this.page.getContext(); catch(ParserException pe) this.page = null; pe.printStackTrace(); System.out.println(Continue.); 9.6.4 PageLib类package cn.edu.bjfu.search.page;import java.io.BufferedWriter;import java.io.FileWriter;i
16、mport java.io.File;import java.io.IOException;import cn.edu.bjfu.search.util.*;public class PageLib public static void store(Page page) String storepath = ProperConfig.getValue(files.path) + / + page.getSummary(); if(new File(storepath).exists() = true) System.out.println(Message: + storepath + is e
17、xisted!); return; try BufferedWriter writer = new BufferedWriter(new FileWriter(storepath); / 第一行为URL writer.append(page.getUrl(); writer.newLine(); / 第二行为标题 writer.append(page.getTitle(); writer.newLine(); / 第三行为得分 writer.append(String.valueOf(page.getScore(); writer.newLine(); / 第四行为网页内容 writer.ap
18、pend(page.getContext(); / 关闭输出流 writer.close(); catch(IOException ioe) System.out.println(Error: Processing + page.getUrl() + accurs error); ioe.printStackTrace(); 9.6.5 PropertyConfig类package cn.edu.bjfu.search.util;import java.util.ResourceBundle;import java.util.MissingResourceException;public cl
19、ass ProperConfig private static String CONFIG_FILE = config; private static ResourceBundle bundle; static try bundle = ResourceBundle.getBundle(CONFIG_FILE); catch(MissingResourceException mre) System.out.println(Cannot find config file + CONFIG_FILE + .properties.); public static String getValue(St
20、ring key) return bundle.getString(key); 9.6.6 MD5类package cn.edu.bjfu.search.util;import java.security.MessageDigest;import java.security.NoSuchAlgorithmException;public class MD5 private final static String hexDigits = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f; public static String byteArrayTo
21、HexString(byte b) StringBuffer resultSb = new StringBuffer(); for (int i = 0; i b.length; i+) resultSb.append(byteToHexString(bi); return resultSb.toString(); private static String byteToHexString(byte b) int n = b; if (n 0) n = 256 + n; int d1 = n / 16; int d2 = n % 16; return hexDigitsd1 + hexDigi
22、tsd2; public static String MD5Encode(String origin) String resultString = null; try resultString=new String(origin); MessageDigest md = MessageDigest.getInstance(MD5); resultString=byteArrayToHexString(md.digest(resultString.getBytes(); catch (NoSuchAlgorithmException nsae) System.err.println(No suc
23、h Algorithm called MD5!); return resultString; 9.6.7 IndexBuilder类package cn.edu.bjfu.search.index;import org.apache.lucene.document.*;import org.apache.lucene.index.*;import jeasy.analysis.*;import java.io.IOException;import java.io.FileReader;import java.io.BufferedReader;import java.io.File;publi
24、c class IndexBuilder / IndexWriter IndexWriter writer; public IndexBuilder(String path) throws IOException writer = new IndexWriter(path, new MMAnalyzer(); public void build(String path) throws IOException BufferedReader reader = null; File files = new File(path).listFiles(); for(int i = 0; i files.
25、length; i+) System.out.print(.); reader = new BufferedReader(new FileReader(filesi); Document doc = new Document(); Field fields = new Field5; fields0 = new Field(id, String.valueOf(i), Field.Store.YES, Field.Index.NO); fields1 = new Field(url, reader.readLine(), Field.Store.YES, Field.Index.NO); fi
26、elds2 = new Field(title, reader.readLine(), Field.Store.YES, Field.Index.TOKENIZED); fields3 = new Field(score, reader.readLine(), Field.Store.YES, Field.Index.NO); fields4 = new Field(context, getBodyFile(filesi.getAbsolutePath(), reader), Field.Store.YES, Field.Index.TOKENIZED); / 创建Document for(i
27、nt j = 0; j fields.length; j+) doc.add(fieldsj); / 将Document添加至IndexWriter中 writer.addDocument(doc); writer.optimize(); writer.close(); reader.close(); private String getBodyFile(String path, BufferedReader reader) throws IOException StringBuffer buffer = new StringBuffer(); String line = reader.readLine(); while(line != null) buffer.append(line); line = reader.readLine(); return buffer.toString(); 9.6.8 index.html BJFU Search Engine meta http-equiv=description content=this is my
copyright@ 2008-2022 冰豆网网站版权所有
经营许可证编号:鄂ICP备2022015515号-1