ImageVerifierCode 换一换
格式:DOCX , 页数:27 ,大小:20.93KB ,
资源ID:9643999      下载积分:3 金币
快捷下载
登录下载
邮箱/手机:
温馨提示:
快捷下载时,系统会自动以您填写的邮箱或者手机号作为用户名和密码生成账号,方便日后查询和重复下载。例如填写123,则账号是123,密码也是123。
特别说明:
请自助下载,系统不会自动发送文件;如果您已付费并需要再次下载,请登录后访问:我的下载记录
支付方式: 支付宝    微信支付   
验证码:   换一换

加入VIP,免费下载
 

温馨提示:由于个人手机设置不同,如果发现不能下载,请复制以下地址【https://www.bdocx.com/down/9643999.html】到电脑端继续下载(重复下载不扣费)。

已注册用户请登录:
账号:
密码:
验证码:   换一换
  忘记密码?
三方登录: 微信登录   QQ登录  

下载须知

1. 本站所有资源如无特殊说明,都需要本地电脑安装OFFICE2007和PDF阅读器。
2. 试题试卷类文档,如果标题没有明确说明有答案则都视为没有答案,请知晓。
3. 文件的所有权益归上传用户所有。
4. 未经权益所有人同意不得将文件中的内容挪作商业或盈利用途。
5. 本站仅提供交流平台,并不能对任何下载内容负责。
6. 下载文件中如有侵权或不适当内容,请与我们联系,我们立即纠正。
7. 本站不保证下载资源的准确性、安全性和完整性,同时也不承担用户因使用这些下载资源对自己和他人造成的任何形式的伤害或损失的责任。

版权提示 | 免责声明

本文(搜索引擎原理与实践源程序.docx)为本站会员(b****8)主动上传,冰豆网仅提供信息存储空间,仅对用户上传内容的表现方式做保护处理,对上载内容本身不做任何修改或编辑。 若此文所含内容侵犯了您的版权或隐私,请立即通知冰豆网(发送邮件至service@bdocx.com或直接QQ联系客服),我们立即给予删除!

搜索引擎原理与实践源程序.docx

1、搜索引擎原理与实践源程序9.6 源程序9.6.1 FrontierSchedulerForBjfu类package org.archive.crawler.postprocessor;import org.archive.crawler.datamodel.CandidateURI;public class FrontierSchedulerForBjfu extends FrontierScheduler public FrontierSchedulerForBjfu(String name) super(name); protected void schedule(CandidateURI

2、 caUri) String uri = caUri.toString(); if(uri.indexOf(dns:) != -1) getController().getFrontier().schedule(caUri); else if(uri.indexOf(bjfu) != -1 & (uri.indexOf(.html) != -1 | uri.indexOf(.htm) != -1 | uri.indexOf(.jsp) != -1 | uri.indexOf(.asp) != -1 | uri.indexOf(.aspx) != -1) System.out.println(u

3、ri); getController().getFrontier().schedule(caUri); 9.6.2 Page类package cn.edu.bjfu.search.page;public class Page private String url; private String title; private String summary; private String context; private int score; public Page() url = null; title = null; summary = null; context = null; score

4、= 10; public String getUrl() return this.url; public void setUrl(String url) this.url = url; public String getTitle() return this.title; public void setTitle(String title) this.title = title; public String getSummary() return this.summary; public void setSummary(String summary) this.summary = summar

5、y; public String getContext() return this.context; public void SetContext(String context) this.context = context; public int getScore() return this.score; public void setScore(int score) this.score = score; 9.6.3 Extractor类package cn.edu.bjfu.search.extractor;import org.htmlparser.*;import org.htmlp

6、arser.util.*;import org.htmlparser.visitors.*;import org.htmlparser.nodes.*;import org.htmlparser.tags.*;import cn.edu.bjfu.search.page.*;import cn.edu.bjfu.search.util.*;public class Extractor implements Runnable private String filename; private Parser parser; private Page page; private String enco

7、de; public void setEncode(String encode) this.encode = encode; private String combineNodeText(Node nodes) StringBuffer buffer = new StringBuffer(); for(int i = 0; i nodes.length; i+) Node anode = (Node)nodesi; String line = null; if(anode instanceof TextNode) TextNode textnode = (TextNode)anode; lin

8、e = textnode.getText(); else if (anode instanceof LinkTag) LinkTag linknode = (LinkTag) anode; line = linknode.getLinkText(); else if (anode instanceof Div) if(anode.getChildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); else if (anode instanceof ParagraphTag) if(anode.getC

9、hildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); else if (anode instanceof Span) if(anode.getChildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); else if (anode instanceof TableTag) if(anode.getChildren() != null) line = combineNodeText(anode.getC

10、hildren().toNodeArray(); else if (anode instanceof TableRow) if(anode.getChildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); else if (anode instanceof TableColumn) if(anode.getChildren() != null) line = combineNodeText(anode.getChildren().toNodeArray(); if(line != null) buf

11、fer.append(line); return buffer.toString(); private String getUrl(String filename) String url = filename; url = url.replace(ProperConfig.getValue(mirror.path), ); if(url.lastIndexOf(/) = url.length() - 1) url = url.substring(0, url.length() - 1); url = url.substring(1); return url; private int getSc

12、ore(String url, int score) String subStr = url.split(/); score = score - (subStr.length - 1); return score; private String getSummary(String context) if(context = null) context = ; return MD5.MD5Encode(context); public void extract(String filename) System.out.println(Message: Now extracting + filena

13、me); this.filename = filename.replace(, /); run(); if(this.page != null) PageLib.store(this.page); public void run() try parser = new Parser(this.filename); parser.setEncoding(encode); HtmlPage visitor = new HtmlPage(parser); parser.visitAllNodesWith(visitor); page = new Page(); / 获取网页的URL this.page

14、.setUrl(getUrl(this.filename); / 获取网页的标题 this.page.setTitle(visitor.getTitle(); / 验证网页标签内是否为空,如果是空则不用进行内容提取 if(visitor.getBody() = null) this.page.SetContext(null); else / 如果不为空,则提取内容 this.page.SetContext(combineNodeText(visitor.getBody().toNodeArray(); / 计算网页的得分 this.page.setScore(getScore(this.pag

15、e.getUrl(), this.page.getScore(); / 计算网页的摘要 this.page.setSummary(getSummary(this.page.getContext(); catch(ParserException pe) this.page = null; pe.printStackTrace(); System.out.println(Continue.); 9.6.4 PageLib类package cn.edu.bjfu.search.page;import java.io.BufferedWriter;import java.io.FileWriter;i

16、mport java.io.File;import java.io.IOException;import cn.edu.bjfu.search.util.*;public class PageLib public static void store(Page page) String storepath = ProperConfig.getValue(files.path) + / + page.getSummary(); if(new File(storepath).exists() = true) System.out.println(Message: + storepath + is e

17、xisted!); return; try BufferedWriter writer = new BufferedWriter(new FileWriter(storepath); / 第一行为URL writer.append(page.getUrl(); writer.newLine(); / 第二行为标题 writer.append(page.getTitle(); writer.newLine(); / 第三行为得分 writer.append(String.valueOf(page.getScore(); writer.newLine(); / 第四行为网页内容 writer.ap

18、pend(page.getContext(); / 关闭输出流 writer.close(); catch(IOException ioe) System.out.println(Error: Processing + page.getUrl() + accurs error); ioe.printStackTrace(); 9.6.5 PropertyConfig类package cn.edu.bjfu.search.util;import java.util.ResourceBundle;import java.util.MissingResourceException;public cl

19、ass ProperConfig private static String CONFIG_FILE = config; private static ResourceBundle bundle; static try bundle = ResourceBundle.getBundle(CONFIG_FILE); catch(MissingResourceException mre) System.out.println(Cannot find config file + CONFIG_FILE + .properties.); public static String getValue(St

20、ring key) return bundle.getString(key); 9.6.6 MD5类package cn.edu.bjfu.search.util;import java.security.MessageDigest;import java.security.NoSuchAlgorithmException;public class MD5 private final static String hexDigits = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f; public static String byteArrayTo

21、HexString(byte b) StringBuffer resultSb = new StringBuffer(); for (int i = 0; i b.length; i+) resultSb.append(byteToHexString(bi); return resultSb.toString(); private static String byteToHexString(byte b) int n = b; if (n 0) n = 256 + n; int d1 = n / 16; int d2 = n % 16; return hexDigitsd1 + hexDigi

22、tsd2; public static String MD5Encode(String origin) String resultString = null; try resultString=new String(origin); MessageDigest md = MessageDigest.getInstance(MD5); resultString=byteArrayToHexString(md.digest(resultString.getBytes(); catch (NoSuchAlgorithmException nsae) System.err.println(No suc

23、h Algorithm called MD5!); return resultString; 9.6.7 IndexBuilder类package cn.edu.bjfu.search.index;import org.apache.lucene.document.*;import org.apache.lucene.index.*;import jeasy.analysis.*;import java.io.IOException;import java.io.FileReader;import java.io.BufferedReader;import java.io.File;publi

24、c class IndexBuilder / IndexWriter IndexWriter writer; public IndexBuilder(String path) throws IOException writer = new IndexWriter(path, new MMAnalyzer(); public void build(String path) throws IOException BufferedReader reader = null; File files = new File(path).listFiles(); for(int i = 0; i files.

25、length; i+) System.out.print(.); reader = new BufferedReader(new FileReader(filesi); Document doc = new Document(); Field fields = new Field5; fields0 = new Field(id, String.valueOf(i), Field.Store.YES, Field.Index.NO); fields1 = new Field(url, reader.readLine(), Field.Store.YES, Field.Index.NO); fi

26、elds2 = new Field(title, reader.readLine(), Field.Store.YES, Field.Index.TOKENIZED); fields3 = new Field(score, reader.readLine(), Field.Store.YES, Field.Index.NO); fields4 = new Field(context, getBodyFile(filesi.getAbsolutePath(), reader), Field.Store.YES, Field.Index.TOKENIZED); / 创建Document for(i

27、nt j = 0; j fields.length; j+) doc.add(fieldsj); / 将Document添加至IndexWriter中 writer.addDocument(doc); writer.optimize(); writer.close(); reader.close(); private String getBodyFile(String path, BufferedReader reader) throws IOException StringBuffer buffer = new StringBuffer(); String line = reader.readLine(); while(line != null) buffer.append(line); line = reader.readLine(); return buffer.toString(); 9.6.8 index.html BJFU Search Engine meta http-equiv=description content=this is my

copyright@ 2008-2022 冰豆网网站版权所有

经营许可证编号:鄂ICP备2022015515号-1