heritrix.docx - 冰豆网

资源描述

heritrix.docx

《heritrix.docx》由会员分享，可在线阅读，更多相关《heritrix.docx（13页珍藏版）》请在冰豆网上搜索。

heritrix.docx

heritrix

Heritrix很可能由于包加多了而导致很多错误，要注意加的包的冲突问题

lib目录中有很多包互相冲突：它们其实是同一个包的不同版本，看包的名字就能分辨出来

org.archive.crawler.frontier;

AdaptiveRevisitFrontier

//@Override

publicvoidfinalTasks（）{

//bydefaultdonothing

}

由于有错误，将@Override注释掉，编译无错

在org.archive.crawler.prefetch;包下的

PreconditionEnforcer类中的

private boolean considerRobotsPreconditions(CrawlURI curi) 方法，将其方法体注释掉，并直接返回false

右键工程创建一个包用来放自己定制heritrix所需要写的类

先继承FrontierScheduler类写一个处理链接的类，代码如下

package my;

import java.util.logging.Logger;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.postprocessor.FrontierScheduler;

publicclassFrontierSchedulerFor163MobileextendsFrontierScheduler{

privatestaticLoggerLOGGER=Logger

.getLogger（FrontierSchedulerFor163Mobile.class.getName（））;

publicFrontierSchedulerFor163Mobile（Stringname）{

super（name）;

}

protectedvoidschedule（CandidateURIcaUri）{

Stringurl=caUri.toString（）;

try{

if（url.indexOf（""）!

=-1

||url.indexOf（"robots.txt"）!

=-1

||url.indexOf（"dns:

"）!

=-1）{

if（url.indexOf（""）!

=-1）

{

return;

}

if（url.endsWith（".zip"）

||url.endsWith（".exe"）

||url.endsWith（".pdf"）

||url.endsWith（".doc"）

||url.endsWith（".xls"）

||url.endsWith（".rar"）

||url.endsWith（".swf"）

||url.endsWith（".rmvb"）

||url.endsWith（".wmv"）

||url.endsWith（".asf"）

||url.endsWith（".ppt"）

||url.endsWith（".mpg"）

||url.endsWith（".mp3"）

||url.endsWith（".iso"）

||url.endsWith（".wma"）

||url.endsWith（".dat"）

||url.endsWith（".ape"）

||url.endsWith（".ask"）

||url.endsWith（".csf"）

||url.endsWith（".mkv"）

||url.endsWith（".vod"）

||url.endsWith（".rn"）

）

{

return;

}

if（url.indexOf（"#"）==-1）{

getController（）.getFrontier（）.schedule（caUri）;

}

}else{

return;

}

}catch（Exceptione）{

e.printStackTrace（）;

}finally{

}

这个类的作用是在过滤掉不需要的音频视频文件，压缩文件，可执行文件，office等文件，获取需要抓取的文件的URI。

一个编程启动heritrix的类：

package my;

import java.io.File;

import javax.management.InvalidAttributeValueException;

import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.settings.XMLSettingsHandler;

publicclassStartHeritrixByEclipse{

publicstaticvoidmain（String[]args）throwsInterruptedException{

StringorderFile="D:

/DocumentsandSettings/admin/workspace/heritrix_1/jobs/keyanchu-20100827131710296/order.xml";//order.xml文件路径

Filefile=null;//order.xml文件

CrawlStatusListenerlistener=null;//监听器

XMLSettingsHandlerhandler=null;//读取order.xml文件的处理器

CrawlControllercontroller=null;//Heritrix的控制器

try{

file=newFile（orderFile）;

handler=newXMLSettingsHandler（file）;

handler.initialize（）;//读取order.xml中的各个配置

controller=newCrawlController（）;//

controller.initialize（handler）;//从读取的order.xml中的各个配置来初始化控制器

if（listener!

=null）{

controller.addCrawlStatusListener（listener）;//控制器添加监听器

}

controller.requestCrawlStart（）;//开始抓取

*如果Heritrix还一直在运行则等待

while（true）{

if（controller.isRunning（）==false）{

break;

}

Thread.sleep（1000）;

}

//如果Heritrix不再运行则停止

controller.requestCrawlStop（）;

}catch（InvalidAttributeValueExceptione）{

//TODOAuto-generatedcatchblock

e.printStackTrace（）;

}catch（InitializationExceptione）{

//TODOAuto-generatedcatchblock

e.printStackTrace（）;

}catch（InterruptedExceptione）{

//TODOAuto-generatedcatchblock

e.printStackTrace（）;

}

启动heritrix的代码执行的流程书上有介绍，很详细，在《开发自己的搜索引擎Lucene+Heritrix》中304~308页

在frontier（链接制造工厂）包下继承队列分配策略QueueAssignmentPolicy，写一个决定URL分配到哪个抓取队列的类，重写了getClassKey方法，加入ELFHash算法，并对robots.txt的识别做了相关处理

【这边还要注意要把自己写的这个类加载到heritrix的属性文件】

这个图中倒数第二行是所有线程策略，在这边要把自己写好的策略的类名加进去

package org.archive.crawler.frontier;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.frontier.QueueAssignmentPolicy;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;

publicclassELFHashQueueAssignmentPolicyextendsQueueAssignmentPolicy{

privatestaticfinalLoggerlogger=Logger

.getLogger（ELFHashQueueAssignmentPolicy.class.getName（））;

privatestaticStringDEFAULT_CLASS_KEY="default...";

privatestaticfinalStringDNS="dns";

publicELFHashQueueAssignmentPolicy（）{

//TODOAuto-generatedconstructorstub

}

@Override

publicStringgetClassKey（CrawlControllercontroller,CandidateURIcauri）{

Stringuri=cauri.getUURI（）.toString（）;

Stringscheme=cauri.getUURI（）.getScheme（）;

Stringcandidate=null;

Stringname=null;

longhash=0;

try{

name=cauri.getUURI（）.getName（）;

}catch（URIExceptione1）{

//TODOAuto-generatedcatchblock

e1.printStackTrace（）;

}

try{

if（scheme.equals（DNS））{

if（cauri.getVia（）!

=null）{

//SpecialhandlingforDNS:

treatasbeing

//ofthesameclassasthetriggeringURI.

//WhenaURIincludesaport,thisensures

//theDNSlookupgoesatopthehost:

port

//queuethattriggeredit,ratherthan

//someotherhostqueue

UURIviaUuri=UURIFactory.getInstance（cauri.flattenVia（））;

candidate=viaUuri.getAuthorityMinusUserinfo（）;

//adoptschemeoftriggeringURI

//scheme=viaUuri.getScheme（）;

hash=ELFHash（viaUuri.toString（））;

candidate=candidate+Long.toString（hash%10）;

}else{

candidate=cauri.getUURI（）.getReferencedHost（）;

}

}else{

//Stringuri=cauri.getUURI（）.toString（）;

candidate=cauri.getUURI（）.getAuthorityMinusUserinfo（）;

if（name!

=null&&name.equals（"robots.txt"））{

hash=ELFHash（UURIFactory.getInstance（cauri.flattenVia（））.toString（））;

}

else{

hash=ELFHash（uri）;

}

candidate=candidate+Long.toString（hash%10）;

}

if（candidate==null||candidate.length（）==0）{

candidate=DEFAULT_CLASS_KEY;

}

}catch（URIExceptione）{

logger.log（Level.INFO,

"unabletoextractclasskey;usingdefault",e）;

candidate=DEFAULT_CLASS_KEY;

}

returncandidate.replace（':

','#'）;

}

publicStringgetClassKey（Stringuri）{

//Stringuri=cauri.getUURI（）.toString（）;

longhash=ELFHash（uri）;

Stringa=Long.toString（hash%100）;

returna;

}

publicstaticlongELFHash（Stringstr）{

longhash=0;

longx=0;

for（inti=0;i

hash=（hash<<4）+str.charAt（i）;

if（（x=hash&0xF0000000L）!

=0）{

hash^=（x>>24）;

hash&=~x;

}

return（hash&0x7FFFFFFF）;

}

Heritrix抓取的网址乱码问题【部分解决】

org.archive.crawler.writer.MirrorWriterProcessor.joinParts（）

StringBuffersb=newStringBuffer（length（））;

Stringss=null;

sb.append（mainPart.asStringBuffer（））;

if（null!

=uniquePart）{

sb.append（uniquePart）;

}

if（suffixAtEnd）{

if（null!

=query）{

sb.append（"@"）;

sb.append（query）;

}

if（null!

=suffix）{

sb.append（'.'）;

sb.append（suffix）;

}

}else{

if（null!

=suffix）{

sb.append（'.'）;

sb.append（suffix）;

}

if（null!

=query）{

sb.append（query）;

}

try{

ss=newString（sb.toString（）.getBytes（"ISO-8859-1"）,"UTF-8"）;

}catch（UnsupportedEncodingExceptione）{

//TODOAuto-generatedcatchblock

e.printStackTrace（）;

}

returnss;

修改org.archive.crawler.frontier.WorkQueueFrontier中的publicCrawlURInext（）方法，这里也对该方法做一些介绍，具体请看源码注释,改成如下，红色部分为改动部分:

1./**

2. * 从调度中心获取下一个要抓取的URL

3. *

4. */

5.public CrawlURI next（） throws InterruptedException, EndedException {

6. while （true） {//一直不停的循环,直到遇到异常或终止

7. // 郭芸修改，用于当队列里没有可抓取的URL的时候去获取种子继续

8. synchronized （this） {

9. if （this.controller.getFrontier（）.isEmpty（）） { //如果没有可抓取的URL

10. loadSeeds（）; //重新载入种子

11. this.controller.getToePool（）.notifyAll（）; //唤醒所有抓取线程

12. }

13.

14. }// 郭芸修改，用于当队列里没有可抓取的URL的时候去获取种子继续

15.

16.

17. long now = System.currentTimeMillis（）;//开始获取时间

18.

19. // 检查是否有暂停命令、结束命令以及宽带控制，这里会导致Heritrix结束

20. preNext（now）;

21.

22. /*

23. * 允许最多一个线程去填充准备队列（readyClassQueues）

24. */

25. if （readyFiller.tryAcquire（）） {// 表示没有线程去使用当前变量，当前类1次只允许1个线程同时使用

26. try {

27.

28. // 空闲队列数=目标队列数-准备队列数

29. int activationsNeeded = targetSizeForReadyQueues（）

30. - readyClassQueues.size（）;

31. // 如果空闲队列数大于0，并且不在活动状态的队列数不是空的，则表示需要将不在活动状态的队列转移到准备队列

32. while （activationsNeeded > 0 && !

inactiveQueues.isEmpty（）） {

33. activateInactiveQueue（）;//将不在活动状态队列的URL转移一定数目到活动状态队列

34. activationsNeeded--;

35. }

36. } finally {

37. readyFiller.release（）;// 必须释放，这样下次才可以继续使用

38. }

39. }

40.

41. WorkQueue readyQ = null;//准备工作队列

42. // 获取并移除此准备队列表示的队列的头部（即准备队列的第一个元素）如果该队列没有可用元素，则等待指定的时间，这里是1000毫秒也就是1秒

43. Object key = readyClassQueues.poll（DEFAULT_WAIT,TimeUnit.MILLISECONDS）;// 获得classKey，然后再通过classKey去获得队列

44.

45. if （key !

= null） {

46. readyQ = （WorkQueue） this.allQueues.get（key）;// 获得工作队列WorkQueue

47. }

48. if （readyQ !

= null） {

49.

展开阅读全文