抓取网站信息源码.docx
《抓取网站信息源码.docx》由会员分享,可在线阅读,更多相关《抓取网站信息源码.docx(15页珍藏版)》请在冰豆网上搜索。
抓取网站信息源码
用 HttpURLConnection 抓取登录网站信息的源码,供参考。
一共 2 个类,另附一个测试类。
package dev.smart.craw.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.ProtocolException;
import java.net.URL;
import java.util.List;
import java.util.Map;
/**
*
*@authorroyoky
*
*/
publicclassCrawWeb{
/**
*获得一个HttpURLConnection对象
*
*@paramcookie
*@paramreferer
*上一个引用页面
*@paramurl
*请求的URL地址
*@parammethod
*方法类型(POSTGET)
*@return
*@throwsIOException
*@throwsProtocolException
*/
publicHttpURLConnectiongetHttpURLConnection(HeaderEntiyentity)
throwsIOException,ProtocolException{
URLcookieUrl=newURL(entity.getUrl());
HttpURLConnectioncookiecon=(HttpURLConnection)cookieUrl
.openConnection();
cookiecon.setInstanceFollowRedirects(false);
if(entity.getHost()!
=null&&!
"".equals(entity.getHost()))
cookiecon.setRequestProperty("Host",entity.getHost());
if(entity.getUser_Agent()!
=null
&&!
"".equals(entity.getUser_Agent()))
cookiecon.setRequestProperty("User-Agent",entity.getUser_Agent());
if(entity.getAccept()!
=null&&!
"".equals(entity.getAccept()))
cookiecon.setRequestProperty("Accept",entity.getAccept());
if(entity.getAccept_Language()!
=null
&&!
"".equals(entity.getAccept_Language()))
cookiecon.setRequestProperty("Accept-Language",entity
.getAccept_Language());
if(entity.getAccept_Encoding()!
=null
&&!
"".equals(entity.getAccept_Encoding()))
cookiecon.setRequestProperty("Accept-Encoding",entity
.getAccept_Encoding());
if(entity.getContent_Type()!
=null
&&!
"".equals(entity.getContent_Type()))
cookiecon.setRequestProperty("Content-Type",entity
.getContent_Type());
if(entity.getContent_Length()!
=null
&&!
"".equals(entity.getContent_Length()))
cookiecon.setRequestProperty("Content-Length",entity
.getContent_Length());
//map.put("Accept-Charset","x-gbk,utf-8;q=0.7,*;q=0.7");
if(entity.getAccept_Charset()!
=null
&&!
"".equals(entity.getAccept_Charset()))
cookiecon.setRequestProperty("Accept-Charset",entity
.getAccept_Charset());
if(entity.getConnection()!
=null
&&!
"".equals(entity.getConnection()))
cookiecon.setRequestProperty("Connection",entity.getConnection());
if(entity.getCache_Control()!
=null
&&!
"".equals(entity.getCache_Control()))
cookiecon.setRequestProperty("Cache-Control",entity
.getCache_Control());
if(entity.getReferer()!
=null&&!
"".equals(entity.getReferer())){
cookiecon.setRequestProperty("Referer",entity.getReferer());
}
if(entity.getCookie()!
=null&&!
"".equals(entity.getCookie())){
cookiecon.setRequestProperty("Cookie",entity.getCookie());
}
cookiecon.setRequestMethod(entity.getFormWay());
cookiecon.setConnectTimeout(90000);
cookiecon.setReadTimeout(120000);
cookiecon.setDoOutput(true);
cookiecon.setDoInput(true);
cookiecon.setUseCaches(false);
returncookiecon;
}
/**
*POST做参数传递
*
*@paramcon
*@paramparam
*@throwsIOException
*/
publicvoidsendParameter(HttpURLConnectioncon,HeaderEntiyentity)
throwsIOException{
con.connect();
OutputStreamout=con.getOutputStream();
BufferedWriterbw=newBufferedWriter(newOutputStreamWriter(out));
if(entity.getParamPost()!
=null)
bw.write(entity.getParamPost());
bw.flush();
bw.close();
out.close();
}
/**
*获得请求返回的文本信息
*
*@paramcon
*@paramencode
*@return
*@throwsException
*/
publicStringgetText(HttpURLConnectioncon,Stringencode)
throwsException{
//Stringhost="";
//String
//url="
//String
//parm="startDate="+startDate+"&endDate="+endDate+"&payType=all&orderStatus=&hostType=D&tripType=all&isGroup=all&userid=DZC001&orgID=TUAIR&officeCode=SZX348&iataNo=08017074&grp=SZXYYB&CanViewOrhersRight=true&downtype=b2b_pay&fileType=csv";
returnthis.doString(con,con.getInputStream(),encode);
}
publicStringgetCookie(HttpURLConnectioncon){
Listcookie=null;
Stringsession="";
Map>map=con.getHeaderFields();
if(map!
=null){
cookie=map.get("Set-Cookie");
}
if(cookie!
=null){
for(Strings:
cookie){
session+=s.replace("path=/","");
}
}
returnsession;
}
/**
*得到请求返回的状态200表示请求成功
*
*@paramcon
*@return
*/
publicintgetResponseState(HttpURLConnectioncon){
try{
if(con!
=null)
returncon.getResponseCode();
}catch(IOExceptione){
//TODOAuto-generatedcatchblock
e.printStackTrace();
}
return0;
}
privateStringdoString(HttpURLConnectioncon,InputStreamin,Stringencode)
throwsException{
Stringtemp=null;
String__viewstate="";
if(encode==null||"".equals(encode)){
encode="UTF-8";
}
BufferedReaderbr=newBufferedReader(
newInputStreamReader(in,encode));
while((temp=br.readLine())!
=null){
//System.out.println(temp);
__viewstate+=temp+"\r\n";
}
con.disconnect();
br.close();
in.close();
return__viewstate;
}
}
packagedev.smart.craw.util;
/**
 * Mutable holder for the settings of one HTTP request: target URL, request
 * method, request body and the individual request header values.
 *
 * <p>All values are plain strings and default to {@code null}, except the
 * request method, which defaults to {@code "GET"}.
 */
public class HeaderEntiy {

    /** Value for the {@code Host} header, e.g. {@code www.cococ.cc}. */
    private String host;

    /** Value for the {@code User-Agent} header. */
    private String userAgent;

    /** Value for the {@code Accept} header. */
    private String accept;

    /** Value for the {@code Accept-Language} header, e.g. {@code zh-cn}. */
    private String acceptLanguage;

    /** Value for the {@code Accept-Encoding} header. */
    private String acceptEncoding;

    /** Value for the {@code Content-Type} header, e.g. {@code application/x-www-form-urlencoded}. */
    private String contentType;

    /** Value for the {@code Content-Length} header. */
    private String contentLength;

    /** Value for the {@code Accept-Charset} header, e.g. {@code x-gbk,utf-8;q=0.7,*;q=0.7}. */
    private String acceptCharset;

    /** Value for the {@code Connection} header, e.g. {@code Keep-Alive}. */
    private String connection;

    /** Value for the {@code Cache-Control} header, e.g. {@code no-cache}. */
    private String cacheControl;

    /** Value for the {@code Referer} header. */
    private String referer;

    /** Value for the {@code Cookie} header. */
    private String cookie;

    /** Address the request is sent to. */
    private String url;

    /** Request method; {@code "GET"} unless changed. */
    private String formWay = "GET";

    /** Request body to send. */
    private String paramPost;

    /** @return the {@code Accept} header value */
    public String getAccept() {
        return accept;
    }

    /** @param accept the {@code Accept} header value */
    public void setAccept(String accept) {
        this.accept = accept;
    }

    /** @return the {@code Accept-Charset} header value */
    public String getAccept_Charset() {
        return acceptCharset;
    }

    /** @param accept_Charset the {@code Accept-Charset} header value */
    public void setAccept_Charset(String accept_Charset) {
        this.acceptCharset = accept_Charset;
    }

    /** @return the {@code Accept-Encoding} header value */
    public String getAccept_Encoding() {
        return acceptEncoding;
    }

    /** @param accept_Encoding the {@code Accept-Encoding} header value */
    public void setAccept_Encoding(String accept_Encoding) {
        this.acceptEncoding = accept_Encoding;
    }

    /** @return the {@code Accept-Language} header value */
    public String getAccept_Language() {
        return acceptLanguage;
    }

    /** @param accept_Language the {@code Accept-Language} header value */
    public void setAccept_Language(String accept_Language) {
        this.acceptLanguage = accept_Language;
    }

    /** @return the {@code Cache-Control} header value */
    public String getCache_Control() {
        return cacheControl;
    }

    /** @param cache_Control the {@code Cache-Control} header value */
    public void setCache_Control(String cache_Control) {
        this.cacheControl = cache_Control;
    }

    /** @return the {@code Connection} header value */
    public String getConnection() {
        return connection;
    }

    /** @param connection the {@code Connection} header value */
    public void setConnection(String connection) {
        this.connection = connection;
    }

    /** @return the {@code Content-Length} header value */
    public String getContent_Length() {
        return contentLength;
    }

    /** @param content_Length the {@code Content-Length} header value */
    public void setContent_Length(String content_Length) {
        this.contentLength = content_Length;
    }

    /** @return the {@code Content-Type} header value */
    public String getContent_Type() {
        return contentType;
    }

    /** @param content_Type the {@code Content-Type} header value */
    public void setContent_Type(String content_Type) {
        this.contentType = content_Type;
    }

    /** @return the {@code Cookie} header value */
    public String getCookie() {
        return cookie;
    }

    /** @param cookie the {@code Cookie} header value */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }

    /** @return the {@code Host} header value */
    public String getHost() {
        return host;
    }

    /** @param host the {@code Host} header value */
    public void setHost(String host) {
        this.host = host;
    }

    /** @return the {@code Referer} header value */
    public String getReferer() {
        return referer;
    }

    /** @param referer the {@code Referer} header value */
    public void setReferer(String referer) {
        this.referer = referer;
    }

    /** @return the {@code User-Agent} header value */
    public String getUser_Agent() {
        return userAgent;
    }

    /** @param user_Agent the {@code User-Agent} header value */
    public void setUser_Agent(String user_Agent) {
        this.userAgent = user_Agent;
    }

    /** @return the request body */
    public String getParamPost() {
        return paramPost;
    }

    /** @param paramPost the request body */
    public void setParamPost(String paramPost) {
        this.paramPost = paramPost;
    }

    /** @return the request URL */
    public String getUrl() {
        return url;
    }

    /** @param url the request URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the request method, {@code "GET"} unless changed */
    public String getFormWay() {
        return formWay;
    }

    /** @param formWay the request method */
    public void setFormWay(String formWay) {
        this.formWay = formWay;
    }
}
packagedev.smart.craw.util;
import.HttpURLConnection;
publicclassTestextendsCrawWeb{
publicStringgetInfo()throwsException{
HeaderEntiyvo=newHeaderEntiy();
vo.setAccept("image/gif,image/jpeg,image/pjpeg,image/pjpeg,application/x-shockwave-flash,application/msword,application/vnd.ms-excel,application/vnd.ms-powerpoint,*/*");
vo.setAccept_Charset("x-gbk,utf-8;q=0.7,*;q=0.7");
vo.setAccept_Encoding("GBK");
vo.setAccept_Language("zh-cn");
vo.setCache_Control("no-cache");
vo.setConnection("Keep-Alive");
vo.setContent_Length("");
vo.setContent_Type("application/x-www-form-urlencoded");
vo.setCookie("");
vo.setHost("www.cococ.cc");
vo.setReferer("http:
//www.cococ.cc/member/?
act=login&jumpUrl=http:
//www.cococ.cc/");
vo.setUser_Agent("Mozilla/4.0(compatible;MSIE8.0;WindowsNT5.1;Trident/4.0;.NETCLR2.0.50727)");
vo.setUrl("http:
//www.cococ.cc/member/?
act=login");
vo.setParamPost("username=royoks&password=123456&+%B5%C7+%C2%BD+=++++&handler=login&jumpurl=http%3A%2F%2Fwww.cococ.cc%2FPOST/member/?
act=login");
vo.setFormWay("POST");
HttpURLConnectioncon=super.getHttpURLConnection(vo