c从网页提取数据.docx

资源描述

c从网页提取数据.docx

《c从网页提取数据.docx》由会员分享，可在线阅读，更多相关《c从网页提取数据.docx（12页珍藏版）》请在冰豆网上搜索。

c从网页提取数据.docx

C# 从网页提取数据

下面这个函数的作用是将 DataTable 导出到 Excel 文件：

/// <summary>
/// Exports a <see cref="System.Data.DataTable"/> to an Excel workbook through Excel COM automation.
/// The column names are written to row 1 and each data row follows below them.
/// </summary>
/// <param name="tmpDataTable">Table to export. When it is null the method returns without doing anything.</param>
/// <param name="strFileName">Target path without extension; ".xls" is appended before saving.</param>
private void DataTabletoExcel(System.Data.DataTable tmpDataTable, string strFileName)
{
    if (tmpDataTable == null)
    {
        return;
    }

    int rowNum = tmpDataTable.Rows.Count;
    int columnNum = tmpDataTable.Columns.Count;

    Excel.Application xlApp = new Excel.ApplicationClass();
    Excel.Workbook xlBook = null;
    try
    {
        xlApp.DefaultFilePath = "";
        xlApp.DisplayAlerts = true;
        xlApp.SheetsInNewWorkbook = 1;
        xlBook = xlApp.Workbooks.Add(true);

        // Header row: copy the DataTable column names into row 1 (cell indices start at 1).
        int columnIndex = 0;
        foreach (System.Data.DataColumn dc in tmpDataTable.Columns)
        {
            columnIndex++;
            xlApp.Cells[1, columnIndex] = dc.ColumnName;
        }

        // Data rows: table row i goes to sheet row i + 2, directly below the header.
        for (int i = 0; i < rowNum; i++)
        {
            for (int j = 0; j < columnNum; j++)
            {
                xlApp.Cells[i + 2, j + 1] = tmpDataTable.Rows[i][j].ToString();
            }
        }

        // Save once, after every cell has been written.
        xlBook.SaveCopyAs(strFileName + ".xls");
    }
    finally
    {
        // Close the workbook and quit Excel even when the export fails; otherwise
        // the automation server keeps running in the background after this method returns.
        if (xlBook != null)
        {
            xlBook.Close(false, System.Type.Missing, System.Type.Missing);
            System.Runtime.InteropServices.Marshal.ReleaseComObject(xlBook);
        }

        xlApp.Quit();
        System.Runtime.InteropServices.Marshal.ReleaseComObject(xlApp);
    }
}

C# 代码：

using System;
using System.Collections;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;

namespace 控制台测试
{
    /// <summary>
    /// Console sample: downloads a JSON-like document over HTTP, extracts every
    /// bonus record with a regular expression and prints one line per record.
    /// </summary>
    class Program
    {
        // Size of the scratch buffer used while draining the response stream (10 KB).
        private const int BufferSize = 1024 * 10;

        // Matches one record and captures three named groups:
        //   num - BonusNumberString up to the first '|'
        //   kai - BonusTime
        //   qi  - IssueNumber
        // NOTE(review): the group names were lost in the published listing; they are
        // restored here from the m.Groups[...] lookups in Main - confirm the order.
        private static readonly Regex RecordPattern = new Regex(
            @"{""BonusNumberString"":""(?<num>[^|]*)\|\d\|\d"",""BonusTime"":""(?<kai>[^""]*)"",""IssueNumber"":""(?<qi>[^""]*)""}");

        /// <summary>
        /// Entry point. Fetches the document named by the first command-line argument
        /// and prints issue number, bonus number (commas removed) and bonus time.
        /// </summary>
        /// <param name="args">args[0] is the URL of the data source.</param>
        static void Main(string[] args)
        {
            // NOTE(review): the URL literal of the original listing was stripped when the
            // document was published, so the endpoint is taken from the command line.
            if (args.Length == 0)
            {
                Console.WriteLine("请将要读取的URL作为第一个命令行参数传入。");
                Console.ReadKey();
                return;
            }

            byte[] buffer = getBytes(args[0], null, null);
            string html = Encoding.UTF8.GetString(buffer);

            foreach (Match m in RecordPattern.Matches(html))
            {
                // NOTE(review): the separators are assumed to be single spaces; the
                // published listing had all spaces stripped from it - confirm.
                Console.WriteLine(
                    m.Groups["qi"].Value + " " +
                    m.Groups["num"].Value.Replace(",", "") + " " +
                    m.Groups["kai"].Value);
            }

            Console.WriteLine("程序运行结束，按任意键关闭窗口！");
            Console.ReadKey();
        }

        /// <summary>
        /// Reads a network resource and returns its body as a byte array.
        /// </summary>
        /// <param name="url">Absolute HTTP(S) address of the resource.</param>
        /// <param name="cookie">Cookie container to attach to the request, or null for none.</param>
        /// <param name="postData">Form-encoded body to POST, or null to issue a GET.</param>
        /// <returns>The response body; gzip/deflate content encodings are already decoded.</returns>
        private static byte[] getBytes(string url, CookieContainer cookie, byte[] postData)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.AllowAutoRedirect = true;
            if (cookie != null)
            {
                request.CookieContainer = cookie;
            }

            // Referer is the "scheme://host[:port]" prefix of the URL. Letting Uri compute it
            // avoids the fixed search offset, which threw for very short URLs and
            // picked the wrong slash for short host names.
            request.Referer = request.RequestUri.GetLeftPart(UriPartial.Authority);
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";

            // Let the framework negotiate and decode compression. The hand-written code
            // advertised "deflate" but could only decode "gzip".
            request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

            if (postData != null)
            {
                // A body was supplied, so send it as a form POST.
                request.Method = "POST";
                request.ContentType = "application/x-www-form-urlencoded";
                request.ContentLength = postData.Length;
                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(postData, 0, postData.Length);
                }
            }

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (MemoryStream collected = new MemoryStream())
            {
                // Read until end of stream instead of trusting Content-Length: a connection
                // that closes early makes Read return 0, which would otherwise loop forever.
                byte[] chunk = new byte[BufferSize];
                int read;
                while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
                {
                    collected.Write(chunk, 0, read);
                }

                return collected.ToArray();
            }
        }
    }
}

运行结果：

20100625-083907232010-06-2519:

20100625-082613712010-06-2519:

20100625-081309382010-06-2519:

20100625-080138602010-06-2519:

20100625-079432382010-06-2519:

20100625-078681452010-06-2519:

20100625-077203032010-06-2518:

20100625-076533652010-06-2518:

20100625-075803612010-06-2518:

20100625-074761212010-06-2518:

20100625-073508112010-06-2518:

20100625-072360942010-06-2518:

20100625-071913812010-06-2517:

20100625-070065152010-06-2517:

20100625-069689052010-06-2517:

20100625-068622752010-06-2517:

20100625-067437852010-06-2517:

20100625-066435082010-06-2517:

20100625-065823072010-06-2516:

20100625-064630792010-06-2516:

20100625-063171782010-06-2516:

20100625-062296232010-06-2516:

20100625-061153362010-06-2516:

20100625-060253502010-06-2516:

20100625-059478022010-06-2515:

20100625-058022212010-06-2515:

20100625-057578882010-06-2515:

20100625-056410192010-06-2515:

20100625-055005002010-06-2515:

20100625-054095622010-06-2515:

20100625-053003372010-06-2514:

20100625-052694352010-06-2514:

20100625-051453142010-06-2514:

20100625-050450872010-06-2514:

20100625-049937432010-06-2514:

20100625-048562802010-06-2514:

20100625-047112072010-06-2513:

20100625-046556422010-06-2513:

20100625-045320702010-06-2513:

20100625-044676242010-06-2513:

20100625-043742272010-06-2513:

20100625-042508082010-06-2513:

20100625-041636572010-06-2512:

20100625-040019642010-06-2512:

20100625-039873152010-06-2512:

20100625-038867832010-06-2512:

20100625-037552632010-06-2512:

20100625-036595782010-06-2512:

20100625-035183962010-06-2511:

20100625-034678262010-06-2511:

20100625-033064462010-06-2511:

20100625-032439162010-06-2511:

20100625-031429442010-06-2511:

20100625-030631492010-06-2511:

20100625-029047562010-06-2510:

20100625-028565872010-06-2510:

20100625-027221102010-06-2510:

20100625-026522932010-06-2510:

20100625-025060402010-06-2510:

20100625-024979922010-06-2510:

20100625-023049152010-06-2501:

20100625-022812202010-06-2501:

20100625-021336832010-06-2501:

20100625-020411642010-06-2501:

20100625-019980972010-06-2501:

20100625-018056272010-06-2501:

20100625-017126362010-06-2501:

20100625-016046672010-06-2501:

20100625-015887182010-06-2501:

20100625-014590382010-06-2501:

20100625-013669442010-06-2501:

20100625-012076632010-06-2501:

20100625-011181992010-06-2500:

20100625-010379632010-06-2500:

20100625-009168432010-06-2500:

20100625-008310372010-06-2500:

20100625-007693502010-06-2500:

20100625-006597642010-06-2500:

20100625-005793752010-06-2500:

20100625-004414392010-06-2500:

20100625-003801382010-06-2500:

20100625-002427172010-06-2500:

20100625-001835542010-06-2500:

20100624-120612802010-06-2500:

20100624-119007472010-06-2423:

20100624-118614352010-06-2423:

20100624-117918162010-06-2423:

20100624-116294732010-06-2423:

20100624-115283882010-06-2423:

20100624-114890152010-06-2423:

20100624-113565862010-06-2423:

20100624-112920122010-06-2423:

20100624-111043332010-06-2423:

20100624-110613662010-06-2423:

20100624-109302312010-06-2423:

20100624-108325482010-06-2423:

20100624-107315222010-06-2422:

20100624-106347672010-06-2422:

20100624-105870772010-06-2422:

20100624-104965932010-06-2422:

20100624-103073992010-06-2422:

20100624-102617732010-06-2422:

20100624-101046222010-06-2422:

20100624-100753522010-06-2422:

20100624-099303352010-06-2422:

20100624-098394592010-06-2422:

20100624-097281542010-06-2422:

20100624-096200112010-06-2422:

20100624-095327782010-06-2421:

20100624-094382442010-06-2421:

20100624-093755902010-06-2421:

20100624-092813222010-06-2421:

20100624-091685412010-06-2421:

20100624-090818552010-06-2421:

20100624-089883212010-06-2420:

20100624-088952302010-06-2420:

20100624-087793022010-06-2420:

20100624-086086242010-06-2420:

20100624-085870482010-06-2420:

20100624-084827102010-06-2420:

程序运行结束，按任意键关闭窗口！

抓取Web网页数据分析（c#）

通过程序自动地读取其他网站网页上显示的信息，类似于爬虫程序。

比方说我们有一个系统，要提取BaiDu网站上歌曲搜索排名。

分析系统再根据得到的数据进行数据分析，

为业务提供参考数据。

为了完成以上的需求，我们就需要模拟浏览器浏览网页，得到页面的数据再进行分析，最后把分析的结果，即整理好的数据，写入数据库。

那么我们的思路就是：

　　1、发送HttpRequest请求。

　　2、接收HttpResponse返回的结果。

得到特定页面的html源文件。

　　3、取出包含数据的那一部分源码。

　　4、根据html源码生成HtmlDocument，循环取出数据。

　　5、写入数据库。

程序如下：

/// <summary>
/// Downloads the page at <paramref name="Url"/> and returns its HTML source,
/// decoding the response body as GB2312.
/// </summary>
/// <param name="Url">Address of the page to fetch.</param>
/// <returns>The page source, or an empty string when the request fails.</returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

        // Give up when no response arrives within 30 seconds (Timeout is in milliseconds).
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, its stream and the reader even when reading fails,
        // so the underlying connection is released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the failure (with its reason) and fall through to
        // return the empty string, as callers expect no exception from this method.
        MessageBox.Show(ex.Message, "出错");
    }

    return strResult;
}

为了使用 HttpWebRequest 和 HttpWebResponse，需添加命名空间引用：

　　using System.Net;

以下是程序具体实现过程：

privatevoidbutton1_Click（objectsender,EventArgse）

{

//要抓取的URL地址

stringUrl="

//得到指定Url的源码

　　　stringstr

展开阅读全文