Python网络爬虫实习报告python实习报告文档格式.docx
《Python网络爬虫实习报告python实习报告文档格式.docx》由会员分享,可在线阅读,更多相关《Python网络爬虫实习报告python实习报告文档格式.docx(7页珍藏版)》请在冰豆网上搜索。
文章主要图片;
<3>文章中嵌入的任何 YouTube/Vimeo 视频;
<4>元描述;
<5>元标签
五、数据爬取实战(豆瓣网爬取电影数据)
1分析网页
# Fetch the HTML source of every listing page.
def __getHtml(base_url="", last_offset=125, page_step=25):
    """Download the paginated listing and return each page's HTML text.

    Each request URL is built as ``base_url + <offset> + "&filter=" + <page>``,
    where the offset advances by ``page_step`` from 0 up to and including
    ``last_offset`` and the page number counts up from 1.

    NOTE(review): the listing this block was reconstructed from has an empty
    string where the site address should be (it appears to have been stripped
    during extraction), so ``base_url`` defaults to ``""`` and must be supplied
    by the caller for the requests to succeed.

    A custom ``User-Agent``/``Referer`` header installed through
    ``urllib.request.build_opener()`` was sketched (commented out) in the
    source as a fallback in case plain requests are rejected by the site.

    Args:
        base_url: Everything in the URL that precedes the numeric offset.
        last_offset: Largest offset to request (inclusive).
        page_step: Amount the offset advances per page.

    Returns:
        A list with one UTF-8 decoded HTML string per page, in request order.

    Raises:
        ValueError / urllib.error.URLError: propagated unchanged from
            ``urllib.request.urlopen``.
    """
    pages = []
    page_num = 1
    for offset in range(0, last_offset + 1, page_step):
        url = base_url + str(offset) + "&filter=" + str(page_num)
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(url) as response:
            pages.append(response.read().decode("utf-8"))
        page_num += 1
        # Progress output: the next offset and the next page number.
        print(offset + page_step, page_num)
    return pages
2爬取数据
def __getData(html):
    """Extract title, rank and director text for every movie on one page.

    The page is parsed with ``BeautifulSoup(html, "html.parser")``; each
    ``<li>`` under ``<ol class="grid_view">`` is treated as one movie.
    Extraction of the rating and of the number of voters is disabled
    (commented out) in the source listing and is therefore not performed.

    Args:
        html: HTML text of a single listing page.

    Returns:
        A dict with three parallel lists under the keys ``'title'``,
        ``'range_num'`` (rank text) and ``'movie_author'`` (director text).

    Raises:
        AttributeError: if an expected element is missing from the page
            (``find`` returned ``None``).
    """
    titles = []      # movie titles
    ranks = []       # rank text taken from <div class="pic"><em>
    directors = []   # director text

    soup = BeautifulSoup(html, "html.parser")
    for li in soup.find("ol", attrs={'class': 'grid_view'}).find_all("li"):
        titles.append(li.find("span", class_="title").text)

        # Look the rank up once; the source repeated this lookup three times.
        rank = li.find("div", class_='pic').find("em").text
        ranks.append(rank)

        credits = li.find("div", class_='bd').find("p", class_='').text.lstrip()
        # The director text ends where the cast marker "主" starts, or at the
        # "..." ellipsis when no cast marker is present.
        end = credits.find("主")
        if end == -1:
            end = credits.find("...")
        if end == -1:
            # Neither marker found: keep the rest of the text instead of
            # letting a -1 slice bound silently drop the last character.
            end = None
        print(rank)
        # The source compared the rank text with the integer 210, which can
        # never be equal; compare with the string so the special case applies.
        # NOTE(review): why rank 210 needs a fixed cut-off of 60 is not stated.
        if rank == "210":
            end = 60
        # The first four characters are skipped — presumably a "导演: " style
        # label in front of the director name; verify against the live page.
        directors.append(credits[4:end])

    return {
        'title': titles,
        'range_num': ranks,
        'movie_author': directors,
    }
3数据整理、转换
def __getMovies(data, output_path='F://douban_movie.html'):
    """Render the scraped movie data as an HTML table and write it to disk.

    Args:
        data: Iterable of per-page dicts, each holding the parallel lists
            ``'title'``, ``'range_num'`` and ``'movie_author'``. The source
            listing ignored this parameter and looped over the module-level
            ``datas`` instead; the parameter is now what gets rendered.
        output_path: Destination file; defaults to the path hard-coded in the
            source listing.

    Reads the module-level name ``nowtime`` for the timestamp line.
    NOTE(review): ``nowtime`` is not defined anywhere in the visible listing;
    it must be a string defined elsewhere in the module.

    Rows are produced by zipping the three lists, so a page with fewer than
    25 entries no longer raises ``IndexError``. The rating and vote-count
    columns are disabled (commented out) in the source and are not emitted.
    """
    from html import escape  # local import: scraped text is untrusted markup

    # ``with`` guarantees the file is closed even if a write fails.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("<html>")
        f.write("<head><meta charset='UTF-8'><title>Insert title here</title></head>")
        f.write("<body>")
        f.write("<h1>爬取豆瓣电影</h1>")
        f.write("<h4>作者:刘文斌</h4>")
        f.write("<h4>时间:" + nowtime + "</h4>")
        f.write("<hr>")
        f.write("<table width='800px' border='1' align=center>")
        f.write("<thead>")
        f.write("<tr>")
        f.write("<th><font size='5' color=green>电影</font></th>")
        f.write("<th width='50px'><font size='5' color=green>排名</font></th>")
        f.write("<th><font size='5' color=green>导演</font></th>")
        f.write("</tr>")
        f.write("</thead>")
        f.write("<tbody>")
        for page in data:
            rows = zip(page['title'], page['range_num'], page['movie_author'])
            for title, rank, director in rows:
                f.write("<tr>")
                f.write("<td style='color:orange;text-align:center'>%s</td>" % escape(title))
                f.write("<td style='color:red;text-align:center'>%s</td>" % escape(rank))
                f.write("<td style='color:black;text-align:center'>%s</td>" % escape(director))
                f.write("</tr>")
        f.write("</tbody>")
        # The source wrote a second "</thead>" here, after "</tbody>"; that
        # stray closing tag is dropped so the table markup is well-formed.
        f.write("</table>")
        f.write("</body>")
        f.write("</html>")
if __name__ == '__main__':
    # Download every listing page, parse each one, then render the results.
    # ``datas`` stays a module-level name because the report-writing step in
    # the source listing reads it as a global.
    datas = [__getData(page_html) for page_html in __getHtml()]
    __getMovies(datas)
4数据保存、展示
结果如后图所示:
5技术难点关键点
数据爬取实战(搜房网爬取房屋数据)
from html import escape

from bs4 import BeautifulSoup
import requests

# NOTE(review): the page address is an empty string in the source listing (it
# appears to have been stripped during extraction); fill it in before running.
rep = requests.get('')
rep.encoding = "gb2312"  # set the response encoding before reading .text
html = rep.text
soup = BeautifulSoup(html, 'html.parser')

# NOTE(review): the source shows '//fang.html'; the other listing in this
# report writes to 'F://…', so the drive prefix was presumably lost — confirm.
# ``with`` guarantees the file is closed even if parsing or writing fails.
with open('F://fang.html', 'w', encoding='utf-8') as f:
    f.write("<html>")
    f.write("<head><meta charset='UTF-8'><title>Insert title here</title></head>")
    f.write("<body>")
    f.write("<center><h1>新房成交TOP3</h1></center>")
    f.write("<table border='1px' width='1000px' height='800px' align=center><tr>")
    f.write("<th><h2>房址</h2></th>")
    f.write("<th><h2>成交量</h2></th>")
    f.write("<th><h2>均价</h2></th></tr>")

    for li in soup.find("ul", class_="ul02").find_all("li"):
        name = li.find("div", class_="pbtext").find("p").text
        chengjiaoliang = li.find("span", class_="red-f3").text
        try:
            # "ohter" is spelled exactly as in the source listing.
            junjia = li.find("div", class_="ohter").find("p", class_="gray-9")
        except AttributeError:
            # No <div class="ohter"> in this item: fall back to the element
            # that carries the price class directly.
            junjia = li.find("div", class_="gray-9")
        # The source has a commented-out ``.text.replace(..., '平方米')`` on both
        # lookups, so ``junjia`` is written as the element itself, not its text.

        f.write("<tr><td align=center><font size='5px' color=red>%s</font></td>" % escape(name))
        f.write("<td align=center><font size='5px' color=blue>%s</font></td>" % escape(chengjiaoliang))
        f.write("<td align=center><font size='5px' color=green>%s</font></td></tr>" % junjia)
        print(name)

    f.write("</table>")
    # The source listing is cut off inside the next write; the closing tags
    # are completed here to match the "<html>" and "<body>" opened above.
    f.write("</body>")
    f.write("</html>")
六、总结
教师评语:
成绩:
指导教师: