1、Portia 框架:Portia 框架是一款允许没有任何编程基础的用户可视化地爬取网页的爬虫框架。newspaper 框架:newspaper 框架是一个用来提取新闻、文章以及内容分析的 Python 爬虫框架。Python-goose 框架:Python-goose 框架可提取的信息包括:1文章主体内容;2文章主要图片;3文章中嵌入的任何 Youtube/Vimeo 视频;4元描述;5元标签。五、数据爬取实战(豆瓣网爬取电影数据)1 分析网页# 获取 html 源代码def getHtml(): data = pageNum = 1 pageSize = 0 try: while (pageS
2、ize = 125): # headers = User-Agent:Mozilla/5.0( Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11, # RefererNone #注意如果依然不能抓取的话,这里可以设置抓取网站的host # # opener = urllib.request.build_opener() # opener.addheaders = headers url = + str(pageSize) + &filter= + str(pageNu
3、m) # datahtml%s % i =urllib.request.urlopen(url).read().decode(utf-8) data.append(urllib.request.urlopen(url).read().decode(utf- 8) pageSize += 25 pageNum += 1 print(pageSize, pageNum) except Exception as e: raise e return data 2 爬取数据def getData(html): title = # 电影标题 #rating_num = # 评 分 range_num =
4、# 排 名 #rating_people_num = # 评价人数 movie_author = # 导 演 data = # bs4 解析 html soup = BeautifulSoup(html, html.parser for li in soup.find(ol, attrs=class grid_view).find_all(li): title.append(li.find(span, class_=title).text) #rating_num.append(li.find(div, class_=star).find(rating_num range_num.append
5、(li.find(picem #spans = li.find(class_=).find_all( #for x in range(len(spans): # if x headmeta charset=UTF-8titleInsert title here/headbodyh1爬取豆瓣电影h4作者:刘文斌时间: + nowtime + hrtable width=800px border=1 align=centertheadtrthfont size=5 color=green电影/font/th #f.write(th width=50px评分 f.write( 排名100px评价人数/thead) f.write(tbody for data in datas: for i in range(0, 25): f.write(td style=color:orange;text-align:center%s % datai) # f.write(td style=blue; f.write(red;i)
copyright@ 2008-2022 冰豆网网站版权所有
经营许可证编号:鄂ICP备2022015515号-1