1、Python网络爬虫实习报告python实习报告Python网络爬虫实习报告一、选题背景二、爬虫原理三、爬虫历史和分类四、常用爬虫框架比较Scrapy框架:Scrapy框架是一套比较成熟的Python爬虫框架,是使用Python开发的快速、高层次的信息爬取框架,可以高效的爬取web页面并提取出结构化数据。Scrapy应用范围很广,爬虫开发、数据挖掘、数据监测、自动化测试等。Crawley框架:Crawley也是Python开发出的爬虫框架,该框架致力于改变人们从互联网中提取数据的方式。Portia框架:Portia框架是一款允许没有任何编程基础的用户可视化地爬取网页的爬虫框架。newspape
r框架:newspaper框架是一个用来提取新闻、文章以及内容分析的Python爬虫框架。Python-goose框架:Python-goose框架可提取的信息包括:文章主体内容;文章主要图片;文章中嵌入的任何Youtube/Vimeo视频;元描述;元标签。

五、数据爬取实战(豆瓣网爬取电影数据)

1 分析网页

    # 获取html源代码
    def _getHtml():
        data = []
        pageNum = 1
        pageSize = 0
        try:
            while pageSize <= 125:
                # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                #            'Referer': None}
                # 注意:如果依然不能抓取的话,这里可以设置抓取网站的host
                # opener = urllib.request.build_opener()
                # opener.addheaders = [headers]
                url = "https://movie.douban.com/top250?start=" + str(pageSize) + "&filter=" + str(pageNum)
                data.append(urllib.request.urlopen(url).read().decode("utf-8"))
                pageSize += 25
                pageNum += 1
                print(pageSize, pageNum)
        except Exception as e:
            raise e
        return data

2 爬取数据

    def _getData(html):
        title = []                 # 电影标题
        # rating_num = []          # 评分
        range_num = []             # 排名
        # rating_people_num = []   # 评价人数
        movie_author = []          # 导演
        data = {}
        # bs4解析html
        soup = BeautifulSoup(html, "html.parser")
        for li in soup.find("ol", attrs={"class": "grid_view"}).find_all("li"):
            title.append(li.find("span", class_="title").text)
            # rating_num.append(li.find("div", class_="star").find("span", class_="rating_num").text)
            range_num.append(li.find("div", class_="pic").find("em").text)
            # spans = li.find("div", class_="star").find_all("span")
            # for x in range(len(spans)):
            #     if x <= 2:
            #         pass
            #     else:
            #         rating_people_num.append(spans[x].string[-len(spans[x].string):-3])
            str = li.find("div", class_="bd").find("p", class_="").text.lstrip()
            index = str.find("主")
            if index == -1:
                index = str.find("...")
            print(li.find("div", class_="pic").find("em").text)
            if li.find("div", class_="pic").find("em").text == 210:
                index = 60
                # print("aaa")
            # print(str[4:index])
            movie_author.append(str[4:index])
        data["title"] = title
        # data["rating_num"] = rating_num
        data["range_num"] = range_num
        # data["rating_people_num"] = rating_people_num
        data["movie_author"] = movie_author
        return data

3 数据整理、转换

(注:原文f.write(...)调用中的HTML标签在转载时被页面提取工具剥除,以下标签内容系按上下文恢复的近似还原。)

    def _getMovies(data):
        f = open('F:/douban_movie.html', 'w', encoding='utf-8')
        f.write("<html><head><meta charset='UTF-8'><title>Insert title here</title></head>")
        f.write("<body>")
        f.write("<h1>爬取豆瓣电影</h1>")
        f.write("<b>作者:刘文斌</b>")
        f.write("<b>时间:" + nowtime + "</b>")
        f.write("<table border='1'>")
        f.write("<tr>")
        f.write("<th>电影</th>")
        # f.write("<th>评分</th>")
        f.write("<th>排名</th>")
        # f.write("<th>评价人数</th>")
        f.write("<th>导演</th>")
        f.write("</tr>")
        f.write("<tbody>")
        for data in datas:
            for i in range(0, 25):
                f.write("<tr>")
                f.write("<td>%s</td>" % data["title"][i])
                # f.write("<td>%s</td>" % data["rating_num"][i])
                f.write("<td>%s</td>" % data["range_num"][i])
                # f.write("<td>%s</td>" % data["rating_people_num"][i])
                f.write("<td>%s</td>" % data["movie_author"][i])
                f.write("</tr>")
        f.write("</tbody>")
        f.write("</table>")
        f.write("</body>")
        f.write("</html>")
        f.close()

    if __name__ == '__main__':
        datas = []
        htmls = _getHtml()
        for i in range(len(htmls)):
            data = _getData(htmls[i])
            datas.append(data)
        _getMovies(datas)

4 数据保存、展示

结果如后图所示。

5 技术难点关键点

数据爬取实战(搜房网爬取房屋数据)

    from bs4 import BeautifulSoup
    import requests

    rep = requests.get('http://newhouse.fang.com/top/')
    rep.encoding = "gb2312"  # 设置编码方式
    html = rep.text
    soup = BeautifulSoup(html, 'html.parser')
    f = open('F:/fang.html', 'w', encoding='utf-8')
    f.write("<html><head><meta charset='UTF-8'><title>Insert title here</title></head>")
    f.write("<body>")
    f.write("<h1>新房成交TOP3</h1>")
    f.write("<table border='1'>")
    f.write("<tr><th>房址</th><th>成交量</th><th>均价</th></tr>")
    for li in soup.find("ul", class_="ul02").find_all("li"):
        name = li.find("div", class_="pbtext").find("p").text
        chengjiaoliang = li.find("span", class_="red-f3").text
        try:
            junjia = li.find("div", class_="ohter").find("p", class_="gray-9").text.replace("㎡", "平方米")
        except Exception as e:
            junjia = li.find("div", class_="gray-9").text.replace("㎡", "平方米")
        f.write("<tr><td>%s</td>" % name)
        f.write("<td>%s</td>" % chengjiaoliang)
        f.write("<td>%s</td></tr>" % junjia)
        print(name)
    f.write("</table>")
    f.write("</body></html>")

六、总结

教师评语:

成绩:    指导教师:
copyright@ 2008-2022 冰豆网网站版权所有
经营许可证编号:鄂ICP备2022015515号-1