# Qiushibaike (糗事百科) crawler source code, version 1.0
'''
#=============================================================================
# FileName: qiushibaike.py
# Desc:
# Author: modys
# Email: http://www.modys.top
# HomePage: http://www.modys.top
# Version: 1.0
# LastChange: 2020-08-23 19:11:02
# History:
#=============================================================================
'''
# Scrape the text jokes (段子) from Qiushibaike (糗事百科).
from urllib import request
import re
# Header installed on the global opener: requests are sent with a desktop
# Firefox User-Agent string instead of urllib's default one.
HEADERS = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0')

# Destination text file for the scraped jokes (overwritten on every run).
OUTPUT_PATH = 'C:\\Users\\Administrator\\Desktop\\qiushibaike.txt'

# Listing pages are addressed as BASE_URL + <1-based page number>.
BASE_URL = 'https://www.qiushibaike.com/text/page/'

# Number of listing pages fetched per run (pages 1..PAGE_COUNT).
PAGE_COUNT = 5

# Text written after every joke so entries are visually separated.
SEPARATOR = "\n-----------\n"

# Captures the text of the first <span> after each <div class="content">.
# re.S lets "." cross newlines, since the markup spans several lines.
# Compiled once here instead of once per page.
JOKE_PATTERN = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</a>', re.S)


def extract_jokes(html):
    """Return the joke texts found in one page of HTML.

    Args:
        html: Decoded HTML source of a listing page.

    Returns:
        A list with one string per match of JOKE_PATTERN, in document
        order, each with leading/trailing whitespace stripped. Any inner
        markup captured by the pattern is returned as-is. The list is
        empty when nothing matches.
    """
    return [raw.strip() for raw in JOKE_PATTERN.findall(html)]


def fetch_page(url):
    """Download *url* and return its body decoded as UTF-8.

    Undecodable bytes are dropped ('ignore') rather than raising. The HTTP
    response is closed before returning. Network errors raised by
    ``urlopen`` propagate to the caller.
    """
    with request.urlopen(url) as response:
        return response.read().decode('utf-8', 'ignore')


def main(output_path=OUTPUT_PATH, page_count=PAGE_COUNT):
    """Scrape the first *page_count* listing pages into *output_path*.

    For each page the length of the decoded HTML and the page URL are
    printed, then every extracted joke is written to the output file
    followed by SEPARATOR.

    Args:
        output_path: File to (over)write, encoded as UTF-8.
        page_count: How many pages to fetch, starting from page 1.
    """
    # Install the opener globally so request.urlopen() sends HEADERS.
    opener = request.build_opener()
    opener.addheaders = [HEADERS]
    request.install_opener(opener)

    # The context manager guarantees the file is flushed and closed even
    # when a download fails part-way through the run.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for page_number in range(1, page_count + 1):
            url = BASE_URL + str(page_number)
            html = fetch_page(url)
            print(len(html))
            print(url)
            for joke in extract_jokes(html):
                out_file.write(joke + SEPARATOR)


if __name__ == '__main__':
    main()