Web Crawlers
Quick start
from urllib import request
import re
url = "http://www.baidu.com"
result = request.urlopen(url)
# Print the HTTP status code
print(result.code)
# Three quick ways to fetch a page
# Fetch into memory (method 1)
data1 = result.read().decode("utf-8", "ignore")
print(data1)
# Fetch into memory (method 2)
data2 = request.urlopen(request.Request(url)).read().decode("utf-8", "ignore")
print(data2)
# Save directly to disk
request.urlretrieve(url, filename="C:/baidu.html")
# Crawl imooc course pages in a loop
for i in range(100, 200):
    url = "https://coding.imooc.com/class/" + str(i) + ".html"
    data = request.urlopen(url).read().decode("utf-8", "ignore")
    title = '<h1>(.*?)</h1>'
    price = '<div class="cur-price">¥(.*?)</div>'
    title_result = re.compile(title, re.S).findall(data)
    price_result = re.compile(price, re.S).findall(data)
    if len(title_result) > 0:
        print(title_result[0])
    else:
        continue
    if len(price_result) > 0:
        print(price_result[0])
    else:
        print("免费")
    print("------------------------")
The urllib module
urllib is one of the most commonly used tools for making network requests in Python.
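All of the examples below call urlopen() directly, without a timeout or any error handling. A minimal defensive sketch, using only the standard library (the fetch() helper is just an illustration, not part of the original code):
from urllib import request, error
def fetch(url, timeout=10):
    # Return the decoded page body, or None if the request fails
    try:
        with request.urlopen(url, timeout=timeout) as rsp:
            return rsp.read().decode("utf-8", "ignore")
    except error.HTTPError as e:
        print("HTTP error:", e.code)
    except error.URLError as e:
        print("URL error:", e.reason)
    return None
html = fetch("http://www.baidu.com")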
搜索关键词
因为现在度娘增加了安全验证,所以下面的代码爬不出任何东西,我也懒得去整活,只不过方法还是可以参考借鉴的。
后面可以用Google来代替它。
# Automatically search for a keyword and page through the results
# Keyword: python爬虫
# Initial search URL copied from the browser
baiduurl = "http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=python%E7%88%AC%E8%99%AB&oq=python%25E7%2588%25AC%25E8%2599%25AB&rsv_pq=a3c1791b004fa253&rsv_t=88b3TunpVfdXbxyMRDLM19Qb%2F%2FHLq0TlVLpcW8UK8%2BiFX2JrXYoNyhZJe2Y&rqlang=cn&rsv_enter=1&rsv_dl=tb&rsv_sug3=1&rsv_sug1=1&rsv_sug7=100&rsv_sug2=0&rsv_btype=t&inputT=3&rsv_sug4=251"
# Comparing the URLs shows that only two parameters actually matter: "wd" (the keyword) and "pn" (the page offset)
# Page 1
baiduurl1 = "https://www.baidu.com/s?wd=python%E7%88%AC%E8%99%AB"
# Page 2
baiduurl2 = "https://www.baidu.com/s?wd=python%E7%88%AC%E8%99%AB&pn=10"
# Page 3
baiduurl3 = "https://www.baidu.com/s?wd=python%E7%88%AC%E8%99%AB&pn=20"
# Rebuild the URL from these parameters and crawl the first 10 pages
key = request.quote("python爬虫")
baidu = "http://www.baidu.com/s?wd=" + key
for i in range(0, 10):
    searchurl = baidu + "&pn=" + str(i * 10)
    print("正在爬取第" + str(i + 1) + "页数据")
    # Extract each result title from the page with a regular expression
    data = request.urlopen(searchurl).read().decode("utf-8", "ignore")
    pat = '"title":"(.*?)"'
    result = re.compile(pat, re.S).findall(data)
    # Print every extracted title
    for j in range(0, len(result)):
        print("第" + str(j) + "条网页标题是:" + str(result[j]))
    print("------------------------")
POST requests
POST requests are tested against a locally deployed Web Tours Application site.

from urllib import request
from urllib import parse
import os
url = "http://172.16.185.181:1080/cgi-bin/welcome.pl"
data = parse.urlencode({
    "username": "test1",
    "password": "123456",
}).encode('utf-8')
req = request.Request(url, data)
result = request.urlopen(req).read()
# open() does not expand "~", so expand the home directory explicitly
file = open(os.path.expanduser("~/post.html"), "wb")
file.write(result)
file.close()
The request succeeds and the file is saved, but because no cookie is sent the page does not render completely.
Cookie handling
Now add cookie handling to the previous code.
from urllib import request
from urllib import parse
import http.cookiejar
import os
# Cookie handling
cjar = http.cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cjar))
# Install the opener globally so urlopen() uses it
request.install_opener(opener)
url = "http://172.16.185.181:1080/cgi-bin/welcome.pl"
data = parse.urlencode({
    "username": "test1",
    "password": "123456",
}).encode('utf-8')
req = request.Request(url, data)
result = request.urlopen(req).read()
# open() does not expand "~", so expand the home directory explicitly
file = open(os.path.expanduser("~/post.html"), "wb")
file.write(result)
file.close()
# Print the captured cookies
print(str(cjar))
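Printing str(cjar) only gives a compact summary. The CookieJar is iterable, so individual cookies can also be inspected, which helps when checking whether the login cookie was actually set:
# List each captured cookie with its main attributes
for cookie in cjar:
    print(cookie.name, "=", cookie.value, "| domain:", cookie.domain, "| path:", cookie.path)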
Browser disguise
By adding HTTP headers to its requests, a crawler can disguise itself as a regular browser.
from urllib import request
url = "https://www.qiushibaike.com/"
# Header format: header = ("User-Agent", user agent string)
# Method 1: add the header through an opener
header = ("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
opener = request.build_opener()
opener.addheaders = [header]
data1 = opener.open(url).read().decode("utf-8", "ignore")
# Install the opener globally
request.install_opener(opener)
result = request.urlopen(url).read().decode("utf-8", "ignore")
# Method 2: add several headers at once from a dictionary (the second entry is only there to demonstrate multiple headers)
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Content-Type": "application/javascript",
}
opener = request.build_opener()
head_all = []
for key, value in header.items():
    item = (key, value)
    head_all.append(item)
opener.addheaders = head_all
request.install_opener(opener)
data2 = request.urlopen(url).read().decode("utf-8", "ignore")
# Method 3: add the header on the Request object itself
req = request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
data3 = request.urlopen(req).read().decode("utf-8", "ignore")
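These techniques compose: build_opener() can take the cookie handler from the previous section while the opener also carries default headers, so a single globally installed opener provides both the browser disguise and cookie persistence (the "Crawling Google" example below uses exactly this combination). A minimal sketch:
from urllib import request
import http.cookiejar
cjar = http.cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cjar))
opener.addheaders = [("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")]
request.install_opener(opener)
# Every request.urlopen() call now sends the User-Agent and keeps cookies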
Writing to a database
Install the pymysql driver with pip first. Instead of editing charset="" in ~/site-packages/pymysql/connections.py as the original note suggested, the character set can simply be passed as charset="utf8mb4" when calling pymysql.connect().
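The insert below assumes the test database already contains an imooc table. The original does not show its definition, so the schema here is only a plausible assumption; note that the character set is passed to connect() rather than patched into the driver:
import pymysql
# Hypothetical schema for the imooc table used by the crawler below
conn = pymysql.connect(host="127.0.0.1", user="root", password="123456", db="test", charset="utf8mb4")
with conn.cursor() as cursor:
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS imooc ("
        " id INT AUTO_INCREMENT PRIMARY KEY,"
        " title VARCHAR(255),"
        " price VARCHAR(32)"
        ") DEFAULT CHARSET=utf8mb4"
    )
conn.commit()
conn.close()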
from urllib import request
import re
import pymysql
# Pass charset="utf8mb4" so Chinese titles are stored correctly
conn = pymysql.connect(host="127.0.0.1", user="root", password="123456", db="test", charset="utf8mb4")
for i in range(0, 1000):
    url = "https://coding.imooc.com/class/" + str(i) + ".html"
    data = request.urlopen(url).read().decode("utf-8", "ignore")
    title = '<h1>(.*?)</h1>'
    price = '<div class="cur-price">¥(.*?)</div>'
    title_result = re.compile(title, re.S).findall(data)
    price_result = re.compile(price, re.S).findall(data)
    if len(title_result) > 0:
        print(title_result[0])
    else:
        continue
    if len(price_result) > 0:
        price_value = price_result[0]
    else:
        # Courses without a price block are free
        price_value = "免费"
    print(price_value)
    # Insert the record with a parameterized query instead of string formatting
    with conn.cursor() as cursor:
        cursor.execute("INSERT INTO imooc(title, price) VALUES(%s, %s)", (str(title_result[0]), str(price_value)))
    conn.commit()
    print("------------------------")
The requests module
The requests module, like urllib, is a tool for making network requests in Python.
Install it from the command line first.
> pip install requests
Then use requests to fetch pages.
import requests
import re
rsp1 = requests.get("https://www.imooc.com/")
# Extract the cookies returned by the first response
ck = requests.utils.dict_from_cookiejar(rsp1.cookies)
title = re.compile("<title>(.*?)</title>", re.S).findall(rsp1.text)
print(title)
# Set browser headers
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
}
# # Optional proxy settings (pass proxies=proxy to requests.get to use them)
# proxy = {
#     "http": "http://127.0.0.1:8888",
#     "https": "http://127.0.0.1:8888",
# }
rsp2 = requests.get("https://www.imooc.com/", headers=header, cookies=ck)
title = re.compile("<title>(.*?)</title>", re.S).findall(rsp2.text)
print(title)
# GET with query parameters
key = {
    "q": "python",
}
rsp3 = requests.get("https://www.google.com.hk/search", headers=header, cookies=ck, params=key)
title = re.compile("<title>(.*?)</title>", re.S).findall(rsp3.text)
print(title)
# POST with form data
data = {
    "username": "test1",
    "password": "123456",
}
rsp4 = requests.post("http://172.16.185.181:1080/cgi-bin/welcome.pl", data=data)
title = re.compile("<title>(.*?)</title>", re.S).findall(rsp4.text)
print(title)
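Copying cookies around with dict_from_cookiejar works, but a requests.Session keeps cookies and default headers across calls automatically, which is usually the simpler pattern. A brief sketch:
import requests
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
})
rsp1 = session.get("https://www.imooc.com/")
# Cookies set by the first response are stored on the session...
print(session.cookies.get_dict())
# ...and sent automatically with every later request made through it
rsp2 = session.get("https://www.imooc.com/")
print(rsp2.status_code)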
The beautifulsoup module
BeautifulSoup plays a similar role to regular expressions, but it filters and extracts information mainly through the HTML tags themselves.
Install it from the command line first.
> pip install bs4
Then use it to process the fetched data.
from bs4 import BeautifulSoup as bs
from urllib import request
data = request.urlopen("https://www.imooc.com/").read().decode("utf-8", "ignore")
# Specify a parser explicitly to avoid the "no parser" warning
bs1 = bs(data, "html.parser")
# Pretty-print the document
print(bs1.prettify())
# Get a tag: bs_object.tag_name
bs1.title
# Get the text inside a tag: bs_object.tag_name.string
bs1.title.string
# Get the tag's name: bs_object.tag_name.name
bs1.title.name
# Get the tag's attribute dictionary: bs_object.tag_name.attrs
bs1.a.attrs
# Get the value of one attribute: bs_object.tag_name[attr] or bs_object.tag_name.get(attr)
bs1.a["class"]
bs1.a.get("class")
# Get every occurrence of a tag: bs_object.find_all('tag')
bs1.find_all('a')
# Get several kinds of tags at once: bs_object.find_all(['tag1', 'tag2', ..., 'tagN'])
bs1.find_all(['a', 'ul'])
# Get all child nodes as a list: bs_object.tag.contents
k1 = bs1.ul.contents
# Get an iterator over the child nodes: bs_object.tag.children
k2 = bs1.ul.children
allulc = [i for i in k2]
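Besides attribute access and find_all, elements can be located by attribute filters or CSS selectors, which often maps more directly onto how the page is styled. A short sketch against the same bs1 object (the target="_blank" filter is only illustrative and may match nothing on this particular page):
# Find tags whose attributes match the given values
links = bs1.find_all("a", attrs={"target": "_blank"})
# CSS selector: every <a> that sits inside a <ul>
items = bs1.select("ul a")
for a in items[:5]:
    # get_text() returns the visible text, get("href") the link target
    print(a.get_text(strip=True), a.get("href"))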
See the official documentation for more operations.
Code examples
Crawling Google
from urllib import request
import re
import http.cookiejar
# Disguise as a browser and enable cookie handling
header = ("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
cjar = http.cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cjar))
opener.addheaders = [header]
# Install the opener globally
request.install_opener(opener)
# Search for a keyword and page through the results
# Keyword: python爬虫
# Initial search URL copied from the browser
google_url = "https://www.google.com.hk/search?q=python&newwindow=1&sca_esv=0c814c2424608e41&source=hp&ei=wqJBZ8nOG8yBvr0P5PKY8AM&iflsig=AL9hbdgAAAAAZ0Gw0i-0e2grD7EgEGMlpmnYeMTARPx2&ved=0ahUKEwjJt46ulPKJAxXMgK8BHWQ5Bj4Q4dUDCA0&uact=5&oq=python&gs_lp=Egdnd3Mtd2l6IgZweXRob24yChAAGIAEGEMYigUyChAAGIAEGEMYigUyChAAGIAEGEMYigUyChAAGIAEGEMYigUyChAAGIAEGEMYigUyChAAGIAEGEMYigUyChAAGIAEGEMYigUyChAAGIAEGEMYigUyChAAGIAEGEMYigUyChAAGIAEGEMYigVI5AhQAFj0BnAAeACQAQCYAWqgAYMEqgEDNS4xuAEDyAEA-AEBmAIGoAKaBMICBRAAGIAEmAMAkgcDNC4yoAeSEQ&sclient=gws-wiz"
# Comparing the URLs shows that only two parameters actually matter: "q" (the keyword) and "start" (the page offset)
# Page 1
google_url1 = "https://www.google.com.hk/search?q=python%E7%88%AC%E8%99%AB"
# Page 2
google_url2 = "https://www.google.com.hk/search?q=python%E7%88%AC%E8%99%AB&start=10"
# Page 3
google_url3 = "https://www.google.com.hk/search?q=python%E7%88%AC%E8%99%AB&start=20"
# Rebuild the URL from these parameters and crawl the first 10 pages
# URL-encode the Chinese keyword, otherwise the request fails
key = request.quote("python爬虫")
google = "https://www.google.com.hk/search?q=" + key
for i in range(0, 10):
    searchurl = google + "&start=" + str(i * 10)
    print("正在爬取第" + str(i + 1) + "页数据")
    # Extract each result title from the page with a regular expression
    data = request.urlopen(searchurl).read().decode("utf-8", "ignore")
    pat = '<h3 class="LC20lb MBeuO DKV0Md">(.*?)</h3>'
    result = re.compile(pat, re.S).findall(data)
    # Print every extracted title
    for j in range(0, len(result)):
        print("第" + str(j) + "条网页标题是:" + str(result[j]))
    print("\n")
    print("------------------------")
Crawling 17173
from urllib import request
import ssl
import re
# Ignore HTTPS certificate errors
ssl._create_default_https_context = ssl._create_unverified_context
class Spider():
    '''
    Target URL: https://v.17173.com/so-index.html?code=SYPC
    The parts to crawl look like the block below; the requirement is to rank the items by view count and print them sorted by views.
    <li class="item">
        <a href="//v.17173.com/v_1_1/NDAwNzExNjk.html">
            <p class="pic-box">
                <img src="//i.17173cdn.com/gdthue/YWxqaGBf/8tha/UnhuzvbsdppxpuD.jpg" alt="">
                <span class="duration">00:32</span>
                <span class="view">2,129</span>
            </p>
            <p class="txt-box"> 《魔域》跨服家族战8强出炉!凯诺克斯“赤龙”战椅助力巅峰之战</p>
        </a>
    </li>
    '''
    url = 'https://v.17173.com/so-index.html?code=SYPC'
    root_pattern = r'<li class="item">([\s\S]*?)</li>'
    view_pattern = r'<span class="view">([\s\S]*?)</span>'
    text_pattern = r'<p class="txt-box">([\s\S]*?)</p>'
    def __fetch_content(self):
        htmls = request.urlopen(self.url).read()
        return str(htmls, encoding='utf-8')
    def __analysis(self, htmls):
        root_html = re.findall(self.root_pattern, htmls)
        results = []
        # Parse each block again for the view count and the title
        for html in root_html:
            view = re.findall(self.view_pattern, html)
            text = re.findall(self.text_pattern, html)
            result = {
                # Convert strings such as "2,129" to integers
                'view': int(str(view[0]).strip().replace(',', '')),
                'text': str(text[0]).strip()
            }
            results.append(result)
        return results
    def go(self):
        htmls = self.__fetch_content()
        results = self.__analysis(htmls)
        # Sort the list of dicts by view count (ascending)
        results = sorted(results, key=lambda x: x['view'])
        return results
spider = Spider()
results = spider.go()
print("页面输出结果:")
for result in results:
    print("观看人数:", result['view'], "\t内容标题:", result['text'])
The output looks like this.
页面输出结果:
观看人数: 2131 内容标题: 《魔域》跨服家族战8强出炉!凯诺克斯“赤龙”战椅助力巅峰之战
观看人数: 8595 内容标题: “1”起冒险再出发!合金周年狂欢418震撼开启
观看人数: 9192 内容标题: 《火影忍者:忍者新世代》宇智波一族的天才
观看人数: 9384 内容标题: 玩法升级畅享真实赛场体验,《FC足球世界》玩法制作人Timo带你玩转新版本!
观看人数: 10086 内容标题: 《光与夜之恋》活动PV:漫漫春居
观看人数: 12335 内容标题: 《光与夜之恋》活动PV:暖日闲居
观看人数: 12909 内容标题: 断魂针,绝情剑!《剑侠世界:起源》新门派古墓技能首曝
观看人数: 13883 内容标题: 《光与夜之恋》陆沉生日活动:如梦长栖
观看人数: 20766 内容标题: 【张卫健主演】五毒现世!西山居《剑侠世界:起源》江湖系列片
观看人数: 20963 内容标题: 【张卫健主演】五毒现世!西山居《剑侠世界:起源》江湖系列片
观看人数: 21019 内容标题: 【张卫健主演】五毒现世!西山居《剑侠世界:起源》江湖系列片
观看人数: 22305 内容标题: 亿万剑侠的江湖起源!西山居《剑侠世界:起源》年度版今日公测
观看人数: 25158 内容标题: 国产单机大厂进军二次元!《白荆回廊》凭什么杀出重围?
观看人数: 26039 内容标题: 张卫健主演!《剑侠世界:起源》年度版武侠片花絮曝光
观看人数: 30356 内容标题: 古墓登场,剑指铁浮城!《剑侠世界:起源》新资料片震撼公布
观看人数: 30584 内容标题: 张卫健主演!《剑侠世界:起源》江湖系列大片预告曝光!
观看人数: 34063 内容标题: 【张卫健主演】五毒现世!西山居《剑侠世界:起源》江湖系列片
观看人数: 36251 内容标题: 新门派霸刀!《剑侠世界3》绝世霸刀资料片今日上线
观看人数: 38911 内容标题: 仗剑同行,两载回忆!《剑侠世界3》2周年视频点燃剑侠情
观看人数: 89165 内容标题: 金银匠心,剑光璀璨! 非遗大师助阵《剑侠世界:起源》武林大会
Crawling huya
Now crawl another site.
from urllib import request
import ssl
import re
import gzip
from io import BytesIO
import json
# Ignore the HTTPS certificate error on macOS
ssl._create_default_https_context = ssl._create_unverified_context
class Spider():
    '''
    Target URL: https://www.huya.com/g/wzry
    After decompressing, the interesting part of the page is the JSON array inside var ALL_LIST_DATA = [...]; one entry looks like this:
{
"lUid": 1354160941,
"lYyid": 1496115569,
"sNick": "哈迪斯第一大司命李白",
"iSex": 1,
"iLevel": 40,
"sAvatar180": "https://huyaimg.msstatic.com/avatar/1025/47/ebd8d06fcad9e3048cfac92e515c48_180_135.jpg?1647435888",
"lProfileRoom": 12366,
"sPrivateHost": "haddis",
"sProfileHomeHost": "haddis",
"iIsPlatinum": 1,
"lActivityId": 0,
"lActivityCount": 9333105,
"iGid": 2336,
"iGameId": 0,
"sGameFullName": "王者荣耀",
"sGameHostName": "wzry",
"iBussType": 3,
"lLiveId": "7403136622269811153",
"lChannel": 1354160941,
"lLiveChannel": 1354160941,
"lUserCount": 2549020,
"lTotalCount": 2549020,
"sRoomName": "",
"sIntroduction": "国一大司命单排100星冲教学~",
"sPreviewUrl": "",
"iLiveSourceType": 0,
"iScreenType": 1,
"sScreenshot": "http://live-cover.msstatic.com/huyalive/1354160941-1354160941-5816076955115585536-2708445338-10057-A-0-1-imgplus/20240815113234.jpg",
"iIsSecret": 0,
"iCameraOpen": 0,
"iIsBluRay": 1,
"sBluRayMBitRate": "10M",
"iBitRate": 10000,
"lLiveCompatibleFlag": 0,
"iRecommendStatus": 0,
"sRecommendTagName": "",
"iIsRoomPay": 0,
"sRoomPayTag": "",
"iIsWatchTogetherVip": 0,
"iStartTime": 1723677065,
"iTime": 15743,
"iUpdateCacheTime": 1723692808,
"mpCorner": {
"_kproto": {
"_classname": "string"
},
"_bKey": 0,
"_bValue": 0,
"value": {
"ListPos2": {
"sContent": "李白",
"sIcon": "",
"sBackImage": "",
"_classname": "LiveList.CornerInfo"
}
},
"_classname": "map<string,LiveList.CornerInfo>"
},
"tImgRecInfo": {
"sType": "",
"sValue": "",
"sTypeDesc": "",
"_classname": "LiveList.ImgRecInfo"
},
"iIsGaming": 0,
"_classname": "LiveList.LiveListInfo"
},
    '''
    url = 'https://www.huya.com/g/wzry'
    root_pattern = r'var ALL_LIST_DATA = \[([\s\S]*?)\]'
    def __fetch_content(self):
        data_bytes = request.urlopen(self.url).read()
        # The response body is gzip-compressed, so decompress it manually
        buff = BytesIO(data_bytes)
        f = gzip.GzipFile(fileobj=buff)
        return f.read().decode('utf-8')
    def __analysis(self, data):
        list_result = []
        root_htmls = re.findall(self.root_pattern, data)
        root_html = root_htmls[0]
        # Split on the trailing field sequence ',"tImgRecInfo":{"sType":"","sValue":"","sTypeDesc":"","_classname":"LiveList.ImgRecInfo"},"iIsGaming":0,"_classname":"LiveList.LiveListInfo"' so that each piece is still (almost) a complete JSON object
        words = root_html.split(',"tImgRecInfo":{"sType":"","sValue":"","sTypeDesc":"","_classname":"LiveList.ImgRecInfo"},"iIsGaming":0,"_classname":"LiveList.LiveListInfo"')
        for index, word in enumerate(words):
            # Strip the "}," left over from the previous object and restore the missing closing brace
            if word[0:2] == "},":
                word = word[2:]
            list_result.append(word + "}")
        # Drop the trailing fragment after the last object
        list_result.pop()
        # Parse each JSON string into a Python dict
        json_result = []
        for word in list_result:
            json_obj = json.loads(word)
            json_result.append(json_obj)
        return json_result
    def go(self):
        data = self.__fetch_content()
        results = self.__analysis(data)
        return results
# Print the data
spider = Spider()
results = spider.go()
for result in results:
    print(result)
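Splitting the captured text on a fixed sequence of field names is fragile: if Huya reorders or renames any of those fields, the delimiter no longer matches. Since ALL_LIST_DATA is embedded as a JSON-like array literal, an alternative sketch is to capture the whole array up to the closing ]; and parse it with a single json.loads call (this assumes the literal is valid JSON and contains no "];" sequence inside it; parse_all_list is a hypothetical helper):
import json
import re
# Capture everything between "var ALL_LIST_DATA =" and the closing "];"
array_pattern = r'var ALL_LIST_DATA\s*=\s*(\[[\s\S]*?\]);'
def parse_all_list(page_text):
    m = re.search(array_pattern, page_text)
    if m is None:
        return []
    # The captured group is the whole array, so one json.loads call parses every entry
    return json.loads(m.group(1))
With this helper, __analysis could simply return parse_all_list(data) instead of the split-and-repair logic above.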