抓包分析技术
原创大约 3 分钟
Fiddler是一款免费的互联网通信调试工具,它位于客户端和服务器端之间,既可以用来充当服务器代理,也可以作为抓包工具捕捉每一条通信数据。
可以给Fiddler搭配一款专用的抓包浏览器,避免频繁设置的麻烦,例如,Firefox。

通过Fiddler监控页面变化,捕获Ajax动态请求地址,然后再以Python爬取数据。
# Scrape the names of reposting bloggers from today's trending Weibo news.
import re
from urllib import request

# Request headers; Content-Type mirrors what the Ajax endpoint serves.
header = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Content-Type": "application/javascript",
}
# Install a global opener carrying these headers so every urlopen() uses them.
# (No need for a manual loop: addheaders is just a list of (key, value) pairs.)
opener = request.build_opener()
opener.addheaders = list(header.items())
request.install_opener(opener)
for i in range(1, 4):
    # Ajax pagination: the page index is the trailing query parameter.
    url = "http://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26t%3D10%26q%3D%23%E8%BF%99%E4%BD%8D%E6%B8%85%E5%8D%8E%E5%8E%A8%E5%B8%88%E7%9A%84%E7%94%BB%E8%A2%AB%E5%A4%96%E4%BA%A4%E9%83%A8%E6%B0%B8%E4%B9%85%E6%94%B6%E8%97%8F%23&page_type=searchall&page=" + str(i)
    data = request.urlopen(url).read().decode("utf-8", "ignore")
    # Pull every screen_name value out of the JSON response text.
    pat = r'"screen_name":"(.*?)"'
    results = re.compile(pat, re.S).findall(data)
    for result in results:
        # Turn literal \uXXXX escape sequences back into readable characters.
        print(result.encode('utf-8').decode('unicode-escape'))
当页面地址没有变化时,说明该页面是通过POST方法实现调用的,因此只能通过Fiddler抓包来完成数据的爬取。
这期间还要注意有无隐藏变量或者动态生成的数值,例如,JWT、Token等。
'''
Step-by-step recovery of hidden request parameters discovered via packet capture.
'''
from urllib import request
from urllib import parse
import execjs
import http.cookiejar
import re
import uuid
import random


def _read_js(path):
    """Read one JavaScript helper file as text (closed via context manager)."""
    with open(path, "r") as fh:
        return fh.read()


# Client-side generated GUID, sent with every request.
guid = uuid.uuid4()

# Tracing the requests shows the page requires the dynamic variable ``vjkl5``,
# whose value is produced by a chain of JS functions; load and merge them.
js_all = _read_js("./base64.js") + _read_js("./md5.js") + _read_js("./getkey.js")

# Cookie-aware opener: the seed value of vjkl5 comes from a cookie set by the
# list page, so cookies must be captured. The Referer header mimics a browser.
cjar = http.cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cjar))
opener.addheaders = [("Referer", "http://xxx.com/list/list/?sorttype=...")]
request.install_opener(opener)

# User-agent pool for basic anti-scraping evasion.
uapools = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
]

# Hit the list page once so the server deposits the vjkl5 cookie into cjar.
request.urlopen("http://xxx.com/list/list/?sorttype=...").read().decode("utf-8", "ignore")

# Extract the vjkl5 value from the captured cookie jar.
pat = r"vjkl5=(.*?)\s"
matches = re.compile(pat, re.S).findall(str(cjar))
# Fall back to 0 when the cookie was not set (e.g. page layout changed).
vjkl5 = matches[0] if matches else 0
print("vjkl5:" + str(vjkl5))

# Splice the live cookie value into the JS source in place of the sample
# value captured during analysis, then evaluate getKey() to obtain vl5x.
js_all = js_all.replace("ce7c8849dffea151c0179187f85efc9751115a7b", str(vjkl5))
compile_js = execjs.compile(js_all)
vl5x = compile_js.call("getKey")
print("vl5x:" + str(vl5x))

# Endpoint that returns the actual list data we want to scrape.
list_url = "http://xxx.com/List/ListContent"
# Endpoint that issues the one-shot hidden "number" token.
code_url = "http://xxx.com/ValiCode/GetCode"

# Crawl each results page.
for i in range(0, 10):
    try:
        # Step 1: fetch the hidden "number" token for this page.
        numdata = parse.urlencode({
            "guid": guid,
        }).encode('utf-8')
        req = request.Request(code_url, numdata)
        req.add_header('User-Agent', random.choice(uapools))
        numdata = request.urlopen(req).read().decode("utf-8", "ignore")
        # Step 2: POST the full query, including all hidden variables.
        data = parse.urlencode({
            "Param": "XX类型:YYYY,年份:2024,地域:XX市",
            "Index": str(i + 1),
            "Page": "20",
            "Order": "层级",
            "Direction": "asc",
            # Hidden variable, obtained from the GetCode endpoint above.
            "number": str(numdata),
            # Variable we can generate ourselves.
            "guid": guid,
            # Hidden variable computed by the chained JS functions; its seed
            # comes from a cookie field on the home page.
            "vl5x": vl5x,
        }).encode('utf-8')
        # BUG FIX: the original reused the name ``url`` and overwrote it with
        # the GetCode endpoint inside the loop, so this POST was sent to the
        # wrong URL; distinct names keep the two endpoints apart.
        req = request.Request(list_url, data)
        req.add_header('User-Agent', random.choice(uapools))
        data = request.urlopen(req).read().decode("utf-8", "ignore")
        # Extract the document IDs from the response payload.
        pat = r'文书ID.*?".*?"(.*?)."'
        ids = re.compile(pat).findall(data)
        print(ids)
    except Exception as err:
        # Best-effort crawl: log the failure for this page and continue.
        print(err)
感谢支持
更多内容,请移步《超级个体》。