爬取微信文章
原创 · 大约 2 分钟
构建用户代理和IP池
'''
同时使用用户代理池和IP代理池
'''
from urllib import request
import time
# 调用动态IP接口
# ip_pools:动态IP池
# target_url:要爬取的目标网页地址
# api_url:动态IP接口地址
def agent_ip(ip_pools, target_url, api_url):
    """Fetch *target_url* through an HTTP proxy, rotating User-Agent strings.

    Parameters
    ----------
    ip_pools : str | int
        Proxy address ("host:port") to reuse, or 0 to request a fresh one
        from the dynamic-IP API and validate it before use.
    target_url : str
        URL of the page to crawl.
    api_url : str
        Dynamic-IP provider endpoint; its raw response body is used
        verbatim as the proxy address.

    Returns
    -------
    tuple[str, str]
        The proxy address actually used and the decoded page content.
    """
    import random
    uapools = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    ]

    def fetch_new_ip():
        """Call the dynamic-IP API and return the raw proxy address."""
        print("调用了接口")
        request.urlcleanup()
        return request.urlopen(api_url).read().decode("utf-8", "ignore")

    def install_proxy(proxy_ip):
        """Globally install an opener using *proxy_ip* and a random UA."""
        thisua = random.choice(uapools)
        print(thisua)
        print("当前用的IP是:" + proxy_ip)
        proxy = request.ProxyHandler({"http": proxy_ip})
        opener = request.build_opener(proxy, request.HTTPHandler)
        opener.addheaders = [("User-Agent", thisua)]
        request.install_opener(opener)

    if ip_pools == 0:
        # No usable proxy yet: keep requesting fresh IPs until one can
        # actually retrieve a full page.
        while True:
            current_ip = fetch_new_ip()
            print("提取IP完成")
            install_proxy(current_ip)
            print("正在验证IP有效性")
            try:
                data1 = request.urlopen("http://www.baidu.com").read().decode("utf-8", "ignore")
            except Exception:
                # An unreachable proxy raises instead of returning a short
                # page; treat it the same as an invalid IP.
                data1 = ""
            if len(data1) > 5000:
                print("当前IP有效")
                break
            print("当前IP无效,正在延时")
            time.sleep(60)
        # BUG FIX: return the freshly validated IP so callers can reuse it;
        # the original returned the incoming 0, forcing a new API call on
        # every subsequent request.
        ip_pools = current_ip
    else:
        install_proxy(ip_pools)
    data = request.urlopen(target_url).read().decode("utf-8", "ignore")
    return ip_pools, data
爬取微信文章
'''
爬取微信文章
'''
import re
import os
from wechat import *

# Search keyword, URL-encoded for the Sogou WeChat search engine.
key = "python爬虫"
key = request.quote(key)
name = "微信文章_" + str(key)
# Dynamic-IP provider endpoint (placeholder — supply a real API URL).
api = 'http://api.xxx.com/...'

# BUG FIX: open() does not expand "~", and the original never created the
# target directory, so every write failed. Expand and create it up front.
save_dir = os.path.expanduser("~/微信文章数据")
os.makedirs(save_dir, exist_ok=True)

for i in range(0, 100):
    url = ("http://weixin.sogou.com/weixin?oq=&query=" + key
           + "&type=2&page=" + str(i + 1) + "&ie=utf8")
    if i == 0:
        # First page: always acquire and validate a fresh proxy IP.
        # (Equivalent to the original `i % 3 == 0 and i == 0`.)
        ip_pools, data = agent_ip(0, url, api)
    elif i % 3 == 0:
        # Rotate to a new proxy every third page, after a cooldown.
        print("正在延时中...")
        time.sleep(15)
        print("延时完成,正在调取IP")
        ip_pools, data = agent_ip(0, url, api)
        print("IP调取完成")
    else:
        # Reuse the proxy returned by the previous call.
        ip_pools, data = agent_ip(ip_pools, url, api)
    print(url)
    print(len(data))
    # NOTE(review): '模式识别' is a placeholder — replace with the real
    # regex that extracts article links from the result page.
    pat1 = '模式识别'
    rst1 = re.compile(pat1, re.S).findall(data)
    if len(rst1) == 0:
        print("当前页爬取失败")
        continue
    for j in range(0, len(rst1)):
        # BUG FIX: clean the matched article link, not the search-page
        # `url` — the original refetched the listing page for every match.
        current_url = rst1[j].replace('amp;', "")
        print(current_url)
        ip_pools, current_data = agent_ip(ip_pools, current_url, api)
        print("文章爬取成功,长度为:" + str(len(current_data)))
        # Context manager guarantees the file is closed even if the
        # write raises.
        with open(os.path.join(save_dir, str(i) + str(j) + ".html"),
                  "w", encoding="utf-8") as fh:
            fh.write(current_data)
感谢支持
更多内容,请移步《超级个体》。