爬取微信文章
原创 · 大约 2 分钟
构建用户代理和IP池
'''
同时使用用户代理池和IP代理池
'''
from urllib import request
import time
# 调用动态IP接口
# ip_pools:动态IP池
# target_url:要爬取的目标网页地址
# api_url:动态IP接口地址
def agent_ip(ip_pools, target_url, api_url):
    """Fetch *target_url* through an HTTP proxy, rotating User-Agent strings.

    Parameters
    ----------
    ip_pools : str | int
        Proxy address ("host:port") to reuse, or 0 to request a fresh one
        from the dynamic-IP API and validate it before use.
    target_url : str
        URL of the page to crawl.
    api_url : str
        Dynamic-IP provider endpoint; its raw response body is used
        verbatim as the proxy address.

    Returns
    -------
    tuple[str, str]
        The proxy address actually used and the decoded page content.
    """
    import random
    uapools = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    ]

    def fetch_new_ip():
        """Call the dynamic-IP API and return the raw proxy address."""
        print("调用了接口")
        request.urlcleanup()
        return request.urlopen(api_url).read().decode("utf-8", "ignore")

    def install_proxy(proxy_ip):
        """Globally install an opener using *proxy_ip* and a random UA."""
        thisua = random.choice(uapools)
        print(thisua)
        print("当前用的IP是:" + proxy_ip)
        proxy = request.ProxyHandler({"http": proxy_ip})
        opener = request.build_opener(proxy, request.HTTPHandler)
        opener.addheaders = [("User-Agent", thisua)]
        request.install_opener(opener)

    if ip_pools == 0:
        # No usable proxy yet: keep requesting fresh IPs until one can
        # actually retrieve a full page.
        while True:
            current_ip = fetch_new_ip()
            print("提取IP完成")
            install_proxy(current_ip)
            print("正在验证IP有效性")
            try:
                data1 = request.urlopen("http://www.baidu.com").read().decode("utf-8", "ignore")
            except Exception:
                # An unreachable proxy raises instead of returning a short
                # page; treat it the same as an invalid IP.
                data1 = ""
            if len(data1) > 5000:
                print("当前IP有效")
                break
            print("当前IP无效,正在延时")
            time.sleep(60)
        # BUG FIX: return the freshly validated IP so callers can reuse it;
        # the original returned the incoming 0, forcing a new API call on
        # every subsequent request.
        ip_pools = current_ip
    else:
        install_proxy(ip_pools)
    data = request.urlopen(target_url).read().decode("utf-8", "ignore")
    return ip_pools, data
爬取微信文章
'''
爬取微信文章
'''
import re
import os
from wechat import *

# Search keyword, URL-encoded for the Sogou WeChat search engine.
key = "python爬虫"
key = request.quote(key)
name = "微信文章_" + str(key)
# Dynamic-IP provider endpoint (placeholder — supply a real API URL).
api = 'http://api.xxx.com/...'

# BUG FIX: open() does not expand "~", and the original never created the
# target directory, so every write failed. Expand and create it up front.
save_dir = os.path.expanduser("~/微信文章数据")
os.makedirs(save_dir, exist_ok=True)

for i in range(0, 100):
    url = ("http://weixin.sogou.com/weixin?oq=&query=" + key
           + "&type=2&page=" + str(i + 1) + "&ie=utf8")
    if i == 0:
        # First page: always acquire and validate a fresh proxy IP.
        # (Equivalent to the original `i % 3 == 0 and i == 0`.)
        ip_pools, data = agent_ip(0, url, api)
    elif i % 3 == 0:
        # Rotate to a new proxy every third page, after a cooldown.
        print("正在延时中...")
        time.sleep(15)
        print("延时完成,正在调取IP")
        ip_pools, data = agent_ip(0, url, api)
        print("IP调取完成")
    else:
        # Reuse the proxy returned by the previous call.
        ip_pools, data = agent_ip(ip_pools, url, api)
    print(url)
    print(len(data))
    # NOTE(review): '模式识别' is a placeholder — replace with the real
    # regex that extracts article links from the result page.
    pat1 = '模式识别'
    rst1 = re.compile(pat1, re.S).findall(data)
    if len(rst1) == 0:
        print("当前页爬取失败")
        continue
    for j in range(0, len(rst1)):
        # BUG FIX: clean the matched article link, not the search-page
        # `url` — the original refetched the listing page for every match.
        current_url = rst1[j].replace('amp;', "")
        print(current_url)
        ip_pools, current_data = agent_ip(ip_pools, current_url, api)
        print("文章爬取成功,长度为:" + str(len(current_data)))
        # Context manager guarantees the file is closed even if the
        # write raises.
        with open(os.path.join(save_dir, str(i) + str(j) + ".html"),
                  "w", encoding="utf-8") as fh:
            fh.write(current_data)
感谢支持
更多内容,请移步《超级个体》。