仓颉编程语言(Cangjie)是华为编译器与编程语言实验室仓颉语言团队推出的一款面向全场景应用开发的编程语言,基本上就是为了辅助HarmonyOS而出现的。
原创大约 1 分钟
我码字搬砖的地方
仓颉编程语言(Cangjie)是华为编译器与编程语言实验室仓颉语言团队推出的一款面向全场景应用开发的编程语言,基本上就是为了辅助HarmonyOS而出现的。
先准备好需要的组件包。
> pip install requests
> pip install prettytable
> pip install DrissionPage
依然使用DrissionPage来抓取。
'''
Scrape the comment feed of a Tencent Video episode with DrissionPage.

Reconstructed indentation: the published snippet was flattened to column 0,
so the loop structure below is a best-effort reconstruction — verify against
the original post before relying on the exact loop boundaries.
'''
from DrissionPage import ChromiumPage
import json
import base64

# Launch a controlled Chromium browser.
page = ChromiumPage()
# Capture responses whose URL matches the comment-feed RPC endpoint.
page.listen.start("trpc.universal_backend_service.page_server_rpc.PageServer/GetPageData?video_appid=1000005&vversion_name=1.0.0&")
page.get('https://v.qq.com/x/cover/75m13e64doz91ul/a0017zmel91.html')

i = 0
# Auto-scroll: each scroll triggers the site to load another batch of comments.
while True:
    print(f"=========== 开始滚动第{i + 1}次 ===========")
    page.scroll.to_bottom()
    i += 1
    page.wait(2)
    # Stop when scrolling no longer moves the page (no more content to load).
    if not page.scroll.to_bottom():
        break
    # Block until the listened RPC response for this batch arrives.
    resp = page.listen.wait().response
    jsonp = resp.body  # DrissionPage decodes JSON bodies into a dict
    data = jsonp['data']['module_list_datas']
    for item in data:
        complex_json = item['module_datas'][0]['item_data_lists']['item_datas'][0]
        # One comment record per item, nested as a JSON string.
        jsonp = json.loads(complex_json['complex_json'])
        # Username and comment text are base64-encoded UTF-8.
        username = base64.b64decode(jsonp['user']['base']['name']).decode('utf-8')
        comment = base64.b64decode(jsonp['content']['content']).decode('utf-8')
        print(f'{username} --------- {comment}')
'''
同时使用用户代理池和IP代理池
'''
from urllib import request
import time
# 调用动态IP接口
# ip_pools:动态IP池
# target_url:要爬取的目标网页地址
# api_url:动态IP接口地址
def agent_ip(ip_pools, target_url, api_url):
    """Fetch *target_url* through an HTTP proxy with a random User-Agent.

    Args:
        ip_pools: a proxy address string to use directly, or ``0`` to pull
            fresh proxies from *api_url* until one passes a liveness check.
        target_url: the page to download.
        api_url: dynamic-IP provider endpoint (only used when ``ip_pools == 0``).

    Returns:
        ``(proxy_ip, page_html)`` — the proxy actually installed and the
        decoded page body.  (Bug fix: the original returned the incoming
        ``ip_pools`` argument, which is ``0`` in the auto-fetch branch.)
    """
    import random
    uapools = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    ]

    def api(api_url):
        # Ask the provider for a fresh proxy address.
        print("调用了接口")
        request.urlcleanup()
        result = request.urlopen(api_url).read().decode("utf-8", "ignore")
        return result

    def ip(thisip, uapools):
        # Install a global opener routing HTTP through *thisip*
        # with a randomly chosen User-Agent.
        thisua = random.choice(uapools)
        print(thisua)
        headers = ("User-Agent", thisua)
        print("当前用的IP是:" + thisip)
        proxy = request.ProxyHandler({"http": thisip})
        opener = request.build_opener(proxy, request.HTTPHandler)
        opener.addheaders = [headers]
        request.install_opener(opener)

    current_ip = ip_pools
    if ip_pools == 0:
        # No proxy supplied: keep fetching until one passes the liveness probe.
        while True:
            current_ip = api(api_url)
            print("提取IP完成")
            ip(current_ip, uapools)
            print("正在验证IP有效性")
            # Heuristic liveness check: a full Baidu homepage is > 5000 chars.
            data1 = request.urlopen("http://www.baidu.com").read().decode("utf-8", "ignore")
            if len(data1) > 5000:
                print("当前IP有效")
                break
            else:
                print("当前IP无效,正在延时")
                time.sleep(60)
    else:
        ip(current_ip, uapools)
    data = request.urlopen(target_url).read().decode("utf-8", "ignore")
    return current_ip, data
from urllib import request
import re
import random
# 用户代理池
# Pool of User-Agent strings to rotate through.
pools = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
]


def agent():
    """Install a global urllib opener carrying a randomly chosen User-Agent.

    Returns:
        The opener that was installed, so callers can also use it directly
        (the original implicitly returned None; returning the opener is
        backward-compatible).
    """
    chosen = random.choice(pools)
    opener = request.build_opener()
    opener.addheaders = [("User-Agent", chosen)]
    # Affects every subsequent request.urlopen() call.
    request.install_opener(opener)
    return opener
# Crawl Baidu Baike entries /item/1 .. /item/10 and print each lemma title.
for i in range(0, 10):
    try:
        # Rotate the User-Agent before every request.
        agent()
        url = "http://baike.baidu.com/item/" + str(i + 1)
        data = request.urlopen(url).read().decode("utf-8", "ignore")
        # NOTE(review): the class name looks build-generated and may change
        # between site deployments — confirm the pattern still matches.
        pat = '<h1 class="lemmaTitle_pFwpd J-lemma-title">(.*?)</h1>'
        for title in re.compile(pat, re.S).findall(data):
            print(title)
        print("------------------------")
    except Exception as e:
        # Best-effort crawl: report the failure and continue with the next entry.
        print(e)