原创大约 6 分钟
先准备好需要的组件包。
> pip install requests
> pip install prettytable
> pip install DrissionPage
原创大约 2 分钟
依然使用DrissionPage来抓取。
'''
基于DrissionPage实现某鹅视频评论数据的爬取
'''
from DrissionPage import ChromiumPage
import json
import base64
# Launch a controlled Chromium browser session.
page = ChromiumPage()
# Start listening for the backend API that delivers comment data,
# so we can capture its responses as the page loads them.
page.listen.start("trpc.universal_backend_service.page_server_rpc.PageServer/GetPageData?video_appid=1000005&vversion_name=1.0.0&")
page.get('https://v.qq.com/x/cover/75m13e64doz91ul/a0017zmel91.html')
i = 0
# Keep scrolling to the bottom so the page lazy-loads more comments.
while True:
    print(f"=========== 开始滚动第{i + 1}次 ===========")
    page.scroll.to_bottom()
    i += 1
    page.wait(2)
    # NOTE(review): to_bottom() is invoked a second time and its return value
    # is used as the stop condition — confirm it actually reports whether the
    # page could still scroll; if it returns None this exits after one pass.
    if not page.scroll.to_bottom():
        break
    # Block until the listened request completes and grab its response.
    resp = page.listen.wait().response
    # DrissionPage decodes the JSON body into a dict for us.
    jsonp = resp.body
    data = jsonp['data']['module_list_datas']
    for item in data:
        complex_json = item['module_datas'][0]['item_data_lists']['item_datas'][0]
        # The interesting payload is itself a JSON string inside the response.
        jsonp = json.loads(complex_json['complex_json'])
        # Username and comment text arrive base64-encoded as UTF-8 bytes.
        username = base64.b64decode(jsonp['user']['base']['name']).decode('utf-8')
        comment = base64.b64decode(jsonp['content']['content']).decode('utf-8')
        print(f'{username} --------- {comment}')
原创大约 1 分钟
构建用户代理和IP池
'''
同时使用用户代理池和IP代理池
'''
from urllib import request
import time
# 调用动态IP接口
# ip_pools:动态IP池
# target_url:要爬取的目标网页地址
# api_url:动态IP接口地址
def agent_ip(ip_pools, target_url, api_url):
    """Fetch *target_url* through a rotating proxy IP and a random User-Agent.

    Args:
        ip_pools: a proxy address string, or 0 to request a fresh IP
            from the dynamic-IP provider at *api_url*.
        target_url: the page to download.
        api_url: endpoint of the dynamic-IP provider.

    Returns:
        (ip, data): the proxy address actually used and the decoded page body.
    """
    import random
    uapools = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    ]

    def api(api_url):
        # Ask the dynamic-IP provider for a fresh proxy address.
        print("调用了接口")
        request.urlcleanup()
        result = request.urlopen(api_url).read().decode("utf-8", "ignore")
        return result

    def ip(ip_pools, uapools):
        # Install a global urllib opener routed through the given proxy
        # and carrying a randomly chosen User-Agent.
        thisua = random.choice(uapools)
        print(thisua)
        headers = ("User-Agent", thisua)
        thisip = ip_pools
        print("当前用的IP是:" + thisip)
        proxy = request.ProxyHandler({"http": thisip})
        opener = request.build_opener(proxy, request.HTTPHandler)
        opener.addheaders = [headers]
        request.install_opener(opener)

    if ip_pools == 0:
        # No proxy supplied: keep pulling IPs until one passes validation.
        while True:
            # BUG FIX: rebind ip_pools so the IP actually used (and validated)
            # is the one returned to the caller — the original discarded it
            # and returned 0.
            ip_pools = api(api_url)
            print("提取IP完成")
            ip(ip_pools, uapools)
            print("正在验证IP有效性")
            data1 = request.urlopen("http://www.baidu.com").read().decode("utf-8", "ignore")
            # Heuristic: a real page body is large; a blocked/dead proxy
            # typically returns a short error page.
            if len(data1) > 5000:
                print("当前IP有效")
                break
            else:
                print("当前IP无效,正在延时")
                time.sleep(60)
    else:
        ip(ip_pools, uapools)
    data = request.urlopen(target_url).read().decode("utf-8", "ignore")
    return ip_pools, data
原创大约 2 分钟
用户代理池
from urllib import request
import re
import random
# 用户代理池
# Pool of User-Agent strings to rotate between requests.
pools = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
]


def agent():
    """Install a global urllib opener carrying a randomly picked User-Agent."""
    chosen = random.choice(pools)
    opener = request.build_opener()
    opener.addheaders = [("User-Agent", chosen)]
    request.install_opener(opener)
# print("当前使用的User-Agent:" + str(ua))
# Crawl ten Baidu Baike entries, switching the User-Agent before each request.
for attempt in range(10):
    try:
        agent()
        url = f"http://baike.baidu.com/item/{attempt + 1}"
        data = request.urlopen(url).read().decode("utf-8", "ignore")
        # Pull the lemma title out of the page markup.
        pat = '<h1 class="lemmaTitle_pFwpd J-lemma-title">(.*?)</h1>'
        rst = re.compile(pat, re.S).findall(data)
        for title in rst:
            print(title)
        print("------------------------")
    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next entry.
        print(e)
原创大约 6 分钟
本质上,点赞
、关注
、收藏
这三连
的逻辑都是一样的,所以就只以点赞
为例来说明。
点赞数据源
在src/main/ets/datasource/
目录中创建LikeDataSource.ets
文件,内容如下。
import { VideoInfo } from '../model/VideoInfo';
import { BaseDataSource } from './BaseDataSource';
/**
* 点赞信息数据源
*
*/
/**
 * Data source holding the videos the user has liked.
 */
export class LikeDataSource extends BaseDataSource<VideoInfo> {
  constructor(videoArray: Array<VideoInfo>) {
    super(videoArray)
  }

  /**
   * Whether the video with the given id has already been liked.
   */
  existLike(id: number): boolean {
    return this.getDataSource().some((video: VideoInfo) => video.videoId === id);
  }

  /**
   * Remove the first entry whose videoId matches the given id, if any.
   */
  removeById(id: number): void {
    const index: number = this.getDataSource().findIndex((video: VideoInfo) => video.videoId === id);
    if (index >= 0) {
      this.remove(index);
    }
  }
}
原创大约 4 分钟
页面整体布局
修改之前的src/main/ets/pages/Me.ets
页面内容,代码如下。
import { UserVideoDataSource } from "../datasource/UserVideoDataSource";
import { MeClassification } from "../view/MeClassification";
import { MeMenu } from "../view/MeMenu";
import { MeUserInfo } from "../view/MeUserInfo";
/**
 * The "Me" (profile) page: menu bar, user info card, and the
 * user's works grouped by classification.
 */
@Component
export struct Me {
  // Data source for the videos uploaded by the current user,
  // two-way bound to the parent component via @Link.
  @Link
  userVideoDataSource: UserVideoDataSource;

  build() {
    Column() {
      // Menu bar
      MeMenu()
        .padding({ right: 10, top: 10 })
      // Personal info display
      MeUserInfo()
        .padding({ left: 10, top: 60 })
      // Works classification; shares the uploaded-video data source
      MeClassification({
        userVideoDataSource: this.userVideoDataSource
      })
        .padding({ left: 10, right: 10, top: 10, bottom: 10 })
    }
    .width("100%")
    .height("100%")
    .backgroundImage($rawfile('wallpaper.png'))
    .backgroundImageSize({ width: '100%', height: '100%'})
  }
}
原创大约 6 分钟