# 快速上手
from urllib import request
import re
# Quick start: three ways to fetch a web page with urllib.request.
url = "http://www.baidu.com"

# Method 1: fetch the page into memory.
# The context manager closes the underlying connection as soon as the
# body has been read, instead of leaving it open until garbage collection.
with request.urlopen(url) as result:
    # Print the HTTP status code of the response.
    print(result.status)
    data1 = result.read().decode("utf-8", "ignore")
print(data1)

# Method 2: fetch the page into memory, going through an explicit
# Request object rather than passing the bare URL string.
with request.urlopen(request.Request(url)) as response:
    data2 = response.read().decode("utf-8", "ignore")
print(data2)

# Method 3: save the page straight to disk.
request.urlretrieve(url, filename="C:/baidu.html")
# Crawl a range of imooc course pages and print each course's title and price.
COURSE_URL_TEMPLATE = "https://coding.imooc.com/class/{}.html"

# Compiled once at import time instead of on every loop iteration.
# re.S lets "." match newlines so a tag spanning several lines still matches.
TITLE_PATTERN = re.compile(r'<h1>(.*?)</h1>', re.S)
PRICE_PATTERN = re.compile(r'<div class="cur-price">¥(.*?)</div>', re.S)


def parse_course(html):
    """Extract the title and price from the HTML of one course page.

    Args:
        html: Decoded page source.

    Returns:
        ``None`` when the page contains no ``<h1>`` element; otherwise a
        ``(title, price)`` tuple built from the first match of each
        pattern. ``price`` is ``None`` when the page has no
        ``cur-price`` element.
    """
    title_match = TITLE_PATTERN.search(html)
    if title_match is None:
        return None
    price_match = PRICE_PATTERN.search(html)
    price = price_match.group(1) if price_match is not None else None
    return title_match.group(1), price


def crawl_courses(start=100, stop=200, timeout=10):
    """Fetch course pages ``start`` .. ``stop - 1`` and print title and price.

    Pages without an ``<h1>`` title are skipped silently. Pages that cannot
    be fetched are reported on stderr and skipped, so one failing request
    does not abort the rest of the crawl.

    Args:
        start: First course id to fetch (inclusive).
        stop: Course id at which to stop (exclusive).
        timeout: Per-request timeout in seconds passed to ``urlopen``.
    """
    import sys  # local import: only needed for the error report below

    for course_id in range(start, stop):
        page_url = COURSE_URL_TEMPLATE.format(course_id)
        try:
            # The context manager closes the connection after reading.
            with request.urlopen(page_url, timeout=timeout) as response:
                html = response.read().decode("utf-8", "ignore")
        except OSError as exc:
            # URLError/HTTPError and socket timeouts are all OSError subclasses.
            print("skipping {}: {}".format(page_url, exc), file=sys.stderr)
            continue

        course = parse_course(html)
        if course is None:
            continue
        title, price = course
        print(title)
        # A page without a price element is printed as "免费" (free).
        print(price if price is not None else "免费")
        print("------------------------")


if __name__ == "__main__":
    crawl_courses()
大约 11 分钟