Web Crawling: Proxies
POST requests
- Handling form data: the body must be url-encoded and then converted to bytes: `form_data = urllib.parse.urlencode(form_data).encode()`
- When capturing traffic in Fiddler, a notebook icon with an arrow on it marks a POST request.
- Example:
```python
import urllib.request
import urllib.parse

# The POST endpoint, taken from a packet capture of the translation page
post_url = 'https://fanyi.baidu.com/sug'
word = input('Enter the English word to look up: ')
# word = 'honey'
# Build the POST form data
form_data = {
    'kw': word,
}
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
# Build the request object
request = urllib.request.Request(url=post_url, headers=headers)
# Url-encode the form data, then encode the string to bytes
form_data = urllib.parse.urlencode(form_data).encode()
# Send the request; passing data makes it a POST
response = urllib.request.urlopen(request, data=form_data)
print(response.read().decode())
```
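As a quick check of the encoding step, this is what `urlencode` followed by `.encode()` produces for a sample form dict; `urlopen` requires the request body as `bytes`, which is why the final `.encode()` is needed:

```python
import urllib.parse

form_data = {'kw': 'honey'}
query = urllib.parse.urlencode(form_data)  # 'kw=honey' (a str)
body = query.encode()                      # b'kw=honey' (bytes, as urlopen requires)
print(query, body)
```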
A second example, posting to the `v2transapi` endpoint with `sign`, `token`, and `Cookie` values copied from a captured browser request:

```python
import urllib.request
import urllib.parse

post_url = 'https://fanyi.baidu.com/v2transapi'
word = 'baby'
# sign and token are session-specific values copied from a captured browser
# request; they expire, so re-capture them when the request stops working
form_data = {
    'from': 'en',
    'to': 'zh',
    'query': word,
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '814534.560887',
    'token': '62b9d9d88a0b0093eb21e74ca353f235',
}
headers = {
    'Host': 'fanyi.baidu.com',
    # 'Connection': 'keep-alive',
    # 'Content-Length': '121',
    # 'Accept': ' */*',
    'Origin': 'https://fanyi.baidu.com',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://fanyi.baidu.com/translate?aldtype=16047&query=&keyfrom=baidu&smartresult=dict&lang=auto2zh&decrypt_suc=1',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cookie': 'BIDUPSID=48F178A24F2B41ED9ADBF815F62012FC; PSTM=1576704115; BAIDUID=48F178A24F2B41ED3DCD926B7B7AC1AC:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1425_21112_30210_30283; BDUSS=FTWmVPemEzVXF0b2lEZEdKWDNQVWVUa21TTWRqQ3kzTktEfmpuZklvSmQtaUplSVFBQUFBJCQAAAAAAAAAAAEAAACBLk3ry9XS-lRvb2xzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAF1t-11dbftdU1; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_PREFER_SWITCH=1; SOUND_SPD_SWITCH=1; HISTORY_SWITCH=1; APPGUIDE_8_2_2=1; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1576764318,1576837331,1576837728; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1576837747; __yjsv5_shitong=1.0_7_7cef52fd882c7f23ca09b06921f7066d5914_300_1576837747568_183.128.93.78_abeb0413; yjs_js_security_passport=01f67439da35b1641e3f5b4abfeaf0b74b1731ab_1576837748_js',
}
request = urllib.request.Request(url=post_url, headers=headers)
form_data = urllib.parse.urlencode(form_data).encode()
response = urllib.request.urlopen(request, form_data)
print(response.read().decode())
```
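A note on the headers left commented out: `Content-Length` is filled in automatically by `urlopen`, and omitting `Accept-Encoding: gzip, deflate, br` means the server sends an uncompressed body, so `response.read().decode()` works without a decompression step.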
AJAX requests
- GET: many pages load their data from AJAX endpoints that return JSON instead of HTML; the request is built exactly like a normal GET against the endpoint captured in the browser's network tab (a sketch follows below).
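A minimal sketch of an AJAX GET, assuming the Douban movie-chart endpoint and its `start`/`limit` paging parameters as an illustrative target (not taken from the original notes):

```python
import urllib.request
import urllib.parse
import json

# Illustrative AJAX endpoint (assumed example): it returns JSON, not HTML
base_url = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&'
params = {
    'start': 0,   # offset of the first record
    'limit': 20,  # number of records per request
}
url = base_url + urllib.parse.urlencode(params)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
# Parse the JSON payload instead of saving raw HTML
movies = json.loads(response.read().decode())
print(movies)
```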
- POST: an AJAX POST works like the form POST above; the parameters captured from the network tab go into the url-encoded request body (a sketch follows below).
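A minimal sketch of an AJAX POST, assuming the KFC store-locator endpoint and its form fields as an illustrative target (again an assumption, not from the original notes):

```python
import urllib.request
import urllib.parse

# Illustrative AJAX POST endpoint (assumed example)
post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
form_data = {
    'cname': '',
    'pid': '',
    'keyword': '北京',  # city to search for
    'pageIndex': '1',
    'pageSize': '10',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
request = urllib.request.Request(url=post_url, headers=headers)
body = urllib.parse.urlencode(form_data).encode()
response = urllib.request.urlopen(request, data=body)
print(response.read().decode())
```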
Complex GET
- A paginated crawl; a sample page URL looks like:
`url = 'http://tieba.baidu.com/f?ie=utf-8&kw=python&red_tag=a0457584153'`
```python
import urllib.request
import urllib.parse
import os

url = 'http://tieba.baidu.com/f?ie=utf-8&'
# Paging rule: page n starts at pn == (n - 1) * 50
#   page 1 -> pn == 0
#   page 2 -> pn == 50
#   page 3 -> pn == 100
#   page 4 -> pn == 150
# Task: read a forum name, a start page, and an end page; create a folder
# named after the forum in the current directory and save each page's HTML
# in it as <forum>_<page>.html
ba_name = input('Enter the name of the forum to crawl: ')
start_page = int(input('Enter the start page: '))
end_page = int(input('Enter the end page: '))
# Create the folder if it does not exist yet
if not os.path.exists(ba_name):
    os.mkdir(ba_name)
# Crawl each page in turn
for page in range(start_page, end_page + 1):
    # Build the query string for the current page
    data = {
        'kw': ba_name,
        'pn': (page - 1) * 50
    }
    data = urllib.parse.urlencode(data)
    url_t = url + data
    # print(url_t)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    request = urllib.request.Request(url=url_t, headers=headers)
    print('Downloading page %s ...' % page)
    response = urllib.request.urlopen(request)
    # Build the file name and path for this page
    filename = ba_name + '_' + str(page) + '.html'
    filepath = ba_name + '/' + filename
    # Write the raw HTML bytes
    with open(filepath, 'wb') as fp:
        fp.write(response.read())
    print('Finished page %s.' % page)
```
URLError and HTTPError
- Both classes live in `urllib.error`; like NameError, TypeError, or FileNotFoundError, they are exceptions.
- Handle them with the usual `try-except` structure.
- `URLError` is raised when:
  - there is no network connection
  - the connection to the server fails
  - the specified server cannot be found
- `HTTPError` is a subclass of `URLError`. Note: when catching both, the `except HTTPError` clause must come before `except URLError`, otherwise the parent class catches everything first.
- Example:
```python
import urllib.request
import urllib.parse
import urllib.error

# url = 'http://www.maodan.com/'  # an unreachable host raises URLError
url = 'https://blog.csdn.net/thinkcortex/article/details/5206282'  # a bad article URL raises HTTPError (404)

try:
    response = urllib.request.urlopen(url)
    print(response)
except urllib.error.HTTPError as e:
    # must come before URLError, since HTTPError is its subclass
    print(e)
    print(e.code)
except urllib.error.URLError as e:
    print(e)
```
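Besides `e.code`, an `HTTPError` also exposes `e.reason` (the status message) and `e.headers` (the response headers), which are useful when logging failed requests.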
Handlers and custom openers
- **urlopen():** takes a URL, sends the request, and returns the response; it cannot customize the request headers.
- **Request():** builds a request object with custom headers.
- **Advanced features:** using proxies and cookies requires handlers and openers.
- **Basic usage:** see the code below.
```python
import urllib.request
import urllib.parse

url = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
# Create a handler
handler = urllib.request.HTTPHandler()
# Build an opener from the handler; from here on, send requests with
# opener.open() instead of urllib.request.urlopen()
opener = urllib.request.build_opener(handler)
# Build the request object
request = urllib.request.Request(url, headers=headers)
# Send the request
response = opener.open(request)
print(response.read().decode())
```
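With a plain `HTTPHandler` this opener behaves just like `urlopen()`; the point of the pattern is that the same `build_opener()` call accepts handlers with extra behavior, such as the `ProxyHandler` used below or `HTTPCookieProcessor` for cookie handling.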
Proxies
- What is a proxy?
  - Real-life analogues: resellers, game-boosting services, exam stand-ins, designated drivers, purchasing agents
  - Proxies in programs:
    - Forward proxy: fetches data on behalf of the client
    - Reverse proxy: serves data on behalf of the server
- Proxy types
  - Transparent proxy: the target server knows you are using a proxy and also knows your real IP
  - Anonymous proxy: the target server knows you are using a proxy but does not know your real IP
  - Elite (high-anonymity) proxy: the target server knows neither that you are using a proxy nor your real IP
- Configuration
  - Browser (Chrome): three-dot menu in the top right ==> Settings ==> Advanced ==> open proxy settings ==> LAN settings ==> Use a proxy server for your LAN ==> enter the IP and port
  - Windows 10: Start ==> Settings ==> search for "proxy" ==> Proxy server ==> enter the IP address and port
  - In code: build the opener once, then send every subsequent request with `opener.open()`:

```python
handler = urllib.request.ProxyHandler({'http': '114.215.95.188:3128'})
opener = urllib.request.build_opener(handler)
```
- Example:
```python
import urllib.request
import urllib.parse

# Another proxy tried while testing: 113.79.75.104:9797
# Create the proxy handler (free proxies like these go stale quickly)
handler = urllib.request.ProxyHandler({'http': '114.215.95.188:3128'})
# Create the opener
opener = urllib.request.build_opener(handler)
url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
request = urllib.request.Request(url, headers=headers)
response = opener.open(request)
# Save the result page; it should report the proxy's IP, not yours
with open('ip.html', 'wb') as fp:
    fp.write(response.read())
```
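Note that the `{'http': ...}` mapping only routes plain-http URLs through the proxy; to proxy `https://` requests as well, add an `'https'` key to the `ProxyHandler` dict.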
Cookies
- What is a cookie?
  - HTTP is a stateless protocol.
  - Sites use cookies to record the user's identity across requests, which is what makes logins work.
- Simulating a login: capture the cookie of a logged-in session with a packet sniffer, then send it along with the request.
- Example:
```python
import urllib.request
import urllib.parse

# A profile page that is only visible when logged in
url = 'http://www.renren.com/973133005/profile'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    # Cookie copied from a logged-in browser session
    'Cookie': 'anonymid=k4f217lxji8opq; depovince=GW; _r01_=1; JSESSIONID=abcrsxBwzbnAofhMpNM8w; ick_login=8d72474b-dcca-4b0c-9f51-a9c8d6f69a2f; loginfrom=null; t=29024a60d070130725138781a679644a5; societyguester=29024a60d070130725138781a679644a5; id=973133005; xnsid=d943025d; jebecookies=2b2827de-10af-404c-be81-5fa42e9c8c5e|||||; ver=7.0; wp_fold=0'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
with open('renren.html', 'wb') as fp:
    fp.write(response.read())
```

Logging in with cookies
- Instead of copying the cookie header by hand, an opener built with `HTTPCookieProcessor` can capture the session cookie automatically when you post the login form, then reuse it for pages that require login (see the sketch below).
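A minimal sketch of that flow, assuming a hypothetical login endpoint and `email`/`password` field names; the real URL and field names must be taken from a packet capture of the actual login request:

```python
import urllib.request
import urllib.parse
import http.cookiejar

# Cookie jar: stores cookies returned by the server and sends them back
# automatically on later requests made through the same opener
cookie_jar = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(handler)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}

# Hypothetical login endpoint and form fields; replace with captured values
login_url = 'http://www.renren.com/ajaxLogin/login'  # placeholder URL
form_data = urllib.parse.urlencode({
    'email': 'your_account',      # placeholder field name and value
    'password': 'your_password',  # placeholder field name and value
}).encode()
request = urllib.request.Request(login_url, headers=headers)
opener.open(request, data=form_data)  # the jar now holds the session cookie

# The same opener can now fetch pages that require login
profile_url = 'http://www.renren.com/973133005/profile'
request = urllib.request.Request(profile_url, headers=headers)
response = opener.open(request)
with open('renren_login.html', 'wb') as fp:
    fp.write(response.read())
```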