Web Scraping: Proxies

POST requests

  • Handling form data: the dict must be URL-encoded and then converted to bytes
  • form_data = urllib.parse.urlencode(form_data).encode()
  • In a Fiddler capture, the icon of a notebook with an upward arrow marks a POST request
  • Example:
    import urllib.request
    import urllib.parse

    # The POST endpoint
    post_url = 'https://fanyi.baidu.com/sug'
    word = input('Enter the English word to look up: ')
    # word = 'honey'
    # Build the POST form data
    form_data = {
        'kw': word,
    }

    # Headers for the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # Build the request object
    request = urllib.request.Request(url=post_url, headers=headers)
    # URL-encode the form data and convert it to bytes
    form_data = urllib.parse.urlencode(form_data).encode()
    # Send the request; passing data makes it a POST
    response = urllib.request.urlopen(request, data=form_data)

    print(response.read().decode())
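
    The sug endpoint returns JSON, so the body can be decoded with the standard json module instead of printed raw. A minimal sketch replacing the final print above; the 'data' list and its 'k'/'v' fields are assumptions based on a typical captured response, so verify them against the actual payload:

    import json

    # 'data', 'k' and 'v' are assumed field names from a captured response
    result = json.loads(response.read().decode())
    for entry in result.get('data', []):
        print(entry.get('k'), '->', entry.get('v'))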
    import urllib.request
    import urllib.parse

    post_url = 'https://fanyi.baidu.com/v2transapi'
    word = 'baby'
    # Note: sign and token are generated by JavaScript on the page and expire;
    # the values below were captured from a browser session and must be refreshed.
    form_data = {
        'from': 'en',
        'to': 'zh',
        'query': word,
        'transtype': 'realtime',
        'simple_means_flag': '3',
        'sign': '814534.560887',
        'token': '62b9d9d88a0b0093eb21e74ca353f235',
    }

    headers = {
        'Host': 'fanyi.baidu.com',
        # 'Connection': 'keep-alive',
        # 'Content-Length': '121',
        # 'Accept': ' */*',
        'Origin': 'https://fanyi.baidu.com',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Referer': 'https://fanyi.baidu.com/translate?aldtype=16047&query=&keyfrom=baidu&smartresult=dict&lang=auto2zh&decrypt_suc=1',
        # 'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cookie': 'BIDUPSID=48F178A24F2B41ED9ADBF815F62012FC; PSTM=1576704115; BAIDUID=48F178A24F2B41ED3DCD926B7B7AC1AC:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1425_21112_30210_30283; BDUSS=FTWmVPemEzVXF0b2lEZEdKWDNQVWVUa21TTWRqQ3kzTktEfmpuZklvSmQtaUplSVFBQUFBJCQAAAAAAAAAAAEAAACBLk3ry9XS-lRvb2xzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAF1t-11dbftdU1; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_PREFER_SWITCH=1; SOUND_SPD_SWITCH=1; HISTORY_SWITCH=1; APPGUIDE_8_2_2=1; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1576764318,1576837331,1576837728; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1576837747; __yjsv5_shitong=1.0_7_7cef52fd882c7f23ca09b06921f7066d5914_300_1576837747568_183.128.93.78_abeb0413; yjs_js_security_passport=01f67439da35b1641e3f5b4abfeaf0b74b1731ab_1576837748_js',
    }

    request = urllib.request.Request(url=post_url, headers=headers)

    form_data = urllib.parse.urlencode(form_data).encode()

    response = urllib.request.urlopen(request, form_data)

    print(response.read().decode())


Ajax

GET

import urllib.request
import urllib.parse

url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'

page = int(input('Which page of data do you want: '))
# start=0 limit=20
# start=1
number = 20

# Build the GET parameters
data = {
    'start': (page - 1) * number,
    'limit': number,
}
# Convert the dict into a query string
query_string = urllib.parse.urlencode(data)
# Append it to the url
url += query_string

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)

response = urllib.request.urlopen(request)

print(response.read().decode())
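
For reference, urlencode simply joins the dict into a query string, percent-encoding any non-ASCII values along the way; a quick illustration:

import urllib.parse

params = {'start': 0, 'limit': 20, 'q': '科幻'}
print(urllib.parse.urlencode(params))
# -> start=0&limit=20&q=%E7%A7%91%E5%B9%BB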

POST

import urllib.request
import urllib.parse

post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'

city = input('Enter the city to query: ')
page = input('Enter the page number: ')
size = input('Enter how many results per page: ')
form_data = {
    'cname': city,
    'pid': '',
    'pageIndex': page,
    'pageSize': size,
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

request = urllib.request.Request(url=post_url, headers=headers)
form_data = urllib.parse.urlencode(form_data).encode()

response = urllib.request.urlopen(request, data=form_data)

print(response.read().decode())

Complex GET

  • url = 'http://tieba.baidu.com/f?ie=utf-8&kw=python&red_tag=a0457584153'

    import urllib.request
    import urllib.parse
    import os

    url = 'http://tieba.baidu.com/f?ie=utf-8&'

    # page 1: pn == 0
    # page 2: pn == 50
    # page 3: pn == 100
    # page 4: pn == 150
    # page n: pn == (n - 1) * 50
    # Task: prompt for a tieba (forum) name plus start and end page numbers,
    # create a folder named after the forum in the current directory, and save
    # each page's HTML into it as <forum>_<page>.html

    ba_name = input('Enter the tieba name to scrape: ')
    start_page = int(input('Enter the start page number: '))
    end_page = int(input('Enter the end page number: '))

    # Create the output folder
    if not os.path.exists(ba_name):
        os.mkdir(ba_name)

    # Loop over the pages and fetch each one
    for page in range(start_page, end_page + 1):
        # page is the current page number
        # Build the query string for this page
        data = {
            'kw': ba_name,
            'pn': (page - 1) * 50
        }
        data = urllib.parse.urlencode(data)
        # Build the final url
        url_t = url + data
        # print(url_t)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        request = urllib.request.Request(url=url_t, headers=headers)
        print('Downloading page %s ...' % page)
        response = urllib.request.urlopen(request)

        # Build the file name
        filename = ba_name + '_' + str(page) + '.html'
        # Join it into a file path
        filepath = ba_name + '/' + filename

        # Write the content
        with open(filepath, 'wb') as fp:
            fp.write(response.read())
        print('Finished page %s ...' % page)

URLError and HTTPError

  • Both classes live in urllib.error
  • Like NameError, TypeError, and FileNotFoundError, they are exception classes
  • Exceptions are handled with the try-except structure
  • URLError is raised when:
    • there is no network connection
    • the connection to the server fails
    • the specified server cannot be found
  • HTTPError is a subclass of URLError
  • Note: when catching both, the except clause for HTTPError must come before the one for URLError; placed the other way round, the URLError clause would also swallow HTTPError, since a subclass instance matches its parent class
  • Example:
    import urllib.request
    import urllib.parse
    import urllib.error

    # url = 'http://www.maodan.com/'
    url = 'https://blog.csdn.net/thinkcortex/article/details/5206282'

    try:
        response = urllib.request.urlopen(url)
        print(response)
    except urllib.error.HTTPError as e:
        print(e)
        print(e.code)
    except urllib.error.URLError as e:
        print(e)
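
    Both exception classes expose diagnostic attributes worth inspecting. A small sketch; the URL is a placeholder chosen to trigger a 404:

    import urllib.request
    import urllib.error

    try:
        urllib.request.urlopen('http://www.example.com/no-such-page')
    except urllib.error.HTTPError as e:
        # HTTPError doubles as a response object: it carries code, reason and headers
        print(e.code, e.reason)
        print(e.headers.get('Content-Type'))
    except urllib.error.URLError as e:
        # URLError only carries the underlying reason (e.g. a socket error)
        print(e.reason)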

Handlers and custom openers

  • urlopen() sends a request to a URL and fetches the response, but gives no way to customize the request headers
  • Request() builds a request object and lets you customize the headers
  • Handlers provide the advanced features: proxies, cookies
  • Basic usage: see the code below
    21
    import urllib.request
    import urllib.parse

    url = 'https://www.baidu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }

    # Create a handler
    handler = urllib.request.HTTPHandler()
    # Build an opener from the handler
    # The opener is the object that sends requests from now on:
    # call its methods directly instead of using urlopen
    opener = urllib.request.build_opener(handler)

    # Build the request object
    request = urllib.request.Request(url, headers=headers)

    # Send the request
    response = opener.open(request)

    print(response.read().decode())
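
    A handler can also be created with debugging switched on, which makes urllib print the raw request and response headers to stdout; a minimal sketch reusing the URL above:

    import urllib.request

    # debuglevel=1 dumps the outgoing request line/headers and the response status;
    # HTTPSHandler is used here because the URL scheme is https
    handler = urllib.request.HTTPSHandler(debuglevel=1)
    opener = urllib.request.build_opener(handler)
    response = opener.open('https://www.baidu.com/')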

Proxies

  • What is a proxy?

    • Everyday analogues: resellers, game-boosting services, exam stand-ins, designated drivers, purchasing agents
    • In programs:
      • Forward proxy: fetches data on behalf of the client
      • Reverse proxy: serves data on behalf of the server
  • Proxy types

    • Transparent proxy: the target server knows you are using a proxy and also knows your real IP
    • Anonymous proxy: the target server knows you are using a proxy but does not know your real IP
    • Elite (high-anonymity) proxy: the target server knows neither that you are using a proxy nor your real IP
  • Configuration

    • Browser (Chrome)
      Three-dot menu in the top-right ==> Settings ==> Advanced ==> open the proxy settings ==> LAN settings ==> enable the proxy for the LAN ==> enter the IP and port
    • Windows 10
      Start ==> Settings ==> search for "proxy" ==> Proxy server ==> enter the IP address and port
    • In code
      handler = urllib.request.ProxyHandler({'http': '114.215.95.188:3128'})
      opener = urllib.request.build_opener(handler)

      From then on, send every request with opener.open. (A variant that installs the opener globally is sketched after the example below.)

  • Example

    import urllib.request
    import urllib.parse

    # 113.79.75.104:9797
    # Create the proxy handler
    handler = urllib.request.ProxyHandler({'http': '114.215.95.188:3128'})
    # Create the opener
    opener = urllib.request.build_opener(handler)

    url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)

    response = opener.open(request)

    with open('ip.html', 'wb') as fp:
        fp.write(response.read())
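
    As promised above, the opener can also be installed globally so that plain urllib.request.urlopen() routes through the proxy too. A minimal sketch; the proxy address is the same placeholder as above and has likely gone stale:

    import urllib.request

    handler = urllib.request.ProxyHandler({'http': '114.215.95.188:3128'})
    opener = urllib.request.build_opener(handler)
    # install_opener makes this opener the default used by urlopen()
    urllib.request.install_opener(opener)

    # From here on, plain urlopen() goes through the proxy as well
    response = urllib.request.urlopen('http://www.baidu.com/')
    print(response.getcode())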
  • What is a cookie?
    • HTTP is a stateless protocol
    • Cookies solve the website-login problem: they record the user's identity between requests
  • Simulated login
    Capture the cookie with a packet sniffer, then send it along with the request
  • Example:
    import urllib.request
    import urllib.parse

    url = 'http://www.renren.com/973133005/profile'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'Cookie': 'anonymid=k4f217lxji8opq; depovince=GW; _r01_=1; JSESSIONID=abcrsxBwzbnAofhMpNM8w; ick_login=8d72474b-dcca-4b0c-9f51-a9c8d6f69a2f; loginfrom=null; t=29024a60d070130725138781a679644a5; societyguester=29024a60d070130725138781a679644a5; id=973133005; xnsid=d943025d; jebecookies=2b2827de-10af-404c-be81-5fa42e9c8c5e|||||; ver=7.0; wp_fold=0'
    }

    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)

    with open('renren.html', 'wb') as fp:
        fp.write(response.read())

Cookie-based login

import urllib.request
import urllib.parse
import http.cookiejar

# Truly simulate a browser: once the POST login request completes,
# the cookiejar holds the session cookie in memory
# Create a cookiejar object
cj = http.cookiejar.CookieJar()
# Build a handler from the cookiejar
handler = urllib.request.HTTPCookieProcessor(cj)
# Build an opener from the handler
opener = urllib.request.build_opener(handler)

post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201843935495'
formdata = {
    'email': '17701256561',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '3f410d8973fe8372d1500bfc877b475b1e3f0cec62ab8cf28f7024fdeb6e2a99',
    'rkey': '5359624622816e72df1281517296edda',
    'f': 'https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DYxPWlNcZg9d5hfNaZgVM3_qb_yK0LL92LzZ4FeynfMq%26wd%3D%26eqid%3Dfc4556f900040a75000000035b0df91b',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}

request = urllib.request.Request(url=post_url, headers=headers)

formdata = urllib.parse.urlencode(formdata).encode()
response = opener.open(request, data=formdata)

print(response.read().decode())
print('*' * 50)

# The opener now carries the session cookie, so the profile page is accessible
get_url = 'http://www.renren.com/960481378/profile'

request = urllib.request.Request(url=get_url, headers=headers)
response = opener.open(request)

print(response.read().decode())
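
To reuse the session across program runs, the cookies can also be persisted to disk. A minimal sketch using http.cookiejar.MozillaCookieJar; the file name cookies.txt is an arbitrary choice:

import urllib.request
import http.cookiejar

# MozillaCookieJar reads/writes cookies in the Netscape cookies.txt format
cj = http.cookiejar.MozillaCookieJar('cookies.txt')
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))

# ... log in with opener.open(...) as above, then persist the cookies:
cj.save(ignore_discard=True, ignore_expires=True)

# On a later run, load them back before sending requests:
cj.load(ignore_discard=True, ignore_expires=True)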