zoukankan html css js c++ java

Python爬虫：urllib库的基本使用

请求网址获取网页代码

import urllib.request
url = "http://www.baidu.com"
response = urllib.request.urlopen(url)
data = response.read()
# print(data)
# 将文件获取的内容转换成字符串
str_data = data.decode("utf-8")
print(str_data)
# 将结果保存到文件中
with open("baidu.html", "w", encoding="utf-8") as f:
    f.write(str_data)

get带参数请求

import urllib.request

def get_method_params(wd):
    url = "http://www.baidu.com/s?wd="
    # 拼接字符串
    final_url = url + wd
    # 发送网络请求
    response = urllib.request.urlopen(final_url)
    print(response.read().decode("utf-8"))

get_method_params("美女")

直接这么写会报错：

原因是，网址里面包含了汉字，但是ascii码是没有汉字的，需要转义一下：

import urllib.request
import urllib.parse
import string

def get_method_params(wd):
    url = "http://www.baidu.com/s?wd="
    # 拼接字符串
    final_url = url + wd
    # 将包含汉字的网址进行转义
    encode_new_url = urllib.parse.quote(final_url, safe=string.printable)
    # 发送网络请求
    response = urllib.request.urlopen(encode_new_url)
    print(response.read().decode("utf-8"))

get_method_params("美女")

使用字典拼接参数

import urllib.request
import urllib.parse
import string

def get_params():
    url = "http://www.baidu.com/s?w"

    params = {
        "wd": "美女",
        "key": "zhang",
        "value": "san"
    }

    str_params = urllib.parse.urlencode(params)
    print(str_params)

    final_url = url + str_params
    # 将带有中文的url转义
    encode_url = urllib.parse.quote(final_url, safe=string.printable)

    response = urllib.request.urlopen(encode_url)
    data = response.read().decode("utf-8")
    print(data)

get_params()

设置请求的超时时间

urlopen的参数，timeout：可以设置请求的超时时间

post请求

urllib.request.urlopen(url, data="服务器接收的数据")

User-Agent

可以伪装请求头的用户信息

常用的请求头整理：https://www.cnblogs.com/wbyixx/p/12231755.html

import urllib.request
import urllib.parse
import string
import random


def get_random_user_agent():
    import random
    user_agent_list = [......]
    random_user_agent = random.choice(user_agent_list)
    return random_user_agent


url = "http://www.baidu.com"
request = urllib.request.Request(url)
# 添加请求头信息
request.add_header("User-Agent", get_random_user_agent())
# 请求数据
response = urllib.request.urlopen(request)
# print(response.read().decode("utf-8"))
# 获取请求头信息，注意这里的agent小写
print(request.get_header("User-agent"))

handler处理器的使用

import urllib.request


def handler_openner():
    # urlopen为什么可以请求数据
    # 根据源码可以看出，是由于 openner，而openner又由handler而来

    url = "http://www.baidu.com"
    # 创建自己的处理器
    handler = urllib.request.HTTPHandler()
    # 创建自己的openner
    openner = urllib.request.build_opener(handler)
    # 用openner去请求
    response = openner.open(url)
    print(response.read().decode("utf-8"))


handler_openner()

添加ip代理

ip代理的分类

免费的：时效性差，错误率高
付费的：贵，也有失效不能用的

性质分类：

透明：对方知道我们的ip
匿名：对方不知道我们的真实ip，但是知道我们使用了代理
高匿：对方不知道我们的真实ip，也不知道我们使用了代理

使用代理ip去请求

创建 ProxyHandler

import urllib.request


def create_proxy_handler():
    url = "http://www.baidu.com"

    proxy = {
        # 免费的写法
        "http": "http://36.27.28.215:9999"
    }

    # 代理的处理器
    proxy_handler = urllib.request.ProxyHandler(proxy)

    # 创建自己的openner
    openner = urllib.request.build_opener(proxy_handler)
    # 拿着代理ip去发送请求
    response = openner.open(url)

    print(response.read().decode("utf-8"))


create_proxy_handler()

付费的代理写法：

{"http": "username:password@ip:port"}
使用密码管理器

password_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, proxy_uri, user, pwd)
handler_auth_proxy = urllib.request.ProxyBasicAuthHandler(password_manager)
openner_auth_proxy = urllib.request.build_opener(handler_auth_proxy)
response = openner_auth_proxy.open(url)

Cookie验证请求

手动粘贴cookie

import urllib.request

url = "需要cookie验证才能访问的链接"

headers = {
    'User-Agent': 'User-Agent,Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    "Cookie": 'xxx手动粘贴cookie到这里'
}

request = urllib.request.Request(url, headers=headers)

response = urllib.request.urlopen(request)

data = response.read()

with open("data.html", "wb") as f:
    f.write(data)

自动获取cookie

使用代码发送登录请求，获取有效的cookie
自动带着cookie去请求其他页面

import urllib.request
import urllib.parse
from http import cookiejar

headers = {
    'User-Agent': 'User-Agent,Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

login_url = "..."
target_url = "https://i-beta.cnblogs.com/posts"

login_form_data = {
    "username": "xxx",
    "password": "xxx",
    # ......其他参数
}

# 转码
login_form_data = urllib.parse(login_form_data).encode("utf-8")

# 发送请求，拿到response里的cookie

cookie_jar = cookiejar.CookieJar()
# 定义有添加cookie功能的处理器
cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
# 根据处理器生成openner
openner = urllib.request.build_opener(cookie_handler)

# 带着参数，发送post请求
login_request = urllib.request.Request(login_url, headers=headers, data=login_form_data)
# 如果登录成功，cookiejar会自动保存cookie
openner.open(login_request)

target_request = urllib.request.Request(target_url, headers=headers)
response = openner.open(target_request)
data = response.read()
print(data)

错误处理

常见的Error

HTTPError

UrlError

查看全文

相关阅读:
学习ASP.NET MVC3（5） Controller
关于测试
 [JAVA SE] Java反射机制
 Windows 8 的软件开发架构
 Servlet生命周期与工作原理
 展望未来，总结过去10年的程序员生涯，给程序员小弟弟小妹妹们的一些总结性忠告（转载）
JAVA小游戏代码（剪刀石头布）
[JAVA SE] JSP中pageEncoding和charset区别，中文乱码解决方案
 我是工程师，不是编译器
 自己对三层架构理论的理解

原文地址：https://www.cnblogs.com/wbyixx/p/12231527.html