  • Some Python crawler techniques

    1. First, a site with no anti-scraping measures

    """这个不设置反爬措施,练手最好用"""
    import requests
    from bs4 import BeautifulSoup
    
    
    response = requests.get("https://www.autohome.com.cn/news/")
    # 转换编码
    response.encoding = 'gbk'
    # 封装html到soup
    soup = BeautifulSoup(response.text, 'html.parser')
    # 找到匹配的第一个div
    div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    # 找到此div下所有li
    li_list = div.find_all(name='li')
    # 循环获取数据
    for li in li_list:
        title = li.find(name='h3')
        if not title:
            continue
        p = li.find(name='p')
        a = li.find(name='a')
        print(title.text)
        print(a.attrs.get('href'))
        print(p.text)
        img = li.find(name='img')
        src = img.get('src')
        src = "https:" + src
        print(type(src))
        print(type(title.text))
    
        # 再次发起请求,下载图片到本地
        file_name = src.rsplit('/', maxsplit=1)[1]
        ret = requests.get(src)
        with open(file_name, 'wb') as f:
            f.write(ret.content)
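
    One nuance above: the "https:" + src concatenation is needed because the page serves protocol-relative image URLs ("//host/path..."). A slightly more defensive version of the download step, as a sketch (urljoin, the timeout, and raise_for_status are my additions, not part of the original):

    import os
    from urllib.parse import urljoin

    import requests


    def download_image(src, timeout=10):
        # urljoin resolves protocol-relative "//host/path" srcs against https,
        # and also copes with plain relative paths
        url = urljoin("https://www.autohome.com.cn/", src)
        ret = requests.get(url, timeout=timeout)
        ret.raise_for_status()    # fail loudly instead of saving an error page
        file_name = os.path.basename(url.rsplit('?', 1)[0])    # drop any query string
        with open(file_name, 'wb') as f:
            f.write(ret.content)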

    2. Now one that extracts data

    """进阶爬虫1"""
    import requests
    from bs4 import BeautifulSoup
    
    
    res = requests.get(
        url="http://jandan.net/",
    )
    soup = BeautifulSoup(res.text, "html.parser")
    div = soup.find(name="div", attrs={"id": "content"})
    div_list = div.find_all(name="div", attrs={"class": "post f list-post"})
    for div in div_list:
        print(div.text.strip())    # 获取所有文本
        # img = div.find(name="img")
        # src = img.get("src")
        # if not src:
        #     continue
        # src = "https:" + src
        # print(src)    获取图片
        # h = div.find(name="h2")
        # a = h.find(name="a")
        # print(a.text)    获取标题

    3. One with a bit of a challenge

    """爬虫进阶2"""
    import requests
    # 1. 查看首页
    r1 = requests.get(
        url='https://dig.chouti.com/',
        headers={
            'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
    )
    
    # 2. 提交用户名和密码
    r2 = requests.post(
        url='https://dig.chouti.com/login',
        headers={
            'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        },
        data={
            'phone':'86你的账号',
            'password':'你的密码',
            'oneMonth':1
        },
        cookies=r1.cookies.get_dict()
    )
    
    
    # 3. 点赞
    r3 = requests.post(
        url='https://dig.chouti.com/link/vote?linksId=20435396',
        headers={
            'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        },
        cookies=r1.cookies.get_dict()
    )
    print(r3.text)
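
    The cookie handling is the whole trick here: the site ties authorization to the cookies issued on the very first GET, which is why r1.cookies is sent with both the login and the vote. A requests.Session reproduces that bookkeeping automatically by carrying cookies across requests; a minimal sketch, assuming the same endpoints as above:

    import requests

    session = requests.Session()
    session.headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

    session.get('https://dig.chouti.com/')    # the first GET collects the cookies
    session.post(
        'https://dig.chouti.com/login',
        data={'phone': '86your-phone-number', 'password': 'your-password', 'oneMonth': 1},
    )
    r = session.post('https://dig.chouti.com/link/vote?linksId=20435396')
    print(r.text)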

    4. One that's harder still

    """进阶爬取3"""
    import requests
    import re
    from bs4 import BeautifulSoup
    
    # 先伪装login请求
    res = requests.get(
        url="https://passport.lagou.com/login/login.html",
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360"
        }
    )
    # print(res.text)   原话(动态token,防御伪造请求,重复提交)(小坑)
    # 笑一会儿
    # 获取token(正则匹配)
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", res.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", res.text, re.S)[0]
    
    ret = requests.post(
        url="https://passport.lagou.com/login/login.json",      # 登录网址发送前发个错的获取登录url
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360",
            "X-Anit-Forge-Token": X_Anti_Forge_Token,
            "X_Anti_Forge_Code": X_Anti_Forge_Code,
            "Referer": "https://passport.lagou.com/login/login.html",     # 上一次提交地址(小坑)
        },
        data={           # 发送post数据
            "isValidate": True,
            "username": 你的账号,
            "password": "你的密码",
            "request_form_verifyCode": "",
            "submit": "",
            "challenge": "c87407cd89add055d8f1b54ad579cec8",
        },
        cookies=res.cookies.get_dict(),     # 带着登录页面的cookies获取权限(小坑)
    )
    
    r1 = requests.get(
        url="https://www.lagou.com/zhaopin/Python/?labelWords=label",
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360",
            "Referer": "https://www.lagou.com/",    # 上一次的登录网址(可以re匹配一下动态获取)
        },
        cookies=ret.cookies.get_dict(),
    )
    
    soup = BeautifulSoup(r1.text, "html.parser")
    div = soup.find(name="div", attrs={"id": "s_position_list"})
    li_list = div.find_all(name="li")
    for li in li_list:
        title = li.find(name="h3")
        if not title:
            continue
        money = li.find(name="span")
        div = li.find(name="div", attrs={"class": "li_b_l"})
        a = li.find(name="a")
        print(title.text)
        print(money.text)
        print(div.text)
        print(a.text)
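
    Both tokens come out of the same regex pattern, so the extraction step generalizes into a small helper; a sketch (the function name is mine, the regexes and the cookie handoff are exactly the ones used above):

    import re
    import requests


    def fetch_forge_pair(url, headers=None):
        """Return the anti-forgery (token, code, cookies) scraped from a login page's inline JS."""
        page = requests.get(url, headers=headers)
        token = re.findall("X_Anti_Forge_Token = '(.*?)'", page.text, re.S)[0]
        code = re.findall("X_Anti_Forge_Code = '(.*?)'", page.text, re.S)[0]
        return token, code, page.cookies.get_dict()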

    5. One for GitHub

    """进阶爬取4"""
    import requests
    from bs4 import BeautifulSoup
    
    
    r1 = requests.get(
        url="https://github.com/session",     # 这点注意url,登录是login获取cookies是session(小坑)
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
    )
    soup = BeautifulSoup(r1.text, "html.parser")
    inp = soup.find(name="input", attrs={"name": "authenticity_token"})
    cookies = r1.cookies.get_dict()
    token = inp.get("value")
    # 登录
    r2 = requests.post(
        url="https://github.com/login",
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        data={
            "commit": "Sign in",
            "utf8": "",
            "authenticity_token": token,
            "login": "你的账号",
            "password": "你的密码",
        },
        cookies=cookies
    )
    # 后续要啥随你
    print(r2.text)
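
    To confirm the sign-in actually took, request a page that requires authentication and check what comes back; a sketch (the settings URL is my choice for illustration):

    r3 = requests.get(
        url="https://github.com/settings/profile",
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        cookies=r2.cookies.get_dict(),
        allow_redirects=False,    # an anonymous session gets a 302 back to the login page
    )
    print(r3.status_code)         # 200 means the cookies are authenticated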

    6. Installing Scrapy, the big-league framework of the scraping world

    Windows install steps:
    pip3 install wheel
    pip3 install pywin32
    Download a Twisted wheel from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    pip install Twisted-18.4.0-cp36-cp36m-win_amd64.whl
    pip3 install scrapy
    OK!!!
    Linux -- couldn't be simpler:
    pip3 install scrapy
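
    With Scrapy installed, the first section's Autohome scrape fits in a few lines. A minimal sketch (the selectors are adapted from section 1; save it as news_spider.py):

    import scrapy


    class NewsSpider(scrapy.Spider):
        name = "news"
        start_urls = ["https://www.autohome.com.cn/news/"]

        def parse(self, response):
            # css() plays the role of the BeautifulSoup lookups above
            for li in response.css("#auto-channel-lazyload-article li"):
                title = li.css("h3::text").get()
                if title:
                    yield {"title": title.strip()}

    Run it with: scrapy runspider news_spider.py -o titles.json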