  • Some Python crawler techniques

    1. First, a site with no anti-scraping measures

    """这个不设置反爬措施,练手最好用"""
    import requests
    from bs4 import BeautifulSoup
    
    
    response = requests.get("https://www.autohome.com.cn/news/")
    # 转换编码
    response.encoding = 'gbk'
    # 封装html到soup
    soup = BeautifulSoup(response.text, 'html.parser')
    # 找到匹配的第一个div
    div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    # 找到此div下所有li
    li_list = div.find_all(name='li')
    # 循环获取数据
    for li in li_list:
        title = li.find(name='h3')
        if not title:
            continue
        p = li.find(name='p')
        a = li.find(name='a')
        print(title.text)
        print(a.attrs.get('href'))
        print(p.text)
        img = li.find(name='img')
        src = img.get('src')
        src = "https:" + src
        print(type(src))
        print(type(title.text))
    
        # 再次发起请求,下载图片到本地
        file_name = src.rsplit('/', maxsplit=1)[1]
        ret = requests.get(src)
        with open(file_name, 'wb') as f:
            f.write(ret.content)
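
    One nuance above: the "https:" + src concatenation is needed because the page serves protocol-relative image URLs ("//host/path..."). A slightly more defensive version of the download step, as a sketch (urljoin, the timeout, and raise_for_status are my additions, not part of the original):

    import os
    from urllib.parse import urljoin

    import requests


    def download_image(src, timeout=10):
        # urljoin resolves protocol-relative "//host/path" srcs against https,
        # and also copes with plain relative paths
        url = urljoin("https://www.autohome.com.cn/", src)
        ret = requests.get(url, timeout=timeout)
        ret.raise_for_status()    # fail loudly instead of saving an error page
        file_name = os.path.basename(url.rsplit('?', 1)[0])    # drop any query string
        with open(file_name, 'wb') as f:
            f.write(ret.content)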

    2. Now one that extracts data

    """进阶爬虫1"""
    import requests
    from bs4 import BeautifulSoup
    
    
    res = requests.get(
        url="http://jandan.net/",
    )
    soup = BeautifulSoup(res.text, "html.parser")
    div = soup.find(name="div", attrs={"id": "content"})
    div_list = div.find_all(name="div", attrs={"class": "post f list-post"})
    for div in div_list:
        print(div.text.strip())    # 获取所有文本
        # img = div.find(name="img")
        # src = img.get("src")
        # if not src:
        #     continue
        # src = "https:" + src
        # print(src)    获取图片
        # h = div.find(name="h2")
        # a = h.find(name="a")
        # print(a.text)    获取标题

    3. One with a bit of a challenge

    """爬虫进阶2"""
    import requests
    # 1. 查看首页
    r1 = requests.get(
        url='https://dig.chouti.com/',
        headers={
            'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
    )
    
    # 2. 提交用户名和密码
    r2 = requests.post(
        url='https://dig.chouti.com/login',
        headers={
            'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        },
        data={
            'phone':'86你的账号',
            'password':'你的密码',
            'oneMonth':1
        },
        cookies=r1.cookies.get_dict()
    )
    
    
    # 3. 点赞
    r3 = requests.post(
        url='https://dig.chouti.com/link/vote?linksId=20435396',
        headers={
            'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        },
        cookies=r1.cookies.get_dict()
    )
    print(r3.text)
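
    The cookie handling is the whole trick here: the site ties authorization to the cookies issued on the very first GET, which is why r1.cookies is sent with both the login and the vote. A requests.Session reproduces that bookkeeping automatically by carrying cookies across requests; a minimal sketch, assuming the same endpoints as above:

    import requests

    session = requests.Session()
    session.headers['user-agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

    session.get('https://dig.chouti.com/')    # the first GET collects the cookies
    session.post(
        'https://dig.chouti.com/login',
        data={'phone': '86your-phone-number', 'password': 'your-password', 'oneMonth': 1},
    )
    r = session.post('https://dig.chouti.com/link/vote?linksId=20435396')
    print(r.text)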

    4. One that's harder still

    """进阶爬取3"""
    import requests
    import re
    from bs4 import BeautifulSoup
    
    # 先伪装login请求
    res = requests.get(
        url="https://passport.lagou.com/login/login.html",
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360"
        }
    )
    # print(res.text)   原话(动态token,防御伪造请求,重复提交)(小坑)
    # 笑一会儿
    # 获取token(正则匹配)
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", res.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", res.text, re.S)[0]
    
    ret = requests.post(
        url="https://passport.lagou.com/login/login.json",      # 登录网址发送前发个错的获取登录url
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360",
            "X-Anit-Forge-Token": X_Anti_Forge_Token,
            "X_Anti_Forge_Code": X_Anti_Forge_Code,
            "Referer": "https://passport.lagou.com/login/login.html",     # 上一次提交地址(小坑)
        },
        data={           # 发送post数据
            "isValidate": True,
            "username": 你的账号,
            "password": "你的密码",
            "request_form_verifyCode": "",
            "submit": "",
            "challenge": "c87407cd89add055d8f1b54ad579cec8",
        },
        cookies=res.cookies.get_dict(),     # 带着登录页面的cookies获取权限(小坑)
    )
    
    r1 = requests.get(
        url="https://www.lagou.com/zhaopin/Python/?labelWords=label",
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360",
            "Referer": "https://www.lagou.com/",    # 上一次的登录网址(可以re匹配一下动态获取)
        },
        cookies=ret.cookies.get_dict(),
    )
    
    soup = BeautifulSoup(r1.text, "html.parser")
    div = soup.find(name="div", attrs={"id": "s_position_list"})
    li_list = div.find_all(name="li")
    for li in li_list:
        title = li.find(name="h3")
        if not title:
            continue
        money = li.find(name="span")
        div = li.find(name="div", attrs={"class": "li_b_l"})
        a = li.find(name="a")
        print(title.text)
        print(money.text)
        print(div.text)
        print(a.text)
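
    Both tokens come out of the same regex pattern, so the extraction step generalizes into a small helper; a sketch (the function name is mine, the regexes and the cookie handoff are exactly the ones used above):

    import re
    import requests


    def fetch_forge_pair(url, headers=None):
        """Return the anti-forgery (token, code, cookies) scraped from a login page's inline JS."""
        page = requests.get(url, headers=headers)
        token = re.findall("X_Anti_Forge_Token = '(.*?)'", page.text, re.S)[0]
        code = re.findall("X_Anti_Forge_Code = '(.*?)'", page.text, re.S)[0]
        return token, code, page.cookies.get_dict()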

    5. One for GitHub

    """进阶爬取4"""
    import requests
    from bs4 import BeautifulSoup
    
    
    r1 = requests.get(
        url="https://github.com/session",     # 这点注意url,登录是login获取cookies是session(小坑)
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
    )
    soup = BeautifulSoup(r1.text, "html.parser")
    inp = soup.find(name="input", attrs={"name": "authenticity_token"})
    cookies = r1.cookies.get_dict()
    token = inp.get("value")
    # 登录
    r2 = requests.post(
        url="https://github.com/login",
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        data={
            "commit": "Sign in",
            "utf8": "",
            "authenticity_token": token,
            "login": "你的账号",
            "password": "你的密码",
        },
        cookies=cookies
    )
    # 后续要啥随你
    print(r2.text)
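
    To confirm the sign-in actually took, request a page that requires authentication and check what comes back; a sketch (the settings URL is my choice for illustration):

    r3 = requests.get(
        url="https://github.com/settings/profile",
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        cookies=r2.cookies.get_dict(),
        allow_redirects=False,    # an anonymous session gets a 302 back to the login page
    )
    print(r3.status_code)         # 200 means the cookies are authenticated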

    6. Installing Scrapy, the big-league framework of the scraping world

    Windows install steps:
    pip3 install wheel
    pip3 install pywin32
    Download a Twisted wheel from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    pip install Twisted-18.4.0-cp36-cp36m-win_amd64.whl
    pip3 install scrapy
    OK!!!
    Linux -- couldn't be simpler:
    pip3 install scrapy
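
    With Scrapy installed, the first section's Autohome scrape fits in a few lines. A minimal sketch (the selectors are adapted from section 1; save it as news_spider.py):

    import scrapy


    class NewsSpider(scrapy.Spider):
        name = "news"
        start_urls = ["https://www.autohome.com.cn/news/"]

        def parse(self, response):
            # css() plays the role of the BeautifulSoup lookups above
            for li in response.css("#auto-channel-lazyload-article li"):
                title = li.css("h3::text").get()
                if title:
                    yield {"title": title.strip()}

    Run it with: scrapy runspider news_spider.py -o titles.json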