zoukankan      html  css  js  c++  java
  • python爬虫基础_requests和bs4

    这些都是笔记,还缺少详细整理,后续会更新。

    下面这种方式,属于入门阶段,手动成分比较多.

    首先安装必要组件:

    pip3 install requests

    pip3 install beautifulsoup4

    一、爬汽车之家

    #!/usr/bin/env python
    # coding:utf-8

    import requests
    from bs4 import BeautifulSoup

    # 1. Download the news page.
    ret = requests.get(url="https://www.autohome.com.cn/news/")
    # Decode with the encoding detected from the page body itself so the
    # Chinese text is not mojibake (the HTTP header may claim another charset).
    ret.encoding = ret.apparent_encoding

    # 2. Parse: extract the wanted content with BeautifulSoup.
    soup = BeautifulSoup(ret.text, 'html.parser')  # 'lxml' would be faster

    # Locate the outer article wrapper by id + class using the attrs dict
    # form (a bare class keyword would have to be spelled class_=...).
    div = soup.find(name='div', attrs={"id": "auto-channel-lazyload-article",
                                       "class": "article-wrapper"})

    li_list = div.find_all(name='li')

    for li in li_list:
        h3 = li.find(name='h3')
        if not h3:
            # Ad / placeholder <li> without a headline -- skip it.
            continue
        print(h3.text)

        a = li.find('a')
        if a:  # guard: a missing <a> would raise AttributeError on .get
            print(a.get('href'))

        p = li.find(name='p')
        if p:  # guard: a missing <p> would raise AttributeError on .text
            print(p.text)
        print('----->' * 20)

        img = li.find(name='img')
        src = img.get('src') if img else None
        if not src:
            continue  # no thumbnail for this entry

        # Thumbnail URLs usually look like ...__<name>.jpg; fall back to the
        # last path segment when the '__' marker is missing (the original
        # unconditional [1] raised IndexError in that case).
        parts = src.rsplit('__', maxsplit=1)
        filename = parts[1] if len(parts) == 2 else src.rsplit('/', 1)[-1]

        down_img = requests.get(url='https:' + src)
        with open(filename, 'wb') as f:
            f.write(down_img.content)

    当然,从for循环输出开始,将内容写入文件或数据库,就随需求了。 

    import os

    import requests
    from bs4 import BeautifulSoup

    # 1. Download the news page.
    ret = requests.get(url="https://www.autohome.com.cn/news/")
    ret.encoding = ret.apparent_encoding  # decode with the page's own charset

    # 2. Parse: extract the wanted content with BeautifulSoup.
    soup = BeautifulSoup(ret.text, 'html.parser')  # 'lxml' would be faster

    # Locate the outer wrapper div by id + class (attrs dict form).
    div = soup.find(name='div', attrs={"id": "auto-channel-lazyload-article",
                                       "class": "article-wrapper"})

    li_list = div.find_all(name='li')

    # ./img/ must exist before image files can be opened for writing.
    os.makedirs('img', exist_ok=True)

    with open('res.txt', 'w', encoding='utf-8') as t:
        for li in li_list:
            h3 = li.find(name='h3')
            if not h3:
                continue  # ad / placeholder <li> without a headline
            t.write(h3.text + '\n')

            a = li.find('a')
            t.write(a.get('href') + '\n')

            p = li.find(name='p')
            # The summary starts after a two-space gap; keep the part after
            # it, or the whole text when the gap is absent (the original
            # unconditional [1] raised IndexError in that case).
            pieces = p.text.split('  ', 1)
            txt = pieces[1] if len(pieces) == 2 else pieces[0]
            t.write(txt + '\n')
            t.write('\n')

            img = li.find(name='img')
            src = img.get('src')

            # Derive a filename from the ...__<name>.jpg URL pattern, with a
            # fallback to the last path segment.
            parts = src.rsplit('__', maxsplit=1)
            filename = parts[1] if len(parts) == 2 else src.rsplit('/', 1)[-1]

            down_img = requests.get(url='https:' + src)
            with open('./img/' + filename, 'wb') as f:
                f.write(down_img.content)
    View Code

    二、登录抽屉

    #!/usr/bin/env python
    # coding:utf-8

    import requests

    # A browser-like User-Agent; the more the request resembles a real
    # browser, the less likely the site's anti-crawler logic rejects it.
    BROWSER_HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

    # 1. Visit an ordinary page first to receive the (not yet authorized)
    #    cookie that the site hands out to anonymous visitors.
    first_page = requests.get(
        url="https://dig.chouti.com/all/hot/recent/1",
        headers=BROWSER_HEADERS,
    )
    r1_cookie_dict = first_page.cookies.get_dict()

    # 2. Log in: post the phone number and password together with the
    #    anonymous cookie from step 1, so the server authorizes THAT cookie.
    #    (Beware of the site's anti-crawler countermeasures.)
    response_login = requests.post(
        url="https://dig.chouti.com/login",
        data={
            "phone": "8618912600100",
            "password": "wodemima",
            "oneMonth": "1"
        },
        headers=BROWSER_HEADERS,
        cookies=r1_cookie_dict
    )
    # Note: the cookie returned by the login response is not the one that
    # carries the session -- the first-visit cookie (now authorized) is.

    # 3. Upvote a post, reusing the authorized first-visit cookie.
    r1 = requests.post(
        url="https://dig.chouti.com/link/vote?linksId=20630611",
        headers=BROWSER_HEADERS,
        cookies=r1_cookie_dict
    )
    print(r1.text)
    # {"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_53074732774","likedTime":"1530752755154000","lvCount":"21","nick":"aabbccdd","uvCount":"1","voteTime":"小于1分钟前"}}}

    requests和bs4的几个小片段:

    #!/usr/bin/env python
    # coding:utf-8

    import re

    import requests
    from bs4 import BeautifulSoup

    '''
    requests.get(url="http://www.baidu.com")  # requests.request(method="get",url="xxx")
    requests.post(url="http://www.baidu.com")  # requests.request(method="post",url="xxx")


    可以传的参数:
    url: 地址
    params: URL中传入的参数
    headers: 请求头
    cookies: Cookie
    data: 数据
        以上必需牢记
    '''

    # Search Baidu; params is URL-encoded into the query string automatically.
    ret = requests.get(
        url="https://www.baidu.com/s",
        params={"wd": "王历宏"},  # https://www.baidu.com/s?wd=%E6%9D%8E%E5%81%A5
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36', },
    )
    ret.encoding = ret.apparent_encoding

    soup = BeautifulSoup(ret.text, 'html.parser')

    # The span holding the "about N results" banner. It is absent when Baidu
    # serves a captcha / anti-bot page, so guard against None before .text
    # (the original crashed with AttributeError in that case).
    div = soup.find(name='span', attrs={"class": "nums_text"})
    if div is not None:
        # To keep digits only: "".join(re.findall(r"\d+", div.text))
        # (note the raw string -- a plain "d+" would only match letter d's).
        print(div.text)


    '''
    ### json参数
    requests.post(
        url="http://www.baidu.com",
        # json={
        #     'name':'alex',
        #     'passwd':'123456',
        # },
        headers={},
        cookies={},

        # 如果搞不清对方是要Form_data 还是payload 就使用下面的方式。
        data=json_dumps({
            'name':'alex',
            'pwd':'123456',
        })
    )

    '''
    ## File upload (not demonstrated here)


    # auth: HTTP basic authentication (the browser popup style)
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    res = requests.get(
        'https://api.github.com/user', auth=HTTPBasicAuth("abc@163.com","11223344")
        # 'https://api.github.com/user', auth=HTTPDigestAuth("abc@163.com","11223344")  # digest is a different scheme
    )
    print(res.text)


    # timeout: seconds to wait for the server before giving up


    # allow_redirects: whether 3xx responses are followed automatically



    ## proxies
    '''
    proxies ={
        "http":"61.172.249.96:80",
        "https":"http://61.185.219.126:3128",
    }

    ret = requests.get("http://www.proxy360.cn/Proxy",proxies=proxies)


    proxies2 = {"http://10.20.1.128":"http://10.10.1.10:5323"}
    '''


    # Proxy dict combined with username/password authentication
    '''
    from requests.auth import HTTPProxyAuth

    proxy_dict={
        'http':'77.75.105.165',
        'https':'77.75.105.166'
    }

    auth=HTTPProxyAuth('username','mypwd')

    r = requests.get("http://www.google.com",proxies=proxy_dict,auth=auth)
    '''

    我上交的作业,还是有不少问题。

    #!/usr/bin/env python
    # coding:utf-8

    import requests
    from bs4 import BeautifulSoup

    username = input("请输入github账号:")
    pwd = input("请输入github密码:")
    print("请稍等几秒... ")

    UA = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'

    # 1. Open the login page to obtain the anonymous cookie and CSRF token.
    ret1 = requests.get(
        url="https://github.com/login",
        headers={'User-Agent': UA},
    )
    r1_cookie_dict = ret1.cookies.get_dict()  # first (anonymous) cookie

    soup1 = BeautifulSoup(ret1.text, features='lxml')
    token_input = soup1.find(name="input", attrs={"name": "authenticity_token"})
    if token_input is None:
        raise SystemExit("authenticity_token not found -- login page layout changed?")
    token1 = token_input.get("value")  # CSRF token embedded in the login form

    # 2. Perform the login POST with the token and the anonymous cookie.
    ret2 = requests.post(
        url="https://github.com/session",
        data={
            "commit": "Sign in",
            "utf8": "",
            "authenticity_token": token1,
            "login": username,
            "password": pwd,
        },
        headers={'user-agent': UA},
        cookies=r1_cookie_dict,  # carry the first-visit cookie
    )
    r2_cookie_dict = ret2.cookies.get_dict()  # session cookie after login

    # 3. The assignment asks for personal info, so open the settings page
    #    with the logged-in session cookie.
    ret3 = requests.get(
        url="https://github.com/settings/profile",
        headers={'user-agent': UA},
        cookies=r2_cookie_dict,
    )

    # 4. Locate and print the profile fields.
    soup3 = BeautifulSoup(ret3.text, features='lxml')

    def _input_value(field_name):
        """Return the value attribute of the <input> named field_name, or '' when absent (e.g. login failed)."""
        tag = soup3.find(name="input", attrs={"name": field_name})
        value = tag.get("value") if tag is not None else None
        return value if value is not None else ''

    user_info_name = _input_value("user[profile_name]")

    # The public email is a <select>; its current value is the text of the
    # selected <option>, not an attribute of the <select> itself (the
    # original .get("option") always returned None).
    email_select = soup3.find(name="select", attrs={"name": "user[profile_email]"})
    user_info_email = ''
    if email_select is not None:
        chosen = email_select.find("option", selected=True) or email_select.find("option")
        if chosen is not None:
            user_info_email = chosen.text

    # A <textarea> keeps its content as element text, not in a value
    # attribute (the original .get("value") always returned None).
    bio_tag = soup3.find(name="textarea", attrs={"name": "user[profile_bio]"})
    user_info_bio = bio_tag.text if bio_tag is not None else ''

    user_info_url = _input_value("user[profile_blog]")
    user_info_company = _input_value("user[profile_company]")
    user_info_location = _input_value("user[profile_location]")


    print('Name: ', user_info_name)
    print('Public email: ', user_info_email)
    print('Bio: ', user_info_bio)
    print('URL: ', user_info_url)
    print('Company: ', user_info_company)
    print('Location: ', user_info_location)


    '''
    以下是API的方式,试过,直接得到字典。

    from requests.auth import HTTPBasicAuth

    res = requests.get(
        'https://api.github.com/user', auth=HTTPBasicAuth(username, pwd)
    )
    print(res.text)
    '''

    以下是老师给的指导意见,真是非常好的反馈:

    1.请了解下python的pep8规范
    
    2.你的请求头一定要写完整,不要这么暴露你的爬虫请求,这种行为是不好的习惯。
    
    3.你代码的注释写在文档里最好了。
    
    4.你每个请求一定要try一下这在爬虫里很重要你要保证你的爬虫稳定运行
    
    5.你的代码应该封装成函数
    
    6.你写任何项目的时候注意下项目结构哈
    
    7.同学作业写的很好了,其实生产中bs4还是不多的。pyquery或者路径获取的方式用的很多。
  • 相关阅读:
    人月神话阅读笔记01
    梦断代码阅读笔记03
    构建之法阅读笔记03
    构建之法阅读笔记02
    个人课程总结
    第十六周进度总结
    计算最长英语单词链
    第十五周进度总结
    浪潮之巅阅读笔记03
    冲刺2-10
  • 原文地址:https://www.cnblogs.com/frx9527/p/requests.html
Copyright © 2011-2022 走看看