zoukankan      html  css  js  c++  java
  • 学习笔记 requests + BeautifulSoup

    第一步:requests

    get请求

    # -*- coding:utf-8  -*-
    # Date: 2018/5/15 17:46
    # Author: xiaoshubiao
    # Demo: send a plain HTTP GET with requests, then print the status code
    # and the response body.
    import requests
    url = "http://www.baidu.com"
    # Option 1: the requests.get() shortcut (left disabled; option 2 is used).
    # res = requests.get(url)

    # Option 2: the generic requests.request() call with the method name given
    # as its first argument.
    res = requests.request('get', url)
    # Each print has to be its own statement; two calls on one line is a
    # SyntaxError.
    print('响应状态码:', res.status_code)
    print('响应内容:', res.text)

    post请求

    # -*- coding:utf-8  -*-
    # Date: 2018/5/15 17:46
    # Author: xiaoshubiao
    # Demo: send an HTTP POST carrying two form fields, then print the status
    # code and the response body.
    import requests

    target = "http://www.baidu.com"
    # Field names and values submitted with the request.
    form_fields = dict(username='xiaoshubiao', pwd='xiaoshubiao')
    # A dict given as `data` is sent as the request body.
    reply = requests.post(target, data=form_fields)
    print('响应状态码:', reply.status_code)
    print('响应内容:', reply.text)

    第二步:伪装浏览器和伪造cookie

    # -*- coding:utf-8  -*-
    # Date: 2018/5/15 17:46
    # Author: xiaoshubiao
    # Demo: send a GET request with browser-like headers and a hand-made
    # cookie, then print the status code and the response body.
    import requests

    # Header values are assembled first so the dict below stays readable.
    user_agent = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/55.0.2883.87 '
        'UBrowser/6.2.3964.2 Safari/537.36'
    )
    accept = (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8'
    )
    browser_headers = {
        'User-Agent': user_agent,
        'Accept': accept,
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
    }
    # A single cookie called "name" sent along with the request.
    fake_cookies = {'name': 'xiaoshubiao'}

    target = "http://www.baidu.com"
    reply = requests.get(target, cookies=fake_cookies, headers=browser_headers)
    print('响应状态码:', reply.status_code)
    print('响应内容:', reply.text)

    第三步:使用代理ip

    # -*- coding:utf-8  -*-
    # Date: 2018/5/15 17:46
    # Author: xiaoshubiao
    # Demo: send a GET request through an HTTP proxy with browser-like headers
    # and a hand-made cookie, then print the status code and the response body.
    import requests
    url = "http://www.baidu.com"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
                             ' (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.39'
                             '64.2 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0'
                          '.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive'
                }
    cookies = dict(name='xiaoshubiao')
    # Keys are the scheme of the target URL; values are proxy URLs. The proxy
    # URL carries an explicit scheme, as the Requests documentation requires.
    proxies = {'http': 'http://218.73.134.234:36602'}
    # Without a timeout an unreachable proxy would block this call forever;
    # with one, requests raises an exception after 10 seconds instead.
    res = requests.get(url, headers=headers, cookies=cookies, proxies=proxies,
                       timeout=10)
    print('响应状态码:', res.status_code)
    print('响应内容:', res.text)

     第四步:内容解析

    # -*- coding:utf-8  -*-
    # Date: 2018/5/15 17:46
    # Author: xiaoshubiao
    # Demo: download a page, parse it with BeautifulSoup and read tag text and
    # attribute values through CSS selectors.
    import requests
    from bs4 import BeautifulSoup

    page = requests.get("http://news.sina.com.cn/guide/")
    # Set the encoding before reading .text so the body is decoded as UTF-8.
    page.encoding = 'utf-8'
    html = page.text

    # Parse the HTML with the lxml parser.
    soup = BeautifulSoup(html, 'lxml')

    # select() takes a CSS selector and returns a list of the matching tags.
    title_list = soup.select('title')       # every <title> tag
    a_list = soup.select('a')               # every <a> tag
    ul_list = soup.select('ul.list01')      # <ul> tags with class "list01"
    div_list = soup.select('div#tab01')     # the <div> with id "tab01"

    # NOTE(review): zip() stops at the shorter list, so if the page has a single
    # <title> only the first link is printed -- confirm this is intended.
    for title_tag, link_tag in zip(title_list, a_list):
        # get_text() returns the tag's text; get('href') returns the attribute value.
        print(title_tag.get_text(), link_tag.get('href'))

     

  • 相关阅读:
    Codeforces Round #694 (Div.1, Div.2)题解(2A-1D)(1E-1F待填)
    【Luogu日报#294】OI中你可能会用到的一些不等式及其证明 | 习题题解
    SP10570 LONGCS
    Unity 数字孪生笔记 工具介绍
    Unity3D 最佳数字孪生插件(一个基于Unity的自动化概念设计,仿真,虚拟调试和3D HMI的框架)
    Unity 数字孪生笔记 PiXYZ快速入门
    数据结构:堆排序
    HDU 4704 Sum (欧拉降幂+二项式公式+快速幂)
    Codeforces Goodbye2020部分题解
    Apache架构师的30条设计原则
  • 原文地址:https://www.cnblogs.com/7749ha/p/9042395.html
Copyright © 2011-2022 走看看