zoukankan      html  css  js  c++  java
  • 【Python】python 爬虫学习

    response = requests.get("http://www.baidu.com")

     response.content  返回bytes类型;response.content.decode("utf-8")  用decode解码后得到str类型

     response.text    response.encoding = "gbk" # 修改编码 返回str类型 

     获取图片

    # coding=utf-8
    import requests

    # Address of the image to fetch.
    url = "http://wap.jiapai.net.cn/images/1.jpg"
    response = requests.get(url)

    # response.content is the raw body as bytes, so the file is opened in
    # binary mode and the bytes are written out unchanged.
    with open("baidu.png", mode="wb") as image_file:
        image_file.write(response.content)
                                 

    ---

    # 状态码 

    response.status_code 

    # 响应头

    response.headers

    # 请求头

    response.request.headers

    200
    {'Content-Length': '20851', 'Content-Type': 'image/jpeg', 'Last-Modified': 'Sun, 28 Jul 2019 04:29:48 GMT', 'Accept-Ranges': 'bytes', 'ETag': '"1f3f6d17fd44d51:0"', 'Set-Cookie': 'sdwaf-test-item=1ed57f5405075208510954035156575b5c5754065406040d015701515e520c; path=/; HttpOnly', 'X-Powered-By': 'SDWAF', 'Date': 'Tue, 05 May 2020 01:56:48 GMT'} {'User-Agent': 'python-requests/2.23.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}

    ---

    # 发送带header的请求 

    # coding=utf-8
    import requests

    url = "http://wap.jiapai.net.cn/images/1.jpg"
    # Custom User-Agent header sent along with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
    }

    response = requests.get(url, headers=headers)

    # Print, in order: the status code, the response headers, and the headers
    # that were actually sent with the request.
    for item in (response.status_code, response.headers, response.request.headers):
        print(item)

    ---

    # 发送带参数的请求 

    # Query-string parameters (placeholder key/value); requests encodes the
    # dict and appends it to the URL.
    params = {"":""}

    # The URL must carry an explicit scheme: passing a bare "www.baidu.com/s?"
    # makes requests raise MissingSchema before any request is sent.
    url_temp = "https://www.baidu.com/s?"

    requests.get(url_temp, params=params)

    ---

    # 占位符 建议使用format+ {} 代替 

    input_string = input("")

    url = "http://www.baidu.com/s?wd={}".format(input_string)  # 或者: url = "https://www.baidu.com/s?wd=%s" % input_string

    ---

    列表推导式

    # 范围0~9 

    [i for i in range(10)]

    # i对2取余 输出 

    [i%2 for i in range(10)]

    # i对2取余,如果对2取余等于0 则输出 

    [i%2 for i in range(10) if i%2==0]

    ---

    ## 面向对象 

    - 对象

      - 生活中的事物

    - 类

      - 对事物的抽象 在代码中实现class 类型

    - 实例

      - 使用之前对类的实例化之后的结果

    --- 

    # get 请求贴吧 

    # coding=utf-8
    import requests
    
    class TiebaSpider:
        """Fetch ten list pages of a Baidu Tieba forum and save each one to a local file.

        Attributes:
            tieba_name: Forum name; used as the ``kw`` query value and as the
                prefix of every saved file name.
            url_temp: URL template whose ``{}`` placeholder receives the ``pn``
                offset.
            headers: HTTP headers sent with every request (a browser User-Agent).
        """

        def __init__(self, tieba_name):
            """Store the forum name and build the URL template and request headers."""
            self.tieba_name = tieba_name
            # The trailing "{}" is filled in later by get_url_list().
            self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
            self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"}

        def get_url_list(self):
            """Return the ten page URLs, with ``pn`` set to 0, 50, ..., 450."""
            return [self.url_temp.format(i * 50) for i in range(10)]

        def parse_url(self, url):
            """Send a GET request to *url* and return the body decoded as UTF-8."""
            response = requests.get(url, headers=self.headers)
            return response.content.decode("utf-8")

        def save_html_str(self, html_str, page_num):
            """Write *html_str* to a UTF-8 file named ``<tieba_name>-第<page_num>页``."""
            file_path = "{}-第{}页".format(self.tieba_name, page_num)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html_str)

        def run(self):
            """Download every page in order and save it under its 1-based page number."""
            # 1. Build the URLs.
            url_list = self.get_url_list()

            # enumerate() supplies the page number directly, instead of searching
            # the list with index() on every iteration.
            for page_num, url in enumerate(url_list, start=1):
                # 2. Send the request and get the response body.
                html_str = self.parse_url(url)

                # 3. Save it.
                self.save_html_str(html_str, page_num)
    
    # Script entry point: when run directly, create a spider for the "李毅"
    # forum and run it.
    if __name__ == "__main__":
        tieba_spider = TiebaSpider("李毅")
        tieba_spider.run()
                                          

    # 保存贴吧内容到本地 

    ---

    # Post 请求  安全 大文本传输 

    data = {"":""}  # dict of form fields passed as the POST body (placeholder key/value)

    # NOTE(review): `headers` is not defined in this snippet; presumably the
    # dict from the earlier header example is reused - confirm.
    requests.post("https://www.baidu.com",data = data,headers=headers)

  • 相关阅读:
    Python基础语法 第2节课(数据类型转换、运算符、字符串)
    python基础语法 第5节课 ( if 、 for )
    python基础语法 第4节课 (字典 元组 集合)
    Python基础语法 第3节课 (列表)
    A. Peter and Snow Blower 解析(思維、幾何)
    C. Dima and Salad 解析(思維、DP)
    D. Serval and Rooted Tree (樹狀DP)
    C2. Balanced Removals (Harder) (幾何、思維)
    B. Two Fairs 解析(思維、DFS、組合)
    D. Bash and a Tough Math Puzzle 解析(線段樹、數論)
  • 原文地址:https://www.cnblogs.com/oscarli/p/12829574.html
Copyright © 2011-2022 走看看