zoukankan      html  css  js  c++  java
  • 简单爬虫

    import requests
    # The requests module sends HTTP requests and receives the responses;
    # each response also keeps a reference to the request that produced it.

    # Browser-like User-Agent sent with the request instead of the library default.
    headers = {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
    }

    # Query-string parameters; requests appends them to the URL.
    params = {"wd": "python"}


    # Without a timeout requests waits forever on an unresponsive server.
    r = requests.get(
        "http://www.baidu.com/s",
        headers=headers,
        params=params,
        timeout=10,
    )

    # r.content is the raw response body as bytes; decode() uses UTF-8 by default.
    print(r.content.decode())
    # Print the URL of the request that was actually sent (query string included).
    print(r.request.url)


    import requests

    # Send the request without custom headers.
    # Without a timeout requests waits forever on an unresponsive server.
    r = requests.get("http://www.baidu.com", timeout=10)
    # The text may be garbled: requests guesses the encoding of the server's
    # response and decodes the body with that guess.
    print(r.text)
    # The content returned here is fairly short. No User-Agent has been set yet,
    # so the site presumably treats the client as a crawler and withholds content.
    # Print the encoding that requests guessed:
    # print(r.encoding)

    # r.content holds the body as bytes, so r.content.decode() can be used directly.

    # Override the encoding used to decode r.text:
    # r.encoding = "utf8"
    # print(r.text)

    # Print the request headers:
    # print(r.request.headers)
    # Print the response headers.
    print(r.headers)


     

    import requests

     

    class TiebaSpider:
        """Download listing pages of one Baidu Tieba forum and save them as HTML files.

        Args:
            tieba_name: Forum name; used in the request URL and as the prefix
                of every saved file name.
            page_count: Number of listing pages to fetch (defaults to 5).
            timeout: Seconds to wait for each HTTP response before giving up.
        """

        def __init__(self, tieba_name, page_count=5, timeout=10):
            self.tieba_name = tieba_name
            self.page_count = page_count
            self.timeout = timeout
            # "{}" is filled in with the "pn" offset of each page.
            self.temp_url = "https://tieba.baidu.com/f?kw=" + tieba_name + "&pn={}"
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko)           Chrome/60.0.3112.113 Safari/537.36"
            }

        def get_url_list(self):
            """Return the page URLs; the "pn" offset advances by 50 per page."""
            return [self.temp_url.format(i * 50) for i in range(self.page_count)]

        def parse_url(self, url):
            """Send a GET request to *url* and return the body decoded as UTF-8."""
            # Without a timeout requests waits forever on an unresponsive server.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            return response.content.decode()

        def save_html(self, html, page_num):
            """Write *html* to "<tieba_name>_<page_num>.html" in the working directory."""
            file_path = self.tieba_name + "_" + str(page_num) + ".html"
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html)

        def run(self):
            """Fetch every page URL in order and save each page, numbered from 1."""
            # enumerate yields the page number directly instead of searching the
            # list for each URL again.
            for page_num, url in enumerate(self.get_url_list(), start=1):
                html_str = self.parse_url(url)
                self.save_html(html_str, page_num)

     


    if __name__ == "__main__":
        # Crawl the forum named below when this file is run as a script.
        tieba = TiebaSpider("蒋欣")
        tieba.run()

    Life is short, I need Python.
  • 相关阅读:
    17.1.2.1 Advantages and Disadvantages of Statement-Based and Row-Based Replication
    17.1.2 Replication Formats
    Setting the Master Configuration on the Slave
    17.1.1.9 Introducing Additional Slaves to an Existing Replication Environment
    17.1.1.8 Setting Up Replication with Existing Data
    17.1.1.7 Setting Up Replication with New Master and Slaves
    17.1.1.6 Creating a Data Snapshot Using Raw Data Files
    列出display的值,并说明它们的作用
    CSS设置DIV居中
    CSS选择符有哪些?哪些属性可以继承?优先级算法如何计算?
  • 原文地址:https://www.cnblogs.com/lvhonglei-python/p/7525559.html
Copyright © 2011-2022 走看看