zoukankan      html  css  js  c++  java
  • 百度贴吧

    • 0
    • 0
    • #coding:utf-8
      
      import urllib
      import urllib2
      
      def loadPage(url, filename):
          '''
              Request `url` and return the body of the server's response.

              url: full address of the page to download
              filename: name of the file the page will be saved under; used
                        only for the progress message printed here

              Returns whatever `read()` yields for the response (the raw,
              undecoded body). Errors raised by urllib2.urlopen are not
              caught and propagate to the caller.
          '''
          print("正在下载" + filename)
          # Send a desktop-browser User-Agent instead of urllib2's default one.
          headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
          request = urllib2.Request(url, headers=headers)
          response = urllib2.urlopen(request)
          try:
              return response.read()
          finally:
              # Release the underlying connection right away rather than
              # leaving it to the garbage collector.
              response.close()
      
      
      def writePage(html, filename):
          '''
              Save downloaded page content to a local file.

              html: content of the server response (the raw body bytes)
              filename: path of the file to create or overwrite
          '''
          print("正在保存" + filename)
          # Binary mode keeps the bytes exactly as received; text mode would
          # rewrite line endings on Windows.
          with open(filename, "wb") as f:
              f.write(html)
          print("_" * 30)
      
      def tiebaSpider(url, beginPage, endPage):
          '''
              Spider scheduler: build the URL of every requested page, then
              download it and save it to disk.

              url: leading part of the URL (base address plus the encoded
                   query string); "&pn=<offset>" is appended for each page
              beginPage: first page number to fetch
              endPage: last page number to fetch (inclusive)

              Each page is written to "<page>页.html" in the current
              working directory.
          '''
          for page in range(beginPage, endPage + 1):
              # pn is an offset that advances by 50 per page
              # (presumably Tieba's page size -- confirm against the site).
              pn = (page - 1) * 50
              filename = str(page) + "页.html"
              fullurl = url + "&pn=" + str(pn)
              html = loadPage(fullurl, filename)
              writePage(html, filename)

          # Say goodbye once, after all pages are processed, not per page.
          print("谢谢使用")
      
      if __name__ == "__main__":
          # Ask for the forum name and the inclusive page range to download.
          kw = raw_input("请输入需要爬取的贴吧名:")
          beginPage = int(raw_input("请输入起始页:"))
          endPage = int(raw_input("请输入结束页:"))

          # urlencode turns the forum name into a "kw=..." query string that
          # is safe to append to the base address.
          query = urllib.urlencode({"kw": kw})
          tiebaSpider("http://tieba.baidu.com/f?" + query, beginPage, endPage)
  • 相关阅读:
    组合模式
    MySQL8.0 下载安装启动(Windows10)
    OI如逆旅,我亦是行人——省选
    闲话—江湖痴情浅,信步余生。平剑红烛,青丝微绾,却话奁中。
    此时彼方
    CSP 2019游记 & 退役记
    西狂 杨过
    SDOI 2019 Round1 游记
    NOIP2018游记
    未来可期,不知所终
  • 原文地址:https://www.cnblogs.com/hizf/p/8325220.html
Copyright © 2011-2022 走看看