zoukankan      html  css  js  c++  java
  • beautifulsoup爬取糗事百科

     1 # _*_ coding:utf-8 _*_
     2 import urllib2
     3 from bs4 import BeautifulSoup
     4 
     # Request settings shared by the scraping functions below:
     # a browser-like User-Agent header and the site root URL.
     user_agent = (
         "Mozilla/5.0 (Windows NT 6.1; WOW64) "
         "AppleWebKit/537.36 (KHTML, like Gecko) "
         "Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
     )
     headers = {"User-Agent": user_agent}
     url = "https://www.qiushibaike.com"
     8 # 爬取article链接的content内容
     def getContent(article_url, headers):
         """Download one article page and print the text of its first ``.content`` node.

         Args:
             article_url: Absolute URL of the article page to fetch.
             headers: Dict of HTTP request headers passed to ``urllib2.Request``.

         Prints the heading line, then every text fragment found under the
         first element matching the ``.content`` CSS selector (each stripped of
         surrounding whitespace), then a blank separator.  Network errors from
         ``urllib2.urlopen`` are not caught and propagate to the caller.
         """
         request = urllib2.Request(article_url, data=None, headers=headers)
         response = urllib2.urlopen(request, timeout=60)
         try:
             html = response.read().decode('utf-8')
         finally:
             # Release the connection even if reading or decoding fails.
             response.close()

         soup = BeautifulSoup(html, 'html.parser')
         matches = soup.select('.content')

         # Single-argument print(...) behaves the same as the print statement
         # on Python 2 and is also valid Python 3 syntax.
         print(u"内容:")
         # A page without a .content node produces an empty body instead of
         # an IndexError.
         if matches:
             for fragment in matches[0].strings:
                 print(u"%s" % fragment.strip())
         # The listing had this literal broken across two lines; it is the
         # newline separator printed after each article.
         print('\n')
    19 # ----------------------------
    20 # 获取"https://www.qiushibaike.com/hot/page/1/"页面的作者、好笑、评论,文章地址信息
    def getData(url, headers,pages=1):
        """Scrape the "hot" listing pages and print each entry with its article text.

        Args:
            url: Site root URL; listing pages are built as ``url + "/hot/page/" + N``
                and article links are resolved as ``url + href``.
            headers: Dict of HTTP request headers passed to ``urllib2.Request``
                and forwarded to ``getContent``.
            pages: Number of listing pages to walk, starting from page 1.

        For every entry on a page this prints the author, the vote count and
        the comment count, then calls ``getContent`` for the entry's article
        link.  Network errors from ``urllib2.urlopen`` are not caught.
        """
        for page in range(1, pages + 1):
            page_url = url + "/hot/page/" + str(page)
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and is also valid Python 3 syntax.
            print("正在爬取第 %s 页+++" % page)
            request = urllib2.Request(page_url, data=None, headers=headers)
            response = urllib2.urlopen(request, timeout=60)
            try:
                html = response.read().decode('utf-8')
            finally:
                # Release the connection even if reading or decoding fails.
                response.close()
            soup = BeautifulSoup(html, 'html.parser')

            authors = soup.select('h2')
            smile_nums = soup.select('.stats-vote > .number')
            comment_nums = soup.select('.stats-comments > .qiushi_comments > .number')
            article_urls = soup.select('.contentHerf')

            # Walk the four result lists in lockstep rather than assuming a
            # fixed 25 entries per page: zip stops at the shortest list, so a
            # short page no longer raises IndexError.
            entries = zip(authors, smile_nums, comment_nums, article_urls)
            for index, (author_tag, smile_tag, comment_tag, link_tag) in enumerate(entries, 1):
                print("正在爬取第 %s 页的第 %s 条数据---" % (page, index))
                # get_text() equals .string for a heading holding one text
                # node, and does not fail when .string would be None.
                print(u"作者: %s" % author_tag.get_text().strip())
                print(u"好笑: %s" % smile_tag.string)
                print(u"评论: %s" % comment_tag.string)

                # hrefs are concatenated onto the site root
                # (presumably site-relative paths; verify against live markup).
                getContent(url + link_tag['href'], headers)
    48 # ---------------------------------
    # Script entry: crawl the first 10 "hot" pages.
    getData(url=url, headers=headers, pages=10)
  • 相关阅读:
    linux下错误的捕获:errno和strerror的使用
    三角识别注意事项
    关于udo3d双目相机的嵌入式板子系统重装
    为网页背景添加一个跟随鼠标变幻的动态线条
    API工具下载地址记录一下
    Eclipse 安装 SVN 插件的两种方法
    java技术面试之面试题大全
    烧绳子问题
    Java web 项目 web.xml 配置文件加载过程
    mysql绿色版安装配置
  • 原文地址:https://www.cnblogs.com/stonelovy/p/7624685.html
Copyright © 2011-2022 走看看