  • Web scraper: scraping Qiushibaike (糗事百科)

    The Python 2 script below fetches the text-post listing on qiushibaike.com page by page and prints each post together with the replies on its comment page.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import urllib2
    from bs4 import BeautifulSoup

    def getContentOrComment(argurl):
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        headers = {'User-Agent': user_agent}
        # send a browser User-Agent so the site's basic anti-scraping check lets the request through
        req = urllib2.Request(url=argurl, headers=headers)
        try:
            response = urllib2.urlopen(req)  # open the URL
            content = response.read()        # read the page source
        except Exception:
            content = None                   # callers check for None on failure
        return content

    articleUrl = 'http://www.qiushibaike.com/textnew/page/%d'  # article list URL
    commentUrl = 'http://www.qiushibaike.com/article/%s'       # comment page URL

    page = 0

    while True:
        raw = raw_input('Press Enter to view the next page, or type exit to quit: ')
        if raw == 'exit':
            break
        page += 1
        url = articleUrl % page
        print url

        articlePage = getContentOrComment(url)
        if articlePage is None:  # skip this page if the request failed
            continue
        articleFloor = 1

        soup = BeautifulSoup(articlePage, 'html.parser')  # parse the listing page
        for article in soup.find_all(attrs='article block untagged mb15'):
            # the element id embeds the post id as its third '_'-separated field
            commentId = str(article.get('id')).strip().split('_')[2]
            print ' '
            print articleFloor, '.', article.find(attrs='content').get_text().strip()
            articleFloor += 1

            # fetch the comment page for this post
            commentPage = getContentOrComment(commentUrl % commentId)
            if commentPage is None:
                continue
            soupComment = BeautifulSoup(commentPage, 'html.parser')
            commentFloor = 1
            for comment in soupComment.find_all(attrs='body'):
                print ' ', commentFloor, '. reply:', comment.get_text().strip()
                commentFloor += 1
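
    The script above is Python 2 only (urllib2, raw_input, and print statements do not exist in Python 3). As a rough illustration, here is a minimal Python 3 sketch of the same fetch-and-parse pattern; it assumes the third-party requests library is installed, and it reuses the 2017 class names ('article block untagged mb15', 'content'), which the live site may well have changed since.

    import requests
    from bs4 import BeautifulSoup

    HEADERS = {'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/57.0.2987.133 Safari/537.36')}

    def get_page(url):
        """Return the page HTML as text, or None if the request fails."""
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()  # raise on 4xx/5xx responses
            return resp.text
        except requests.RequestException:
            return None

    html = get_page('http://www.qiushibaike.com/textnew/page/1')
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        # BeautifulSoup treats a bare string passed as attrs as a class match
        for article in soup.find_all(attrs='article block untagged mb15'):
            print(article.find(attrs='content').get_text().strip())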

  • Original post: https://www.cnblogs.com/shanhua-fu/p/6934509.html