zoukankan      html  css  js  c++  java
  • Python 抓取伯乐在线的全部文章，对标题分词后存入 MongoDB 中

    依赖包:

    1.pymongo

    2.jieba


    # -*- coding: utf-8 -*-

    """
    @author: jiangfuqiang
    """
    from HTMLParser import HTMLParser
    import urllib2
    import sys
    import pymongo
    import time
    import jieba
    import traceback

    # Force the interpreter-wide default string encoding to UTF-8 when it is
    # not already UTF-8.  This relies on the Python 2 builtin `reload`.
    # NOTE(review): presumably needed so the scraped Chinese text survives
    # implicit str/unicode conversions -- confirm against the callers.
    default_encoding = 'utf-8'
    if sys.getdefaultencoding() != default_encoding:
        # Order matters: `sys` is reloaded before `setdefaultencoding` is
        # called.  NOTE(review): the usual reason is that Python 2's `site`
        # module removes `sys.setdefaultencoding` at startup -- confirm.
        reload(sys)
        sys.setdefaultencoding(default_encoding)
    class FetchJobble(HTMLParser):
        """Collect post summaries from a post-listing HTML page.

        The parser is a small flag-driven state machine: ``handle_starttag``
        arms a flag when it sees a marker element or attribute, and
        ``handle_data`` stores the next text chunk under the matching key of
        the record being built (``self.data``).  A record is finished and
        appended to ``self.result`` when the ``href`` of the anchor following
        a ``read-more`` span is reached.

        Keys written to a record: ``imgSrc``, ``title``, ``tag``, ``comment``,
        ``desc``, ``redmoreLink`` and ``keyword``.  The spelling
        ``redmoreLink`` is kept as-is because it is the stored field name.
        """

        def __init__(self):
            # Explicit base-class call, kept from the original.
            HTMLParser.__init__(self)
            self.isPostThumb = False    # inside/after a <div class="post-thumb">
            self.isPostMeta = False     # never set to True in this class
            self.isMetaTitle = False    # next text chunk is the post title
            self.isCategoryTag = False  # next text chunk is a category tag
            self.isComment = False      # comment link seen for this post
            self.isexcerpt = False      # next text chunk is the excerpt
            self.isReadMore = False     # next anchor href is the post link
            self.isPicture = False      # reset with the others; not read here
            self.data = {}              # record currently being assembled
            self.result = []            # finished records, in page order

        def handle_starttag(self, tag, attrs):
            """Update parser state for an opening tag.

            ``attrs`` is the list of ``(name, value)`` pairs supplied by
            ``HTMLParser``; ``value`` is ``None`` for valueless attributes.
            """
            if tag == 'div':
                self._scan_div(attrs)
            elif tag == 'a' and self.isPostThumb:
                self._scan_anchor(attrs)
            elif tag == 'span' and self.isComment:
                self._scan_span(attrs)
            elif tag == 'img' and self.isPostThumb and not self.isPostMeta:
                self._scan_img(attrs)

        def _scan_div(self, attrs):
            """Arm the thumbnail/title flags from a <div>'s class attribute."""
            for key, value in attrs:
                if key != 'class':
                    continue
                if value == 'post-thumb':
                    self.isPostThumb = True
                elif value == 'meta-title':
                    self.isMetaTitle = True

        def _scan_anchor(self, attrs):
            """Handle an <a> seen after a post thumbnail."""
            for key, value in attrs:
                if self.isReadMore:
                    if key == 'href':
                        self._finish_record(value)
                        # Stop here: the remaining attributes belong to the
                        # read-more anchor of the record just stored and must
                        # not arm flags for the next, still empty, record.
                        break
                elif key == 'class':
                    if value == 'meta-title':
                        self.isMetaTitle = True
                elif key == 'rel':
                    if value == 'category tag':
                        self.isCategoryTag = True
                elif key == 'href':
                    # A valueless href arrives as None; skip it instead of
                    # raising.  The "> 0" test is kept from the original, so
                    # an href that *starts* with '#respond' does not count.
                    if value is not None and value.find('#respond') > 0:
                        self.isComment = True

        def _scan_span(self, attrs):
            """Arm the excerpt/read-more flags from a <span>'s class."""
            for key, value in attrs:
                if key != 'class':
                    continue
                if value == 'excerpt':
                    self.isexcerpt = True
                elif value == 'read-more':
                    self.isReadMore = True

        def _scan_img(self, attrs):
            """Record the image URL of the post thumbnail."""
            for key, value in attrs:
                if key == 'src':
                    self.data['imgSrc'] = value

        def _finish_record(self, link):
            """Store ``link``, derive keywords, append the record and reset."""
            self.data['redmoreLink'] = link
            # A record may reach this point without a title; segment an empty
            # string in that case rather than raising KeyError.
            title = self.data.get('title', '')
            self.data['keyword'] = ",".join(jieba.cut(title))
            self.result.append(self.data)
            self.isPostThumb = False
            self.isMetaTitle = False
            self.isReadMore = False
            self.isCategoryTag = False
            self.isComment = False
            self.isexcerpt = False
            self.isPicture = False
            self.data = {}

        def handle_endtag(self, tag):
            """Closing tags carry no information for this parser."""
            pass

        def handle_data(self, data):
            """Store a text chunk under the key selected by the armed flag."""
            if self.isMetaTitle:
                self.data['title'] = data
                self.isMetaTitle = False
            elif self.isCategoryTag:
                # Several category links are joined into one comma list.
                if 'tag' in self.data:
                    self.data['tag'] = self.data['tag'] + "," + data
                else:
                    self.data['tag'] = data
                self.isCategoryTag = False
            elif self.isComment and 'comment' not in self.data:
                # Keep only the text before the first space of the chunk.
                self.data['comment'] = data.split(" ")[0]
            elif self.isexcerpt:
                self.data['desc'] = data
                self.isexcerpt = False

        def getResult(self):
            """Return the list of finished records."""
            return self.result

    if __name__ == "__main__":
        # Crawl the paginated post listing page by page, parse each page with
        # FetchJobble and store every record in the `blog.fetch_blog`
        # collection of a local MongoDB.  Stops at the first page that yields
        # no records, or after too many consecutive failures.

        # Newer PyMongo releases expose MongoClient in place of Connection;
        # prefer it when present and fall back to the old name otherwise.
        client_factory = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
        con = client_factory('localhost', 27017)
        try:
            db = con.blog
            fetchblog = db.fetch_blog

            url = "http://blog.jobbole.com/all-posts/page/%d"
            count = 1
            flag = False
            # Consecutive failures tolerated before giving up.  Without a
            # bound, a page that always fails (e.g. an HTTP error past the
            # last page) would be retried forever with no delay.
            max_failures = 3
            failures = 0
            headers={
                     'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            while not flag:
                try:
                    req = urllib2.Request(url % count, headers=headers)
                    response = urllib2.urlopen(req)
                    try:
                        data = response.read()
                    finally:
                        # Release the HTTP connection even if read() fails.
                        response.close()
                    fj = FetchJobble()
                    fj.feed(data)
                    result = fj.getResult()
                    if not result:
                        # An empty page marks the end of the listing.
                        flag = True
                    else:
                        for doc in result:
                            fetchblog.insert(doc)
                        print("page is %d" % count)
                        count += 1
                        failures = 0

                        # Pause between pages to go easy on the server.
                        time.sleep(5)
                except Exception as e:
                    traceback.print_exc()
                    print("parse error %s" % (e,))
                    failures += 1
                    if failures >= max_failures:
                        flag = True
                    else:
                        # Back off before retrying the same page.
                        time.sleep(5)
        finally:
            con.close()

  • 相关阅读:
    git如何从远端获取某个文件
    git显示不出来图标标志
    sublime text3设置
    怎么解决sublime的插件自动被禁用
    外甥语录
    sublime Text3支持vue高亮,sublime Text3格式化Vue
    sass安装方法,绝对好用的方式
    win10 安装nodejs,报错there is a problem in the windows installer package
    npm下载模块提速方法
    npm如何删除node_modules文件夹
  • 原文地址:https://www.cnblogs.com/lcchuguo/p/4008352.html
Copyright © 2011-2022 走看看