  • Scraping dynamic data from Baidu Baike (vote counts, page views, etc.) with Python

    Use Selenium to drive a real browser to open each page, then scrape the data once it has loaded.

    #!/usr/bin/env python
    # coding=utf-8
    
    import urllib2
    import re
    from bs4 import BeautifulSoup
    from selenium import webdriver
    import time 
    
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class BaikeSpider():
    
        def __init__(self):
            self.queue = ["http://baike.baidu.com/view/8095.htm",
                          "http://baike.baidu.com/view/2227.htm"]
            self.base = "http://baike.baidu.com"
            self.crawled = set()
            self.crawled_word = set()
    
    #        client = MongoClient("localhost",27017)
    #        self.db = client["baike_db"]["html"]
    
        def crawl(self):
            browser = webdriver.Chrome()
            cnt = 0
            fw = open('./baike_keywords.txt','wb')
            while self.queue:
                url = self.queue.pop(0)
                if url in self.crawled :
                    continue
                self.crawled.add(url)
                try:
                    browser.get(url)
                    res = {}
                    # Fetch the static HTML separately with urllib2 just to harvest
                    # outgoing links; the Selenium page is used for the dynamic counters.
                    links = BeautifulSoup(urllib2.urlopen(url).read(), 'lxml').find_all("a")
                    links = list(set(links))
                    # Collect further Baike entry links from the page; use a separate
                    # name (href) so the page URL stored in `url` is not overwritten.
                    for link in links:
                        if 'href' not in dict(link.attrs) or re.search(u"javascript", link['href']) or len(link['href']) < 8:
                            continue
                        href = link['href']
                        if re.search(u"baike\.baidu\.com/view/\d+|baike\.baidu\.com/subview/\d+/\d+\.htm", href) and href not in self.crawled:
                            self.queue.append(href)
                        elif re.match(u"view/\d+", href):
                            href = self.base + href
                            if href not in self.crawled:
                                self.queue.append(href)
                    
                    cnt += 1
                    print cnt
                    if cnt % 10 == 0:
                        print 'queue', len(self.queue)
                        # Close and reopen the output file so results written so far
                        # are flushed to disk.
                        fw.close()
                        fw = open('./baike_keywords.txt', 'a+')
                
                    res['url'] = url
                    # Page titles end with "_百度百科"; keep only the entry name.
                    res['title'] = browser.title.split(u"_")[0]

                    if res['title'] in self.crawled_word:
                        print 'title', res['title'], 'has already been crawled'
                        continue
                    
                    # The vote and view counters are rendered by JavaScript, so they
                    # must be read from the live browser page, not the static HTML.
                    vote = browser.find_element_by_class_name("vote-count")
                    view = browser.find_element_by_id("j-lemmaStatistics-pv")

                    res['voted'] = vote.text
                    res['viewed'] = view.text
                    
                    line = []
                    line.append(res['title'])
                    line.append(res['viewed'])
                    line.append(res['voted'])
                    line.append(res['url'])
                    
                    # One tab-separated record per entry: title, views, votes, url.
                    line = '\t'.join(line)
                    fw.write(line + '\n')
                    self.crawled_word.add(res["title"])
    
                except Exception as e:
                    print e
                    continue
    
    
    if __name__=='__main__':
        test = BaikeSpider()
        test.crawl()
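
    The script reads the vote and view counters immediately after browser.get() returns; if those counters are filled in by JavaScript a moment after the initial load, an explicit wait is more robust than reading them right away. A minimal sketch using Selenium's WebDriverWait (the 10-second timeout is an assumption; the selectors are the same ones used above):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    browser = webdriver.Chrome()
    browser.get("http://baike.baidu.com/view/8095.htm")

    # Block until the dynamically rendered counters appear (up to 10 seconds).
    wait = WebDriverWait(browser, 10)
    vote = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "vote-count")))
    view = wait.until(EC.presence_of_element_located((By.ID, "j-lemmaStatistics-pv")))
    print vote.text, view.text

    browser.quit()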

    Note: pages load faster with Chrome than with Firefox, and Chrome produces fewer errors and abnormal exits.
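
    If opening a visible browser window for every page is too heavy, Chrome can also be run headless. A minimal sketch, assuming a Chrome/chromedriver build recent enough to support the --headless flag (this option is not part of the original script):

    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')     # run Chrome without opening a window
    options.add_argument('--disable-gpu')  # recommended on some platforms
    # Older Selenium versions take chrome_options=, newer ones take options=.
    browser = webdriver.Chrome(chrome_options=options)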

    A small step every day is a big step in life! Good luck~
  • Original article: https://www.cnblogs.com/jkmiao/p/5073699.html