  • Scraping dynamic data from Baidu Baike (vote counts, page views, etc.) with Python

    Use Selenium to drive a real browser to open each page, then scrape the data once it has loaded.

    #!/usr/bin/env python
    # coding=utf-8
    
    import urllib2
    import re
    from bs4 import BeautifulSoup
    from selenium import webdriver
    import time 
    
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class BaikeSpider():
    
        def __init__(self):
            self.queue = ["http://baike.baidu.com/view/8095.htm",
                          "http://baike.baidu.com/view/2227.htm"]
            self.base = "http://baike.baidu.com"
            self.crawled = set()
            self.crawled_word = set()
    
    #        client = MongoClient("localhost",27017)
    #        self.db = client["baike_db"]["html"]
    
        def crawl(self):
            browser = webdriver.Chrome()
            cnt = 0
            fw = open('./baike_keywords.txt','wb')
            while self.queue:
                url = self.queue.pop(0)
                if url in self.crawled :
                    continue
                self.crawled.add(url)
                try:
                    browser.get(url)
                    res = {}
                    # Fetch the static HTML separately with urllib2 just to harvest
                    # outgoing links; the Selenium page is used for the dynamic counters.
                    links = BeautifulSoup(urllib2.urlopen(url).read(), 'lxml').find_all("a")
                    links = list(set(links))
                    # Collect further Baike entry links from the page; use a separate
                    # name (href) so the page URL stored in `url` is not overwritten.
                    for link in links:
                        if 'href' not in dict(link.attrs) or re.search(u"javascript", link['href']) or len(link['href']) < 8:
                            continue
                        href = link['href']
                        if re.search(u"baike\.baidu\.com/view/\d+|baike\.baidu\.com/subview/\d+/\d+\.htm", href) and href not in self.crawled:
                            self.queue.append(href)
                        elif re.match(u"view/\d+", href):
                            href = self.base + href
                            if href not in self.crawled:
                                self.queue.append(href)
                    
                    cnt += 1
                    print cnt
                    if cnt % 10 == 0:
                        print 'queue', len(self.queue)
                        # Close and reopen the output file so results written so far
                        # are flushed to disk.
                        fw.close()
                        fw = open('./baike_keywords.txt', 'a+')
                
                    res['url'] = url
                    # Page titles end with "_百度百科"; keep only the entry name.
                    res['title'] = browser.title.split(u"_")[0]

                    if res['title'] in self.crawled_word:
                        print 'title', res['title'], 'has already been crawled'
                        continue
                    
                    # The vote and view counters are rendered by JavaScript, so they
                    # must be read from the live browser page, not the static HTML.
                    vote = browser.find_element_by_class_name("vote-count")
                    view = browser.find_element_by_id("j-lemmaStatistics-pv")

                    res['voted'] = vote.text
                    res['viewed'] = view.text
                    
                    line = []
                    line.append(res['title'])
                    line.append(res['viewed'])
                    line.append(res['voted'])
                    line.append(res['url'])
                    
                    # One tab-separated record per entry: title, views, votes, url.
                    line = '\t'.join(line)
                    fw.write(line + '\n')
                    self.crawled_word.add(res["title"])
    
                except Exception as e:
                    print e
                    continue
    
    
    if __name__=='__main__':
        test = BaikeSpider()
        test.crawl()
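
    The script reads the vote and view counters immediately after browser.get() returns; if those counters are filled in by JavaScript a moment after the initial load, an explicit wait is more robust than reading them right away. A minimal sketch using Selenium's WebDriverWait (the 10-second timeout is an assumption; the selectors are the same ones used above):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    browser = webdriver.Chrome()
    browser.get("http://baike.baidu.com/view/8095.htm")

    # Block until the dynamically rendered counters appear (up to 10 seconds).
    wait = WebDriverWait(browser, 10)
    vote = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "vote-count")))
    view = wait.until(EC.presence_of_element_located((By.ID, "j-lemmaStatistics-pv")))
    print vote.text, view.text

    browser.quit()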

    Note: pages load faster with Chrome than with Firefox, and Chrome produces fewer errors and abnormal exits.
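
    If opening a visible browser window for every page is too heavy, Chrome can also be run headless. A minimal sketch, assuming a Chrome/chromedriver build recent enough to support the --headless flag (this option is not part of the original script):

    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')     # run Chrome without opening a window
    options.add_argument('--disable-gpu')  # recommended on some platforms
    # Older Selenium versions take chrome_options=, newer ones take options=.
    browser = webdriver.Chrome(chrome_options=options)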

    A small step every day is a big step in life! Good luck~
  • Original article: https://www.cnblogs.com/jkmiao/p/5073699.html