zoukankan      html  css  js  c++  java
  • 爬取豆瓣古典文学(数据库存储)

    杂谈:

    爬取豆瓣读书的古典文学专栏,提取了主要的八个字段(书名、作者、标签、评分、内容简介、作者简介、目录、热门短评),然后将数据保存到sqlite中。

    由于没有使用IP代理,导致爬到差不多第一千本书的时候被豆瓣暂时封了IP。因此接下来就要研究如何使用IP代理了。

     

    爬取数据:

    代码如下:

# coding:utf-8
# pickle: Python 2/3 compatible import (file originally used the py2-only cPickle)
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3

import codecs
import random
import re
import sqlite3
import sys
import time

import requests
from lxml import etree
from pyquery import PyQuery as Q
     12 
     13 class Spider:
     14     def __init__(self):
     15         self.con = sqlite3.connect(r'BookInformation.db')
     16         self.cur = self.con.cursor()
     17         # sql = '''
     18         # CREATE TABLE BookInfo(
     19         #     Name varchar(200),
     20         #     Author varchar(100),
     21         #     Tag text,
     22         #     Rating double,
     23         #     ContentIntro text,
     24         #     AuthorIntro text,
     25         #     Catalogue text,
     26         #     Commentary text )
     27         # '''
     28         # self.cur.execute(sql)
     29         # self.con.commit()
     30         self.home = ''
     31         self.Referer = 'https://book.douban.com/'
     32         self.user_agent_list = []
     33         with open('user_agent.txt', 'rb') as f:
     34             self.user_agent_list = cPickle.load(f)
     35 
     36     def GetHeaders(self):
     37         UserAgent = random.choice(self.user_agent_list)
     38         headers = {'Referer': self.Referer, 'User-Agent': UserAgent}
     39         return headers
     40 
     41     def SaveBook(self,info):
     42         sql = 'INSERT INTO BookInfo VALUES(?,?,?,?,?,?,?,?)'
     43         info_list = (info["Name"],info["Author"],info["Tag"],info["Rating"],info["ContentIntro"],info["AuthorIntro"],info["Catalogue"],info["Commentary"])
     44         self.cur.execute(sql, info_list)
     45         self.con.commit()
     46 
     47     def Crawl(self):
     48         for index in range(0,50):
     49             self.home = 'https://book.douban.com/tag/%E5%8F%A4%E5%85%B8%E6%96%87%E5%AD%A6?start='+str(index*20)+'&type=T'
     50             html = requests.get(self.home, headers=self.GetHeaders()).text
     51             html_tree = etree.HTML(html)
     52             booksList = html_tree.xpath('/html/body/div[3]/div[1]/div/div[1]/div/ul/li')
     53             for book in booksList:
     54                 time.sleep(random.randint(2,5))
     55                 bookUrl = book.xpath('div[2]/h2/a')[0].get('href')
     56                 pageHtml = requests.get(bookUrl,headers=self.GetHeaders()).text
     57                 page_tree = etree.HTML(pageHtml)
     58                 book_info = self.GetPage(bookUrl,page_tree)
     59                 print book_info['Name']
     60                 # self.SaveBook(book_info)
     61         self.con.close()
     62 
     63 
     64     def GetPage(self, page_url,page_tree):
     65         book_info = {}
     66         try:
     67             Name = self.GetName(page_tree)
     68             book_info['Name'] = Name
     69         except:
     70             book_info['Name'] = ''
     71         try:
     72             Author = self.GetAuthor(page_tree)
     73             book_info['Author'] = Author
     74         except:
     75             book_info['Author'] = ''
     76         try:
     77             Rating = self.GetRating(page_tree)
     78             book_info['Rating'] = Rating
     79         except:
     80             book_info['Rating'] = ''
     81         try:
     82             ContentIntro = self.GetContentIntro(page_tree)
     83             book_info['ContentIntro'] = ContentIntro
     84         except:
     85             book_info['ContentIntro'] = ''
     86         try:
     87             AuthorIntro = self.GetAuthorIntro(page_tree)
     88             book_info['AuthorIntro'] = AuthorIntro
     89         except:
     90             book_info['AuthorIntro'] = ''
     91         try:
     92             Catalogue = self.GetCatalogue(page_url,page_tree)
     93             book_info['Catalogue'] = Catalogue
     94         except :
     95             book_info['Catalogue'] = ''
     96         try:
     97             Tag = self.GetTag(page_tree)
     98             book_info['Tag'] = Tag
     99         except :
    100             book_info['Tag'] = ''
    101         try:
    102             Commentary = self.GetCommentary(page_tree)
    103             book_info['Commentary'] = Commentary
    104         except :
    105             book_info['Commentary'] = ''
    106         return book_info
    107 
    108     def GetName(self, page_tree):
    109         return page_tree.xpath('/html/body/div[3]/h1/span')[0].text
    110 
    111     def GetAuthor(self,page_tree):
    112         author_list = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[1]/a')
    113         result = ''
    114         if len(author_list) is not 0:
    115             list = []
    116             for author in author_list:
    117                 list.append(author.text.strip())
    118             result = '/'.join(list)
    119         else:
    120             result = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/a')[0].text.strip()
    121         return re.sub(r's+',' ',result)
    122 
    123 
    124     def GetRating(self, page_tree):
    125         rating = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[2]/div/div[2]/strong')[0].text.strip()
    126         return eval(rating)
    127 
    128     def GetContentIntro(self, page_tree):
    129         para_div = page_tree.xpath('//*[@id="link-report"]//div[@class="intro"]')
    130         result = ''
    131         if len(para_div) is not 0:
    132             para_para = para_div[len(para_div)-1].xpath('p')
    133             for para in para_para:
    134                 result = result+'	'+para.text+'
    '
    135         return result
    136 
    137     def GetAuthorIntro(self, page_tree):
    138         para_div = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[3]/div[@class="indent "]//div[@class="intro"]')
    139         result = ''
    140         if len(para_div) is not 0:
    141             para_para = para_div[-1].xpath('p')
    142             for para in para_para:
    143                 result = result + '	' + para.text + '
    '
    144         return result
    145 
    146     def GetCatalogue(self, page_url, page_tree):
    147         bookid = page_url.split('/')[-2]
    148         result = ''
    149         para_div = page_tree.xpath('//*[@id="dir_'+bookid+'_full"]')
    150         if len(para_div) is 0:
    151             para_div = page_tree.xpath('//*[@id="dir_'+bookid+'_short"]')
    152         if len(para_div) is not 0:
    153                 result = Q(etree.tostring(para_div[0])).text()
    154         return result
    155 
    156     def GetTag(self, page_tree):
    157         list = page_tree.xpath('//*[@id="db-tags-section"]/div[@class="indent"]//a[@class="  tag"]')
    158         result = []
    159         if len(list) is not 0:
    160             for tag in list:
    161                 result.append(tag.text)
    162         return '/'.join(result)
    163 
    164     def GetCommentary(self, page_tree):
    165         list = page_tree.xpath('//*[@class="comment-list hot show"]//p[@class="comment-content"]')
    166         result = ''
    167         num = 0
    168         if len(list) is not 0:
    169             for comment in list:
    170                 num = num+1
    171                 result = result + '	'+str(num)+'.'+comment.text+'
    '
    172         return result
    173 
    174 
    175 if __name__ == '__main__':
    176     s = Spider()
    177     s.Crawl()
  • 相关阅读:
    Nginx配置文件
    SSM三层模型之间的参数传递
    Junit4用法
    常量类的设计
    初识Oracle
    sss
    sss
    sss
    sss
    sss
  • 原文地址:https://www.cnblogs.com/DOLFAMINGO/p/9210568.html
Copyright © 2011-2022 走看看