Remarks:
I crawled the Classical Literature tag on Douban Books, extracted the eight main fields of each book (title, author, tags, rating, content introduction, author introduction, table of contents, and short comments), and saved the data to SQLite.
Because I was not using IP proxies, Douban temporarily banned my IP when the crawl reached roughly the thousandth book. The next step is therefore to look into rotating IP proxies.
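For reference, requests can route each request through a proxy via its proxies parameter. Below is a minimal sketch of per-request proxy rotation, not the solution I have settled on yet; get_with_proxy and proxy_pool are names I made up, and the pool addresses are placeholders rather than real proxies.

# coding:utf-8
import random
import requests

# Placeholder proxy pool; in practice these would come from a paid
# proxy service or a validated free-proxy list.
proxy_pool = [
    'http://127.0.0.1:8080',
    'http://127.0.0.1:8081',
]

def get_with_proxy(url, headers=None):
    # Pick a proxy at random so consecutive requests leave from
    # different addresses.
    proxy = random.choice(proxy_pool)
    proxies = {'http': proxy, 'https': proxy}
    # A dead proxy usually surfaces as requests.exceptions.ConnectionError;
    # a real crawler would catch that and retry with another proxy.
    return requests.get(url, headers=headers, proxies=proxies, timeout=10)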
Crawling the data:
The code is as follows:
# coding:utf-8
import cPickle
import random
import requests
from lxml import etree
import time
import re
import sqlite3
from pyquery import PyQuery as Q


class Spider:
    def __init__(self):
        self.con = sqlite3.connect(r'BookInformation.db')
        self.cur = self.con.cursor()
        # The table only needs to be created once:
        # sql = '''
        # CREATE TABLE BookInfo(
        #     Name varchar(200),
        #     Author varchar(100),
        #     Tag text,
        #     Rating double,
        #     ContentIntro text,
        #     AuthorIntro text,
        #     Catalogue text,
        #     Commentary text )
        # '''
        # self.cur.execute(sql)
        # self.con.commit()
        self.home = ''
        self.Referer = 'https://book.douban.com/'
        # user_agent.txt holds a pickled list of User-Agent strings.
        with open('user_agent.txt', 'rb') as f:
            self.user_agent_list = cPickle.load(f)

    def GetHeaders(self):
        # Use a random User-Agent per request to look less like a bot.
        UserAgent = random.choice(self.user_agent_list)
        return {'Referer': self.Referer, 'User-Agent': UserAgent}

    def SaveBook(self, info):
        sql = 'INSERT INTO BookInfo VALUES(?,?,?,?,?,?,?,?)'
        info_list = (info['Name'], info['Author'], info['Tag'], info['Rating'],
                     info['ContentIntro'], info['AuthorIntro'],
                     info['Catalogue'], info['Commentary'])
        self.cur.execute(sql, info_list)
        self.con.commit()

    def Crawl(self):
        # 50 list pages x 20 books per page = 1000 books.
        for index in range(0, 50):
            self.home = ('https://book.douban.com/tag/%E5%8F%A4%E5%85%B8%E6%96%87%E5%AD%A6'
                         '?start=' + str(index * 20) + '&type=T')
            html = requests.get(self.home, headers=self.GetHeaders()).text
            html_tree = etree.HTML(html)
            booksList = html_tree.xpath('/html/body/div[3]/div[1]/div/div[1]/div/ul/li')
            for book in booksList:
                # Sleep 2-5 seconds between detail pages to throttle requests.
                time.sleep(random.randint(2, 5))
                bookUrl = book.xpath('div[2]/h2/a')[0].get('href')
                pageHtml = requests.get(bookUrl, headers=self.GetHeaders()).text
                page_tree = etree.HTML(pageHtml)
                book_info = self.GetPage(bookUrl, page_tree)
                print book_info['Name']
                # self.SaveBook(book_info)  # uncomment to persist each book
        self.con.close()

    def GetPage(self, page_url, page_tree):
        # Extract each field independently: a parse failure in one field
        # must not abort the whole page, so every getter falls back to ''.
        getters = {
            'Name': lambda: self.GetName(page_tree),
            'Author': lambda: self.GetAuthor(page_tree),
            'Rating': lambda: self.GetRating(page_tree),
            'ContentIntro': lambda: self.GetContentIntro(page_tree),
            'AuthorIntro': lambda: self.GetAuthorIntro(page_tree),
            'Catalogue': lambda: self.GetCatalogue(page_url, page_tree),
            'Tag': lambda: self.GetTag(page_tree),
            'Commentary': lambda: self.GetCommentary(page_tree),
        }
        book_info = {}
        for field, getter in getters.items():
            try:
                book_info[field] = getter()
            except Exception:
                book_info[field] = ''
        return book_info

    def GetName(self, page_tree):
        return page_tree.xpath('/html/body/div[3]/h1/span')[0].text

    def GetAuthor(self, page_tree):
        author_list = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[1]/a')
        if author_list:
            result = '/'.join(author.text.strip() for author in author_list)
        else:
            result = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/a')[0].text.strip()
        # Collapse line breaks and runs of whitespace inside the name.
        return re.sub(r'\s+', ' ', result)

    def GetRating(self, page_tree):
        rating = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[2]/div/div[2]/strong')[0].text.strip()
        return float(rating)

    def GetContentIntro(self, page_tree):
        # When both a short and a full introduction are present, the full
        # one is the last div.intro under #link-report.
        para_div = page_tree.xpath('//*[@id="link-report"]//div[@class="intro"]')
        result = ''
        if para_div:
            for para in para_div[-1].xpath('p'):
                result = result + ' ' + (para.text or '') + ' '
        return result

    def GetAuthorIntro(self, page_tree):
        para_div = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[3]/div[@class="indent "]//div[@class="intro"]')
        result = ''
        if para_div:
            for para in para_div[-1].xpath('p'):
                result = result + ' ' + (para.text or '') + ' '
        return result

    def GetCatalogue(self, page_url, page_tree):
        # The table of contents lives in an element whose id embeds the
        # book id (dir_<id>_full); fall back to the short version.
        bookid = page_url.split('/')[-2]
        result = ''
        para_div = page_tree.xpath('//*[@id="dir_' + bookid + '_full"]')
        if not para_div:
            para_div = page_tree.xpath('//*[@id="dir_' + bookid + '_short"]')
        if para_div:
            result = Q(etree.tostring(para_div[0])).text()
        return result

    def GetTag(self, page_tree):
        # Note the leading space in class=" tag": that is how Douban
        # renders the tag links.
        tag_list = page_tree.xpath('//*[@id="db-tags-section"]/div[@class="indent"]//a[@class=" tag"]')
        return '/'.join(tag.text for tag in tag_list)

    def GetCommentary(self, page_tree):
        comments = page_tree.xpath('//*[@class="comment-list hot show"]//p[@class="comment-content"]')
        result = ''
        for num, comment in enumerate(comments, 1):
            result = result + ' ' + str(num) + '.' + (comment.text or '') + ' '
        return result


if __name__ == '__main__':
    s = Spider()
    s.Crawl()
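Two small companion snippets may help when reproducing this. First, the spider expects user_agent.txt to contain a pickled Python list of User-Agent strings; a minimal sketch of how such a file could be built (the two User-Agent strings below are just examples):

# coding:utf-8
import cPickle

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 '
    '(KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
]

# Dump the list in the pickled form that Spider.__init__ loads.
with open('user_agent.txt', 'wb') as f:
    cPickle.dump(user_agents, f)

Second, a quick sanity check on the resulting database, assuming the SaveBook call in Crawl has been uncommented so rows were actually written:

# coding:utf-8
import sqlite3

con = sqlite3.connect('BookInformation.db')
cur = con.cursor()
# How many books were saved?
cur.execute('SELECT COUNT(*) FROM BookInfo')
print cur.fetchone()[0]
# The ten highest-rated books; failed parses store Rating as '',
# so filter those out before sorting.
cur.execute("SELECT Name, Rating FROM BookInfo WHERE Rating != '' "
            "ORDER BY Rating DESC LIMIT 10")
for name, rating in cur.fetchall():
    print name, rating
con.close()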