# requests + xpath + map crawler for Baidu Tieba
# Target content: poster username, post content, post time
# Breakdown:
#   - requests fetches the page
#   - xpath extracts the content
#   - map drives the thread-pool crawler
import requests
from requests.exceptions import RequestException
from lxml import etree
import json
from multiprocessing.dummy import Pool as ThreadPool


def get_html(url):
    """Fetch *url* and return the response body as text, or None on any failure.

    Failures (non-200 status, connection errors, timeouts) are deliberately
    swallowed and reported as None so that one bad page does not abort the
    whole crawl.
    """
    try:
        # A timeout is essential here: without one, a stalled connection
        # would hang a pool worker forever.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_html(html):
    """Parse one Tieba page and yield one dict per post.

    Yields dicts of the form {'author': ..., 'content': ..., 'date': ...}.
    Each post <div> carries a JSON `data-field` attribute holding the
    author/content metadata; the visible post text lives in a child div
    whose id embeds the post id.
    """
    selector = etree.HTML(html)
    data = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    for each in data:
        rs = each.xpath('@data-field')[0]
        rs = json.loads(rs)
        # Default to {} so a missing 'author'/'content' key degrades to
        # None fields instead of raising AttributeError.
        author = rs.get('author', {}).get('user_name')
        # NOTE: this is the post id (it parameterizes the content div's id),
        # not an author id as the original variable name suggested.
        post_id = rs.get('content', {}).get('post_id')
        texts = each.xpath('div/div/cc/div[@id="post_content_%s"]/text()' % post_id)
        if not texts:
            # Image-only or malformed posts have no text node; skip them
            # rather than crashing the whole page with an IndexError.
            continue
        content = texts[0].strip()
        date = rs.get('content', {}).get('date')
        yield {
            'author': author,
            'content': content,
            'date': date,
        }


def save_to_txt(result):
    """Append one parsed post to tieba.txt (UTF-8)."""
    print('正在存储:', result)

    with open('tieba.txt', 'a', encoding='utf-8') as f:
        f.write('回帖作者:' + result['author'] + ' ')
        f.write('回帖内容:' + result['content'] + ' ')
        f.write('回帖时间:' + result['date'] + ' ')
        f.write(' ')


def main(url):
    """Crawl a single page URL: fetch, parse, and persist every post."""
    html = get_html(url)
    if html:
        for result in parse_html(html):
            save_to_txt(result)


if __name__ == '__main__':

    pool = ThreadPool(4)
    base_url = 'http://tieba.baidu.com/p/3522395718?pn='
    # Pages 1..20 of the target thread.
    urls = [base_url + str(page_num) for page_num in range(1, 21)]

    pool.map(main, urls)
    pool.close()
    pool.join()