  • Python Web Scraping (Part 2)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author:Momo time:2018/6/30

"""
    Target site:    http://tieba.baidu.com/p/3522395718
    Target content: reply author, reply content, reply time
    Topics covered:
                Requests for fetching pages
                XPath for extracting content
                map for a multi-threaded crawler
    Skills to master: extracting data from pages with XPath
                      building a multi-threaded crawler with map
"""

from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import urllib.request
import json
# from imp import reload

# # Python 2 only: force the default encoding to utf-8 when saving the
# # copied Tieba source. Not needed in Python 3.
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

def towrite(contentdict):
    # Append one reply record to the globally opened output file.
    f.write(u'Reply time:    ' + str(contentdict['topic_reply_time']) + '\n')
    f.write(u'Reply content: ' + contentdict['topic_reply_content'] + '\n')
    f.write(u'Reply author:  ' + str(contentdict['user_name']) + '\n\n')

def spider(url):
    # Fetch one page of the thread and extract every reply on it.
    html_page = urllib.request.urlopen(url)
    html_code = html_page.read().decode('utf-8')
    selector = etree.HTML(html_code)
    # Each reply lives in a <div class="l_post j_l_post l_post_bright  "> block.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    item = {}
    for each in content_field:
        # The data-field attribute carries a JSON blob with author and date info.
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot', ''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content  clearfix"]/text()')[0]
        reply_time = reply_info['content']['date']
        print(content)
        print(reply_time)
        print(author)
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)

if __name__ == '__main__':
    # multiprocessing.dummy provides a thread pool behind the process-pool API.
    pool = ThreadPool(4)
    f = open('content.txt', 'a', encoding='utf-8')
    page = []
    # Build the URLs for pages 1-20 of the thread.
    for i in range(1, 21):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)

    # Crawl all pages concurrently with 4 worker threads.
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()
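The docstring lists Requests for fetching pages, but the script itself uses urllib.request. As a minimal sketch (assuming the third-party requests library is installed, e.g. via pip install requests), the fetch-and-parse step for a single page could instead look like this; the URL and XPath expressions simply mirror spider() above:

# Hedged sketch: fetch one Tieba thread page with requests instead of urllib.
import json
import requests
from lxml import etree

def fetch_replies(url):
    resp = requests.get(url, timeout=10)   # fetch the page
    resp.encoding = 'utf-8'                # Tieba pages are utf-8 encoded
    selector = etree.HTML(resp.text)
    replies = []
    # Same reply container and data-field attribute as in spider() above.
    for each in selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]'):
        info = json.loads(each.xpath('@data-field')[0])
        replies.append({
            'user_name': info['author']['user_name'],
            'reply_time': info['content']['date'],
        })
    return replies

if __name__ == '__main__':
    for r in fetch_replies('http://tieba.baidu.com/p/3522395718?pn=1'):
        print(r['user_name'], r['reply_time'])

Note that the "multi-threading" in the original script comes from multiprocessing.dummy, which exposes the process-pool API but actually runs threads; for an I/O-bound crawler like this one, a small thread pool is usually sufficient.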
  • Original article: https://www.cnblogs.com/momo072994MLIA/p/9249637.html