  • Python Web Scraping (Part 2)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author:Momo time:2018/6/30

"""
    Target site:    http://tieba.baidu.com/p/3522395718
    Target content: reply author, reply content, reply time
    Topics covered:
                Requests for fetching pages
                XPath for extracting content
                map for a multi-threaded crawler
    Skills to master: extracting data from pages with XPath
                      building a multi-threaded crawler with map
"""

from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import urllib.request
import json
# from imp import reload

# # Python 2 only: force the default encoding to utf-8 when saving the
# # copied Tieba source. Not needed in Python 3.
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

def towrite(contentdict):
    # Append one reply record to the globally opened output file.
    f.write(u'Reply time:    ' + str(contentdict['topic_reply_time']) + '\n')
    f.write(u'Reply content: ' + contentdict['topic_reply_content'] + '\n')
    f.write(u'Reply author:  ' + str(contentdict['user_name']) + '\n\n')

def spider(url):
    # Fetch one page of the thread and extract every reply on it.
    html_page = urllib.request.urlopen(url)
    html_code = html_page.read().decode('utf-8')
    selector = etree.HTML(html_code)
    # Each reply lives in a <div class="l_post j_l_post l_post_bright  "> block.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    item = {}
    for each in content_field:
        # The data-field attribute carries a JSON blob with author and date info.
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot', ''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content  clearfix"]/text()')[0]
        reply_time = reply_info['content']['date']
        print(content)
        print(reply_time)
        print(author)
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)

if __name__ == '__main__':
    # multiprocessing.dummy provides a thread pool behind the process-pool API.
    pool = ThreadPool(4)
    f = open('content.txt', 'a', encoding='utf-8')
    page = []
    # Build the URLs for pages 1-20 of the thread.
    for i in range(1, 21):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)

    # Crawl all pages concurrently with 4 worker threads.
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()
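The docstring lists Requests for fetching pages, but the script itself uses urllib.request. As a minimal sketch (assuming the third-party requests library is installed, e.g. via pip install requests), the fetch-and-parse step for a single page could instead look like this; the URL and XPath expressions simply mirror spider() above:

# Hedged sketch: fetch one Tieba thread page with requests instead of urllib.
import json
import requests
from lxml import etree

def fetch_replies(url):
    resp = requests.get(url, timeout=10)   # fetch the page
    resp.encoding = 'utf-8'                # Tieba pages are utf-8 encoded
    selector = etree.HTML(resp.text)
    replies = []
    # Same reply container and data-field attribute as in spider() above.
    for each in selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]'):
        info = json.loads(each.xpath('@data-field')[0])
        replies.append({
            'user_name': info['author']['user_name'],
            'reply_time': info['content']['date'],
        })
    return replies

if __name__ == '__main__':
    for r in fetch_replies('http://tieba.baidu.com/p/3522395718?pn=1'):
        print(r['user_name'], r['reply_time'])

Note that the "multi-threading" in the original script comes from multiprocessing.dummy, which exposes the process-pool API but actually runs threads; for an I/O-bound crawler like this one, a small thread pool is usually sufficient.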
  • Original article: https://www.cnblogs.com/momo072994MLIA/p/9249637.html