  • Crawler Learning (Part 17): Multi-threaded Data Crawling Example

    import json
    from queue import Queue
    from threading import Thread

    import requests
    from lxml import etree

    # The page-number placeholder is filled in with % formatting.
    url = 'https://www.qiushibaike.com/text/page/%d/'

    # Queue of page numbers to crawl, and queue of downloaded HTML waiting to be parsed.
    queue_url = Queue(13)
    queue_html = Queue(13)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9'}

    # Set by the main thread once all pages have been crawled and parsed;
    # the parse threads poll it to know when to exit.
    exitFlag = False

    class CrawlThread(Thread):
        """Producer thread: downloads pages and puts the raw HTML onto queue_html."""

        def __init__(self, queue, thread_id):
            super().__init__()
            self.queue = queue
            self.thread_id = thread_id

        def run(self):
            print('------------- Crawl thread %d started -------------' % self.thread_id)
            self.get_html()
            print('------------- Crawl thread %d finished -------------' % self.thread_id)

        def get_html(self):
            while True:
                # Exit once the page-number queue has been drained.
                if self.queue.empty():
                    break
                try:
                    page = self.queue.get(block=False)
                    response = requests.get(url=url % page, headers=headers)
                    response.encoding = 'utf-8'
                    html = response.text
                    # Hand the raw HTML (with its page number) over to the parse threads.
                    queue_html.put((html, page))
                    self.queue.task_done()
                    print('---------------- Crawl thread %d fetched page %d ----------------' % (self.thread_id, page))
                except Exception:
                    pass

    class ParseThread(Thread):
        """Consumer thread: pulls HTML off queue_html, extracts posts with XPath, writes JSON."""

        def __init__(self, queue, thread_id, fp):
            super().__init__()
            self.queue = queue
            self.thread_id = thread_id
            self.fp = fp

        def run(self):
            print('------------- Parse thread %d started -------------' % self.thread_id)
            self.parse_html()
            print('------------- Parse thread %d finished -------------' % self.thread_id)

        def parse_html(self):
            while True:
                # The main thread sets exitFlag after both queues have been fully processed.
                if exitFlag:
                    break
                try:
                    html, page = self.queue.get(block=False)
                    tree = etree.HTML(html)
                    # Each post sits in a div whose id starts with "qiushi_tag_".
                    divs = tree.xpath('//div[contains(@id,"qiushi_tag_")]')
                    for div in divs:
                        try:
                            # Post text
                            content = div.xpath('.//div[@class="content"]/span/text()')[0].strip()
                            # Upvote count
                            zan = div.xpath('.//span[@class="stats-vote"]/i/text()')[0].strip()
                            # Comment count
                            comment = div.xpath('.//span[@class="stats-comments"]//i/text()')[0].strip()
                            # Author
                            author = div.xpath('.//div[@class="author clearfix"]//h2/text()')[0].strip()
                            item = {
                                'author': author,
                                'zan': zan,
                                'comment': comment,
                                'content': content,
                            }
                            json.dump(item, self.fp, ensure_ascii=False)
                            # One record per line so the file can be read back as JSON Lines.
                            self.fp.write('\n')
                        except Exception:
                            print('---------- Parse error on page %d ----------' % page)
                    print('------------ Parse thread %d parsed page %d ------------' % (self.thread_id, page))
                    self.queue.task_done()
                except Exception:
                    pass

    if __name__ == '__main__':
        # Fill the URL queue with the page numbers to crawl.
        for i in range(13):
            queue_url.put(i + 1)

        # Start the crawl (producer) threads.
        for i in range(5):
            t = CrawlThread(queue_url, i)
            t.start()

        fp = open('./糗事百科.txt', mode='a', encoding='utf-8')
        # Start the parse (consumer) threads.
        for i in range(3):
            t = ParseThread(queue_html, i, fp)
            t.start()

        # join() blocks until every task put onto the queue has been marked done,
        # so nothing past this point runs until all pages are crawled and parsed.
        queue_url.join()
        queue_html.join()
        exitFlag = True

        fp.close()
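
    The script above is a producer-consumer pipeline built on queue.Queue: the crawl threads put() work onto queue_html, the parse threads get() it and call task_done(), and the main thread's join() calls only return once every item that was put has been marked done. Below is a minimal, self-contained sketch of that same pattern on its own (the names work_queue and worker are illustrative, not from the original post); it stops the workers with a None sentinel, which is an alternative to the exitFlag plus non-blocking get() used above.

        import queue
        import threading

        work_queue = queue.Queue()

        def worker():
            while True:
                item = work_queue.get()      # blocks until an item is available
                if item is None:             # sentinel: tells this worker to exit
                    work_queue.task_done()
                    break
                print('processed', item)
                work_queue.task_done()       # exactly one task_done() per get()

        threads = [threading.Thread(target=worker) for _ in range(3)]
        for t in threads:
            t.start()

        for n in range(10):
            work_queue.put(n)

        work_queue.join()                    # returns once every put() item is task_done()

        for _ in threads:                    # one sentinel per worker so all of them exit
            work_queue.put(None)
        for t in threads:
            t.join()

    Either shutdown style works: the sentinel version avoids busy-polling an exit flag, while the flag version lets workers drain the queue with get(block=False) as in the post.
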
  • Original article: https://www.cnblogs.com/kuangkuangduangduang/p/10415633.html