zoukankan      html  css  js  c++  java
  • 爬虫_古诗文网(队列,多线程,锁,正则,xpath)

     
     1 import requests
     2 from queue import Queue
     3 import threading
     4 from lxml import etree
     5 import re
     6 import csv
     7 
     8 
     9 class Producer(threading.Thread):
    10     headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'}
    11     def __init__(self, page_queue, poem_queue, *args, **kwargs):
    12         super(Producer, self).__init__(*args, **kwargs)
    13         self.page_queue = page_queue
    14         self.poem_queue = poem_queue
    15 
    16 
    17     def run(self):
    18         while True:
    19             if self.page_queue.empty():
    20                 break
    21             url = self.page_queue.get()
    22             self.parse_html(url)
    23 
    24 
    25     def parse_html(self, url):
    26         # poems = []
    27         headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'}
    28         response = requests.get(url, headers=headers)
    29         response.raise_for_status()
    30         html = response.text
    31         html_element = etree.HTML(html)
    32         titles = html_element.xpath('//div[@class="cont"]//b/text()')
    33         contents = html_element.xpath('//div[@class="contson"]')
    34         hrefs = html_element.xpath('//div[@class="cont"]/p[1]/a/@href')
    35         for index, content in enumerate(contents):
    36             title = titles[index]
    37             content = etree.tostring(content, encoding='utf-8').decode('utf-8')
    38             content = re.sub(r'<.*?>|
    |', '', content)
    39             content = re.sub(r'u3000u3000', '', content)
    40             content = content.strip()
    41             href = hrefs[index]
    42             self.poem_queue.put((title, content, href))
    43 
    44 
    45 class Consumer(threading.Thread):
    46 
    47     def __init__(self, poem_queue, writer, gLock, *args, **kwargs):
    48         super(Consumer, self).__init__(*args, **kwargs)
    49         self.writer = writer
    50         self.poem_queue = poem_queue
    51         self.lock = gLock
    52 
    53     def run(self):
    54         while True:
    55             try:
    56                 title, content, href = self.poem_queue.get(timeout=20)
    57                 self.lock.acquire()
    58                 self.writer.writerow((title, content, href))
    59                 self.lock.release()
    60             except:
    61                 break
    62 
    63 
    64 def main():
    65     page_queue = Queue(100)
    66     poem_queue = Queue(500)
    67     gLock = threading.Lock()
    68     fp = open('poem.csv', 'a',newline='', encoding='utf-8')
    69     writer = csv.writer(fp)
    70     writer.writerow(('title', 'content', 'href'))
    71    
    72 
    73     for x in range(1, 100):
    74         url = 'https://www.gushiwen.org/shiwen/default.aspx?page=%d&type=0&id=0' % x
    75         page_queue.put(url)
    76 
    77     for x in range(5):
    78         t = Producer(page_queue, poem_queue)
    79         t.start()
    80 
    81     for x in range(5):
    82         t = Consumer(poem_queue, writer, gLock)
    83         t.start()
    84 
    85 if __name__ == '__main__':
    86     main()

    运行结果

     

  • 相关阅读:
    Future接口和Callable接口以及FeatureTask详解
    puppet的使用:ERB模板介绍
    puppet的使用:依赖关系整理
    数字证书常见格式整理
    c3p0配置文件
    dockerfile简述
    Grape简介
    keytool和openssl生成的证书转换
    Grape教程-params
    耿丹CS16-2班助教总结
  • 原文地址:https://www.cnblogs.com/MC-Curry/p/9460507.html
Copyright © 2011-2022 走看看