zoukankan      html  css  js  c++  java
  • 【爬虫】多线程爬取糗事百科写入文件

'''
Scrape jokes from qiushibaike.com: collect each post's text and link,
and write them to a CSV file.
Techniques used: multithreading, locks, queues, xpath, csv.
'''
    
import csv
import threading
from queue import Empty, Queue

import requests
from lxml import etree
    
    
    class Creeper(threading.Thread):
        def __init__(self,url_queue,content_queue,*args,**kwargs):
            super().__init__(*args,**kwargs)
            self.url_queue = url_queue
            self.content_queue = content_queue
    
        def run(self):
            while True:
                if self.url_queue.empty():
                    break
                url = self.url_queue.get()
                self.parse_page(url)
    
        def parse_page(self,url):
            headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"}
            response = requests.get(url,headers=headers)
            text = etree.HTML(response.text)
            divEle = text.xpath('//div[contains(@class,"article block")]')
            for div in divEle:
                content = div.xpath('.//a[@class="contentHerf"]//span[1]//text()')
                new_content = "
    ".join(list(map(lambda x:x.replace('
    ',''),content)))
                a_url = "https://www.qiushibaike.com" + div.xpath('.//a[@class="contentHerf"]/@href')[0]
                self.content_queue.put((new_content,a_url))
    
    class SaveFile(threading.Thread):
        def __init__(self,content_queue,writer,lock,*args,**kwargs):
            super().__init__(*args,**kwargs)
            self.content_queue = content_queue
            self.writer = writer
            self.lock = lock
    
        def run(self):
            while True:
                try:
                    content,link = self.content_queue.get(timeout=30)       # 设置超时时间
                    # 写入文件必须加锁
                    self.lock.acquire()
                    self.writer.writerow((content,link))
                    self.lock.release()
                    print('保存一条')
                except:
                    break
    
    
    def main():
        url_queue = Queue(100)
        content_queue = Queue(300)
        base_url = "https://www.qiushibaike.com/text/page/{}/"
        gLock = threading.Lock()
        # 解决写入中文乱码
        f = open('糗事百科.csv','a',encoding='utf-8-sig',newline="")
        header = ['content','link']
        writer = csv.writer(f)
        writer.writerow(header)
    
        for i in range(1,13):
            url = base_url.format(i)
            url_queue.put(url)
    
        for i in range(2):
            c = Creeper(url_queue, content_queue)
            c.start()
    
        for i in range(2):
            s = SaveFile(content_queue,writer,gLock)
            s.start()
    
    
    
    if __name__ == '__main__':
        main()
  • 相关阅读:
    PAT (Basic Level) Practice (中文)1002 写出这个数 (20 分)
    PAT (Advanced Level) Practice 1001 A+B Format (20 分)
    BP神经网络(原理及MATLAB实现)
    问题 1676: 算法2-8~2-11:链表的基本操作
    问题 1744: 畅通工程 (并查集)
    链表的基本操作(创建链表,查询元素,删除元素,插入元素等)
    问题 1690: 算法4-7:KMP算法中的模式串移动数组
    问题 1923: [蓝桥杯][算法提高VIP]学霸的迷宫 (BFS)
    Hdu1372 Knight Moves (BFS)
    Problem 2285 迷宫寻宝 (BFS)
  • 原文地址:https://www.cnblogs.com/st-st/p/10413603.html
Copyright © 2011-2022 走看看