zoukankan      html  css  js  c++  java
  • 【爬虫】多线程爬取糗事百科写入文件

    '''
    爬取糗事百科的段子,将内容和链接爬取下来,写入csv
    使用技术:多线程,锁,队列,xpath,csv
    '''
    
    import csv
    import threading
    from queue import Empty, Queue

    import requests
    from lxml import etree
    
    
    class Creeper(threading.Thread):
        """Worker thread that downloads listing pages and extracts jokes.

        URLs are taken from ``url_queue``; for every article found on a page a
        ``(content, link)`` tuple is put on ``content_queue``.
        """

        def __init__(self, url_queue, content_queue, *args, **kwargs):
            """Store the two queues; remaining arguments go to ``Thread``."""
            super().__init__(*args, **kwargs)
            self.url_queue = url_queue
            self.content_queue = content_queue

        def run(self):
            """Process URLs until ``url_queue`` is drained, then return."""
            while True:
                # A separate empty() check followed by a blocking get() races
                # with the other workers: one of them could take the last URL
                # in between and leave this thread blocked forever.
                try:
                    url = self.url_queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        @staticmethod
        def _join_fragments(fragments):
            """Strip newlines inside each text fragment, then join with newlines."""
            return "\n".join(fragment.replace("\n", "") for fragment in fragments)

        def parse_page(self, url):
            """Fetch ``url`` and enqueue one ``(content, link)`` tuple per article.

            A page that cannot be downloaded is reported and skipped so that a
            single network failure does not kill the worker thread.
            """
            headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"}
            try:
                # Without a timeout a stalled connection would hang the thread.
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.RequestException as exc:
                print("failed to fetch {}: {}".format(url, exc))
                return

            tree = etree.HTML(response.text)
            if tree is None:
                # etree.HTML returns None when there is nothing to parse.
                return

            for div in tree.xpath('//div[contains(@class,"article block")]'):
                hrefs = div.xpath('.//a[@class="contentHerf"]/@href')
                if not hrefs:
                    # No link to the full article: nothing useful to record.
                    continue
                fragments = div.xpath('.//a[@class="contentHerf"]//span[1]//text()')
                new_content = self._join_fragments(fragments)
                a_url = "https://www.qiushibaike.com" + hrefs[0]
                self.content_queue.put((new_content, a_url))
    
    class SaveFile(threading.Thread):
        """Worker thread that writes scraped rows to a shared CSV writer.

        Rows are ``(content, link)`` tuples taken from ``content_queue``.
        ``lock`` serialises access to ``writer``, which is shared between
        several ``SaveFile`` threads.
        """

        def __init__(self, content_queue, writer, lock, *args, timeout=30, **kwargs):
            """Store the collaborators; remaining arguments go to ``Thread``.

            ``timeout`` is the number of seconds to wait for a new row before
            the thread assumes that no more data is coming and stops.
            """
            super().__init__(*args, **kwargs)
            self.content_queue = content_queue
            self.writer = writer
            self.lock = lock
            self.timeout = timeout

        def run(self):
            """Write rows until the queue stays empty for ``timeout`` seconds."""
            while True:
                # Only the queue timeout ends the loop; any other error is a
                # real failure and must not be silently swallowed.
                try:
                    content, link = self.content_queue.get(timeout=self.timeout)
                except Empty:
                    break
                # The writer is shared between threads. The ``with`` form
                # releases the lock even when writerow() raises, so a failing
                # write cannot deadlock the other saver thread.
                with self.lock:
                    self.writer.writerow((content, link))
                print('保存一条')
    
    
    def main():
        """Scrape listing pages 1-12 and append the jokes to a CSV file.

        Two ``Creeper`` threads download and parse the pages while two
        ``SaveFile`` threads write the extracted rows to ``糗事百科.csv``.
        The function returns once every worker has finished; the saver threads
        only stop after their queue timeout expires with no new rows.
        """
        url_queue = Queue(100)
        content_queue = Queue(300)
        base_url = "https://www.qiushibaike.com/text/page/{}/"
        lock = threading.Lock()

        # utf-8-sig writes a BOM so that spreadsheet programs detect the
        # encoding and display the Chinese text correctly.
        with open('糗事百科.csv','a',encoding='utf-8-sig',newline="") as f:
            writer = csv.writer(f)
            # The file is opened in append mode: write the header only into a
            # new (empty) file, otherwise every run adds another header row.
            if f.tell() == 0:
                writer.writerow(['content','link'])

            for page in range(1,13):
                url_queue.put(base_url.format(page))

            workers = [Creeper(url_queue, content_queue) for _ in range(2)]
            workers += [SaveFile(content_queue, writer, lock) for _ in range(2)]
            for worker in workers:
                worker.start()
            # Wait for all threads inside the ``with`` block so the file is
            # flushed and closed only after the last row has been written.
            for worker in workers:
                worker.join()
    
    
    
    # Run the scraper only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    战争迷雾Fog Of War
    [UE4]运行时UMG组件跟随鼠标的逻辑:拖拽UMG组件(蓝图)
    [UE4]FString常用API
    用PNG作为Texture创建Material
    [UE4]C++代码操作SplineMesh
    [UE4]Visual Studio的相关插件安装:UE4.natvis和UnrealVS Extension
    TSubobjectPtr和C++传统指针的区别
    组件Slate教程 & UMG widget构造初始化函数中获取其内部组件
    设置UMG的ComboBox(String)字体大小
    UMG设置组件自适应居中或靠边
  • 原文地址:https://www.cnblogs.com/st-st/p/10413603.html
Copyright © 2011-2022 走看看