zoukankan      html  css  js  c++  java
  • 【爬虫】多线程爬取糗事百科写入文件

    '''
    Scrape jokes from qiushibaike.com: collect each post's text and link,
    then write them to a CSV file.
    Techniques used: threading, lock, queue, xpath, csv.
    '''
    
    import csv
    import threading
    from queue import Empty, Queue

    import requests
    from lxml import etree
    
    
    class Creeper(threading.Thread):
        """Crawler thread: pulls listing-page URLs from url_queue, parses each
        page, and pushes (content, link) tuples onto content_queue."""

        def __init__(self, url_queue, content_queue, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.url_queue = url_queue          # input: page URLs to fetch
            self.content_queue = content_queue  # output: (content, link) tuples

        def run(self):
            # get_nowait() + Empty avoids the check-then-get race of the
            # empty()/get() pair: with one URL left, two workers can both see
            # a non-empty queue and one then blocks forever on a plain get().
            while True:
                try:
                    url = self.url_queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        def parse_page(self, url):
            """Fetch one listing page and enqueue every joke's text and link."""
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"}
            response = requests.get(url, headers=headers)
            html = etree.HTML(response.text)
            # NOTE: "contentHerf" is the site's own (misspelled) class name.
            for div in html.xpath('//div[contains(@class,"article block")]'):
                fragments = div.xpath('.//a[@class="contentHerf"]//span[1]//text()')
                # Strip stray newlines inside each fragment, then join
                # fragments one per line.
                content = "\n".join(s.replace("\n", "") for s in fragments)
                link = "https://www.qiushibaike.com" + div.xpath('.//a[@class="contentHerf"]/@href')[0]
                self.content_queue.put((content, link))
    
    class SaveFile(threading.Thread):
        """Saver thread: drains (content, link) tuples from content_queue and
        appends them as rows via the shared csv writer."""

        def __init__(self, content_queue, writer, lock, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.content_queue = content_queue
            self.writer = writer  # shared csv.writer — guarded by self.lock
            self.lock = lock

        def run(self):
            while True:
                try:
                    # Exit after 30 s with no new items (producers are done).
                    content, link = self.content_queue.get(timeout=30)
                except Empty:
                    break
                # `with` guarantees the lock is released even if writerow
                # raises; the old bare acquire()/release() pair would leave
                # the lock held and deadlock the sibling saver thread. The
                # old bare `except:` also silently swallowed write errors.
                with self.lock:
                    self.writer.writerow((content, link))
                print('保存一条')
    
    
    def main():
        """Seed the 12 listing-page URLs, start 2 crawler and 2 saver threads,
        and write the scraped (content, link) rows to 糗事百科.csv."""
        url_queue = Queue(100)
        content_queue = Queue(300)
        base_url = "https://www.qiushibaike.com/text/page/{}/"
        gLock = threading.Lock()

        # Seed pages 1..12 before starting workers so crawlers never see a
        # transiently empty queue.
        for page in range(1, 13):
            url_queue.put(base_url.format(page))

        # utf-8-sig writes a BOM so Excel decodes the Chinese text correctly.
        # `with` guarantees the file is flushed and closed; the original left
        # the handle open with no explicit close.
        with open('糗事百科.csv', 'a', encoding='utf-8-sig', newline="") as f:
            writer = csv.writer(f)
            writer.writerow(['content', 'link'])

            crawlers = [Creeper(url_queue, content_queue) for _ in range(2)]
            savers = [SaveFile(content_queue, writer, gLock) for _ in range(2)]
            for t in crawlers + savers:
                t.start()
            # Join everything so the file stays open until every row is
            # written (savers exit on their 30 s queue timeout).
            for t in crawlers + savers:
                t.join()


    if __name__ == '__main__':
        main()
  • 相关阅读:
    ios系统滚动穿透
    移动h5 开发遇到ios系统的各种问题汇总
    移动端选择时间时软键盘弹出问题
    ios系统设置-webkit-overflow-scrolling: touch导致z-index 失效 (弹窗层级设置无效)- 替代方案
    npm i 报错 npmERR! code Z_BUF_ERROR errno -5 // 后继 chromedriver.zip 相关问题报错解决
    mysql、orcale、sql server的区别
    jsp中的select选择
    sql面试
    java面试题
    struts2总结
  • 原文地址:https://www.cnblogs.com/st-st/p/10413603.html
Copyright © 2011-2022 走看看