zoukankan      html  css  js  c++  java
  • 爬虫----配合多线程的思路

    from pyquery import PyQuery as pq
    import os
    from queue import Queue
    from threading import Thread
    class txtparser(Thread):
        """Worker thread that turns saved chapter HTML pages into text files.

        Each worker pulls HTML file paths from a shared queue, extracts the
        chapter title and body with pyquery, and writes the body (GBK-encoded)
        to ``<parent folder name>/<title>.txt`` relative to the current
        working directory.
        """

        def __init__(self, queue):
            """Store the shared work queue.

            Args:
                queue: Queue of HTML file paths. A ``None`` item tells the
                    worker to stop.
            """
            super().__init__()
            self.queue = queue

        def run(self):
            """Process queued paths until a ``None`` sentinel is received.

            ``task_done()`` is called for every item taken from the queue,
            whether it succeeded or failed, so that ``Queue.join()`` in the
            launcher can return.
            """
            while True:
                content = self.queue.get()
                try:
                    if content is None:
                        return
                    self._convert(content)
                except Exception as exc:
                    # Worker boundary: one unreadable or malformed page must
                    # not kill the thread or leave the queue item unfinished.
                    print("处理失败======", content, repr(exc))
                finally:
                    self.queue.task_done()

        @staticmethod
        def _read_html(path):
            """Return the text of *path*, trying UTF-8 before the locale default."""
            try:
                with open(path, "r", encoding="utf-8") as reader:
                    return reader.read()
            except UnicodeDecodeError:
                # Not UTF-8: retry with the platform's default encoding.
                with open(path, "r") as reader:
                    return reader.read()

        def _convert(self, content):
            """Extract title and body from one HTML file and write the .txt file.

            Args:
                content: Path of the HTML file. The name of its parent folder
                    becomes the output folder name.
            """
            doc = pq(self._read_html(content))
            title = doc("#main .content_read .box_con .bookname h1").text()
            print("标题=====", title)
            passage = doc("#content").text()
            print("正文======", passage.replace("<br/>", ""))

            # Output folder is named after the folder that holds the HTML file.
            clipname = os.path.basename(os.path.dirname(content))
            if clipname:
                # exist_ok avoids a race between workers creating the same folder.
                os.makedirs(clipname, exist_ok=True)

            target = os.path.join(clipname, title + ".txt")
            try:
                with open(target, "w", encoding="gbk") as writer:
                    writer.write(passage)
                print("完成{}的写入".format(target))
            except (OSError, UnicodeError):
                # Record the chapter that could not be written and carry on.
                with open("errorecorder.log", "a", encoding="utf-8") as writer:
                    writer.write(target + "\n")
            print("文件夹名称======", clipname)
    
    def launchtxtparser(parentdir, worker_count=10):
        """Queue every file in each sub-folder of *parentdir* and convert them.

        Args:
            parentdir: Root directory whose immediate sub-folders hold the
                saved HTML pages. Entries that are not directories are ignored.
            worker_count: Number of ``txtparser`` worker threads to start
                (default 10).

        Returns:
            The number of files that were queued.

        Raises:
            ValueError: If ``worker_count`` is less than 1.
        """
        if worker_count < 1:
            raise ValueError("worker_count must be at least 1")

        queue = Queue()
        queued = 0
        print(parentdir)
        for entry in os.listdir(parentdir):
            print(entry)
            bookdir = os.path.join(parentdir, entry)
            if not os.path.isdir(bookdir):
                continue
            print(bookdir)
            # Queue every file, including the first one; an empty folder
            # simply contributes nothing.
            for filename in os.listdir(bookdir):
                fullfilename = os.path.join(bookdir, filename)
                queue.put(fullfilename)
                print(fullfilename)
                queued += 1
            print("ooooophs~处理完毕")

        if not queued:
            # Nothing to convert: do not start workers or wait on the queue.
            return 0

        for _ in range(worker_count):
            cpc = txtparser(queue)
            # Daemon threads do not keep the process alive once join() returns.
            cpc.daemon = True
            cpc.start()
        # Blocks until the workers have marked every queued item as done.
        queue.join()
        return queued
    # Script entry: convert every book folder under the hard-coded root directory.
    # Raw string keeps the backslash after the drive letter, so the path is
    # absolute rather than relative to the current directory on drive E:.
    launchtxtparser(r"E:\月关")
  • 相关阅读:
    yepnope.js 异步加载资源文件
    省心选房5步走 买房前先算经济账还要多打听
    css中inline、block、inline-block的区别
    web标准化设计:常用的CSS命名规则
    用css的手段解决Google Chrome浏览器的字体最小12px问题
    HTML元素的默认样式
    CSS中 常见中文字体的英文名称
    《重构 改善既有代码的设计》书摘
    手机号码匹配规则
    WEB开发——大批量数据导出经验谈
  • 原文地址:https://www.cnblogs.com/saintdingspage/p/10582296.html
Copyright © 2011-2022 走看看