zoukankan html css js c++ java

多线程爬取 threading.Thread 文件名支持gbk编码

# - *- coding:utf-8-*-
import urllib2
import re
import os
import threading
import sys
# Python 2 idiom: sys is reloaded before calling sys.setdefaultencoding below.
# NOTE(review): later code calls .encode('gbk') on what appear to be UTF-8 byte
# strings, which presumably depends on this default — confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8') # set the process-wide default string encoding to UTF-8
from bs4 import BeautifulSoup
# Create the output directory only if it is missing, so that re-running the
# script does not crash with OSError, then make it the working directory
# (book directories are created relative to it).
if not os.path.isdir(u'小说0'):
    os.mkdir(u'小说0')
os.chdir(u'小说0')
def get_url():
    """Fetch page 1 of the qidian.com listing and extract the book links.

    Every ``<div class="book-mid-info">`` entry on the page is inspected and
    its ``<h4>`` heading is matched against the expected link markup.

    Returns:
        A list of ``(href, title)`` tuples, one per matching book heading.
        The list is empty when nothing on the page matches.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
    url = "http://f.qidian.com/all?size=-1&sign=-1&tag=-1&chanId=-1&subCateId=-1&orderId=&update=-1&page=1&month=-1&style=1&action=1"
    request = urllib2.Request(url, headers={'User-Agent': user_agent})
    html = urllib2.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')
    entries = soup.find_all('div', class_='book-mid-info')
    print('')  # blank separator line, kept from the original script

    # Compiled once: the pattern does not change between entries.
    link_pattern = re.compile(
        r'<h4><a data-bid=".*?" data-eid=".*?" href="(.*?)" target="_blank">(.*?)</a></h4>'
    )

    books = []
    for entry in entries:
        heading = entry.find('h4')
        if heading is None:
            # Entry without a heading: nothing to extract.
            continue
        # Collect matches from every entry; returning inside the loop would
        # hand back the first book only.
        books.extend(link_pattern.findall(heading.encode('utf-8')))
    return books
def get_content(curl,title):
    """Download every chapter of one book into a directory named after it.

    The book page is fetched, its chapter links are extracted, and each
    chapter is saved as ``<title>/<chapter name>.txt``. Each file holds the
    chapter name, a newline, and the chapter body with every ``<p>`` tag
    replaced by a newline.

    Args:
        curl: Scheme-relative URL of the book page; ``'http:'`` is prepended.
        title: Book title, GBK-encoded to form the directory name.
            NOTE(review): presumably a UTF-8 byte string, as produced by
            get_url — confirm against the caller.
    """
    book_dir = title.encode('gbk')
    # Tolerate a directory left over from an earlier run.
    if not os.path.isdir(book_dir):
        os.mkdir(book_dir)

    html1 = urllib2.urlopen('http:'+curl+'#Catalog').read()
    chapter_re = re.compile(r'<li data-rid=".*?"><a href="(.*?)" target="_blank" data-eid="qd_G55" data-cid=".*?" title=".*?">(.*?)</a>')
    # [\s\S] also matches newlines, so the capture can span several lines;
    # the surrounding \s* trims whitespace around the chapter body.
    content_re = re.compile(r'<div class="read-content j_readContent">\s*([\s\S]*?)\s*</div>')

    for n in chapter_re.finditer(html1):
        chapter_url = n.group(1)
        names = n.group(2)

        print("正在爬取%s本" % names)
        chapter_html = urllib2.urlopen('http:'+chapter_url).read()
        bodies = content_re.findall(chapter_html)

        # The with-block closes the file exactly once per chapter, whether
        # the page yielded zero, one, or several content matches.
        with open(book_dir+'/'+names.encode('gbk')+'.txt','wb') as fd:
            for m in bodies:
                contents = m.replace('<p>','\n')
                fd.write(names+'\n'+contents)
                print("已完成%s" % names)

# Every worker thread created by main(), in creation order.
threads=[]
def main():
    """Start one get_content thread per book returned by get_url.

    After each start, the loop waits until fewer than 10 threads (including
    the main thread) are alive before starting the next one. The function
    returns once the last worker has been started and the count has dropped
    below that limit; it does not join the workers.
    """
    import time  # local import: only the polling back-off below needs it

    # `or []` keeps the loop safe if get_url() yields nothing.
    workers = [
        threading.Thread(target=get_content, args=(i[0], i[1]))
        for i in get_url() or []
    ]
    threads.extend(workers)
    # Start only the threads created by this call: a Thread object cannot be
    # started twice.
    for t in workers:
        t.start()
        # Limit the number of live threads; sleep between polls instead of
        # spinning so the wait does not burn a CPU core.
        while len(threading.enumerate()) >= 10:
            time.sleep(0.1)
if __name__=='__main__':
    main()

查看全文

相关阅读:
三国演义中的十大谎言 VS 程序员十大口头禅
 Node.js 的单线程事件驱动模型和内置的线程池模型
 为了让医院把医护还给患者，他们用了这个…
华为云数据库内核专家为您揭秘：GaussDB(for MySQL)并行查询有多快？
brew
Python
Python
Python
AtCoder Beginner Contest 215 （个人题解 A~F）
AcWing 第 13 场周赛补题记录

原文地址：https://www.cnblogs.com/ZHANG576433951/p/6169422.html