zoukankan      html  css  js  c++  java
  • 多线程爬取斗图表情包

    和朋友在QQ上聊天时感觉没有激情,于是突发奇想,写了个小爬虫,把表情包爬取下来随便挑,斗到他们吐血。
          
                    



    下面是爬取斗图的代码,代码可供参考

    #encoding:utf8
    #模块
    import re
    import requests
    from lxml import etree
    import os
    import random
    import threading
    import time
    import hashlib

    def makemake(path, root=r'F:\11\斗图'):
        """Create the directory that will hold one article's images.

        Args:
            path: Directory name (the article title). Commas, colons, question
                marks, exclamation marks (ASCII and full-width) and spaces are
                removed before it is used as a directory name.
            root: Base download directory; defaults to the location this
                script has always used.

        Returns:
            None. A message is printed saying whether the directory already
            existed or is being created.
        """
        # A character class is required here: the former pattern contained a
        # bare '?' alternative, which is not a valid regular expression.
        path = re.sub(r'[,:?!,:?!]', '', path)
        path = path.replace(' ', '')
        target = os.path.join(root, path)

        if os.path.isdir(target):
            print(path + '已经存在')
            return

        print('开始创建' + path)
        # exist_ok guards against another worker thread creating the same
        # directory between the isdir() check above and this call.
        os.makedirs(target, exist_ok=True)

    def make_files(path, source, root=r'F:\11\斗图', max_retries=6, timeout=10):
        """Download one image and save it below ``root``.

        Args:
            path: File path relative to ``root``. Commas, colons, question
                marks, exclamation marks (ASCII and full-width) and spaces are
                removed from it before use.
            source: Image address without a scheme; ``'http://'`` is prepended.
            root: Base download directory; defaults to the location this
                script has always used.
            max_retries: Number of download attempts before giving up.
            timeout: Seconds to wait for the server on each attempt.

        Returns:
            None. Nothing is written when the file already exists or when
            every download attempt fails.
        """
        # A character class is required here: the former pattern contained a
        # bare '?' alternative, which is not a valid regular expression.
        path = re.sub(r'[,:?!,:?!]', '', path)
        path = path.replace(' ', '')
        target = os.path.join(root, path)

        # Skip the network round trip entirely when the file is already there.
        if os.path.isfile(target):
            print(path + '已经存在')
            return

        url = 'http://' + source
        content = None
        # The attempt counter lives outside the loop body so the retry limit
        # is actually reached instead of being reset on every pass.
        for attempt in range(1, max_retries + 1):
            try:
                content = requests.get(url, timeout=timeout).content
                break
            except requests.RequestException:
                print(url + '连接出错正在重试当前次数:' + str(attempt))
                time.sleep(1)

        if content is None:
            # Every attempt failed: return rather than writing a bogus file.
            print('放弃' + url)
            return

        print('正在下载' + path)
        with open(target, 'wb') as file:
            file.write(content)


    def start_spider(g):
        """Download every image linked from listing page ``g`` of doutula.com.

        For each article on the page a directory is created with makemake()
        and each image whose address ends in .jpg, .gif or .png is fetched
        with make_files(). Other addresses are only printed.

        Args:
            g: Page number appended to the article-list URL.

        Returns:
            None. Returns early if the listing page cannot be fetched.
        """
        page_url = 'https://www.doutula.com/article/list/?page=' + str(g)
        print('当前下载页数为:' + str(g))

        yuan = None
        # Bounded retry: the attempt counter is kept outside the loop body so
        # a permanently failing page cannot block this worker forever.
        for attempt in range(1, 7):
            try:
                yuan = requests.get(page_url, timeout=10).text
                break
            except requests.RequestException:
                print(page_url + '连接出错正在重试当前次数:' + str(attempt))
                time.sleep(1)
        if yuan is None:
            print('放弃' + page_url)
            return

        # One <a> element per article on the listing page.
        lists = etree.HTML(yuan).xpath('//*[@id="home"]/div/div/div/ul/a')
        for i in lists:
            titles = i.xpath('div/h4/text()')
            if not titles:
                # No title means no directory name to store the images under.
                continue
            img_name = titles[0]
            makemake(img_name)
            for ii in i.xpath('div/div/img/@data-original'):
                # Drops the first two characters; presumably the attribute is
                # protocol-relative ('//host/...') - TODO confirm against the site.
                img_url = ii[2:]
                wei = img_url[-4:]
                # Compare the extension itself; an MD5 digest of it can never
                # equal '.jpg', which made every image fall into the else branch.
                if wei in ('.jpg', '.gif', '.png'):
                    # Name the file after a hash of its URL so a rerun finds
                    # the existing file instead of downloading a duplicate.
                    digest = hashlib.md5(img_url.encode('utf-8')).hexdigest()
                    make_files(os.path.join(img_name, digest + wei), img_url)
                else:
                    print(img_url)

    # Driver: crawl listing pages 1..498, one worker thread per page, with at
    # most 21 workers tracked at any time.
    thread_list = []

    for g in range(1, 499):
        while True:
            print(len(thread_list))
            if len(thread_list) < 21:
                the_thread = threading.Thread(
                    target=start_spider, args=(g,), daemon=True)
                the_thread.start()
                thread_list.append(the_thread)
                break
            print('线程数为:' + str(len(thread_list)) + '等待清空')
            time.sleep(1)
            # Build a new list rather than calling remove() while iterating,
            # which skips the element that follows each removed thread.
            thread_list = [t for t in thread_list if t.is_alive()]

    # Wait for the remaining workers; they are daemon threads and would
    # otherwise be killed when the main thread exits.
    for t in thread_list:
        t.join()


    print('完成')



  • 相关阅读:
    cms模板内的标签替换思路(不可能比这更优秀的了)
    Mysql数据不算大,备份却非常慢
    PHP防止盗链
    Flash+PHP多文件上传,可将PHP换成别的语言脚本,如asp;jsp等
    把Linux Ubuntu安装到U盘上
    PHP中功能强大却少使用的函数 为你提供更多的思路
    云端計算詳解
    PHP循环输出变量
    HipHop
    80后传记
  • 原文地址:https://www.cnblogs.com/lianghongrui/p/7060113.html
Copyright © 2011-2022 走看看