zoukankan      html  css  js  c++  java
  • 爬取7160

    优化后的代码如下:

    先用循环创建20个目录,然后把图片循环写入这20个目录,每个目录最多写入50000个文件。

    #coding=utf-8
    import os
    import random
    import sys


    import urllib.request
    from bs4 import BeautifulSoup
    from urllib import error
    import re
    # Category slugs appended to the site URL by the crawl loop. Each slug is
    # listed once so that no category is crawled twice.
    ls = ['meinv', 'zhenrenxiu', 'lianglichemo', 'rentiyishu', 'xiaohua']
    # Names of the entries under the download root; get_file_name() picks among
    # them. Backslashes are escaped so the closing quote is not swallowed.
    file_list = os.listdir("d:\\craw\\")

    def validateTitle(title):
        """Return *title* with file-name-unsafe characters replaced by underscores.

        The characters replaced are slash, backslash, colon, asterisk, question
        mark, double quote, angle brackets and the vertical bar. Every
        occurrence is replaced individually, so the length of the string is
        preserved.

        Args:
            title: The text to sanitise; must be a ``str``.

        Returns:
            The sanitised string.
        """
        # Single-quoted raw string so the double quote inside the character
        # class does not terminate the literal; ``\\`` matches one backslash.
        rstr = r'[/\\:*?"<>|]'
        new_title = re.sub(rstr, "_", title)
        return new_title

    def get_file_name():
        """Return the path of a download directory that still has room.

        Candidates come from the module-level ``file_list`` and are tried in
        random order. A candidate that does not exist yet is created and used
        immediately; an existing directory is used only while it holds fewer
        than 50000 entries.

        Returns:
            The chosen directory path, ending with a backslash so a file name
            can be appended directly.

        Raises:
            RuntimeError: If ``file_list`` is empty or every candidate
                directory is already full.
        """
        base_dir = 'd:\\craw\\'
        max_files = 50000

        # A shuffled copy visits every candidate at most once, so the search
        # always terminates and a full directory is never returned.
        for name in random.sample(file_list, len(file_list)):
            path = base_dir + str(name)
            if not os.path.isdir(path):
                os.mkdir(path)
                print("创建目录" + str(path))
                return path + '\\'
            if len(os.listdir(path)) < max_files:
                return path + '\\'

        raise RuntimeError("no download directory with free space under " + base_dir)
    # Crawl every gallery of every category: read the gallery's page count from
    # its landing page, then fetch each page and save the first <img> found.
    # Patterns are compiled once here rather than once per gallery.
    total_marker = re.compile('共')
    number_pattern = re.compile(r'\d+')

    for k in ls:
        for j in range(1, 101111):
            url_origin = "http://www.7160.com/" + str(k) + "/" + str(j)
            print(url_origin)
            try:
                # Close the HTTP response as soon as the document is parsed.
                with urllib.request.urlopen(url_origin) as page_obj:
                    page_soup = BeautifulSoup(page_obj, 'lxml')

                # The first text node containing "共" is searched for a number,
                # which is taken as the page count; no number means no pages.
                total_page_obj = page_soup.find(text=total_marker).string
                match = number_pattern.search(total_page_obj)
                total_page = 0 if match is None else int(match.group())

                for i in range(1, total_page + 1):
                    # Page 1 is index.html; later pages are index_<n>.html.
                    if i == 1:
                        url = url_origin + "/index.html"
                    else:
                        url = url_origin + "/index_" + str(i) + ".html"
                    request = urllib.request.Request(url)
                    try:
                        with urllib.request.urlopen(request) as res:
                            soup = BeautifulSoup(res, 'lxml')
                        title_obj = soup.find(attrs={"class": "picmainer"})

                        if title_obj is not None:
                            print(url)
                            title = title_obj.h1.string
                            content = soup.find('img')
                            src = content.get("src")
                            file_name = validateTitle(title) + ".jpg"
                            # Choose the directory once: get_file_name() is
                            # random, so a second call could name a different
                            # directory than the one the image was saved to.
                            target = str(get_file_name()) + file_name
                            urllib.request.urlretrieve(src, target)
                            print(target + "保存成功")
                    except Exception as e:
                        # Best effort: report the failed page and carry on.
                        print("异常" + str(e))
            except Exception as e:
                # Best effort: report the failed gallery and carry on.
                print("异常" + str(e))

      

  • 相关阅读:
    安卓中像素px和dp的转换
    Android 使用Vector XML文件创建矢量图片资源,editText监听
    动态设置RecyclerView的高度
    EditText一些用法
    各种加密算法比较
    多线程--Task,等待用户输入AutoResetEvent
    AutoCAD二次开发——AutoCAD.NET API开发环境搭建
    Office(Excel、Word)二次开发——VSTO
    个人信息管理PIM——密码管理工具软件
    【矩阵计算】矩阵乘法其一:基础符号和算法
  • 原文地址:https://www.cnblogs.com/brady-wang/p/8370574.html
Copyright © 2011-2022 走看看