zoukankan      html  css  js  c++  java
  • 爬取7160

    优化后的代码如下:

    先用循环创建 20 个目录,然后循环向这 20 个目录写入文件,每个目录最多写入 50000 个文件。

    #coding=utf-8
    import os
    import random
    import sys


    import urllib.request
    from bs4 import BeautifulSoup
    from urllib import error
    import re
    # Category slugs appended to http://www.7160.com/ by the crawl loop.
    # 'lianglichemo' used to be listed twice, which made the loop walk that
    # whole category a second time; each slug now appears once.
    ls = ['meinv', 'zhenrenxiu', 'lianglichemo', 'rentiyishu', 'xiaohua']

    # Names of the entries currently under the download root; get_file_name()
    # picks its target directory from this list.
    # The backslashes are doubled on purpose: written as "d:\craw\" the final
    # backslash escapes the closing quote and the file does not even parse.
    file_list = os.listdir("d:\\craw\\")

    def validateTitle(title):
        """Return *title* with characters that Windows forbids in file names replaced.

        Every occurrence of a slash, backslash, colon, asterisk, question mark,
        double quote, angle bracket or pipe is replaced by an underscore. All
        other characters are left untouched.

        Args:
            title: The text to sanitize (a ``str``).

        Returns:
            The sanitized string.
        """
        # Single-quoted raw string so the double quote can sit inside the
        # character class; ``\\`` is the regex escape for one literal backslash.
        illegal_chars = r'[/\\:*?"<>|]'
        return re.sub(illegal_chars, "_", title)

    def get_file_name():
        """Pick a random download directory that still has room and return its path.

        Candidates are the names in the module-level ``file_list``. They are
        tried in random order; the first one that is either missing (it is then
        created) or holds fewer than 50000 entries is chosen.

        The previous implementation retried recursively when a directory was
        full but threw the retry's result away, so the full directory was
        returned anyway and the limit was never enforced.

        Returns:
            The chosen directory path with a trailing backslash, ready to have
            a file name appended.

        Raises:
            RuntimeError: If ``file_list`` is empty or every candidate
                directory already holds 50000 or more entries.
        """
        max_entries = 50000

        # Shuffling a copy and taking the first directory with room picks
        # uniformly among the non-full ones, and cannot recurse forever.
        candidates = list(file_list)
        random.shuffle(candidates)

        for name in candidates:
            path = 'd:\\craw/' + str(name)
            if not os.path.isdir(path):
                os.mkdir(path)
                print("创建目录" + str(path))
                return path + '\\'
            if len(os.listdir(path)) < max_entries:
                return path + '\\'

        raise RuntimeError("no download directory with free space is available")
    # Crawl loop: for every category slug in ``ls`` and every album number,
    # read the album's page count from its landing page, then fetch each page
    # and save the first <img> found on it under a sanitized title.

    # Compiled once, outside the loops. The digit pattern needs its backslash:
    # as r'd+' it matched the letter "d" and no page count was ever parsed.
    _PAGE_LABEL_RE = re.compile('共')
    _PAGE_COUNT_RE = re.compile(r'\d+')

    for k in ls:
        for j in range(1, 101111):
            url_origin = "http://www.7160.com/" + str(k) + "/" + str(j)
            print(url_origin)
            try:
                # Close the HTTP response as soon as the document is parsed.
                with urllib.request.urlopen(url_origin) as page_obj:
                    page_soup = BeautifulSoup(page_obj, 'lxml')

                # First text node containing '共'; presumably the pager label
                # that carries the total page count -- confirm against the site.
                total_page_obj = page_soup.find(text=_PAGE_LABEL_RE).string
                match = _PAGE_COUNT_RE.search(total_page_obj)
                total_page = 0 if match is None else int(match.group())

                for i in range(1, total_page + 1):
                    if i == 1:
                        url = url_origin + "/index.html"
                    else:
                        url = url_origin + "/index_" + str(i) + ".html"
                    request = urllib.request.Request(url)
                    try:
                        with urllib.request.urlopen(request) as res:
                            soup = BeautifulSoup(res, 'lxml')

                        title_obj = soup.find(attrs={"class": "picmainer"})
                        if title_obj is None:
                            # No picture container on this page: nothing to save.
                            continue

                        print(url)
                        title = title_obj.h1.string
                        content = soup.find('img')
                        src = content.get("src")
                        file_name = validateTitle(title) + ".jpg"

                        # Resolve the target once. get_file_name() is random, so
                        # calling it again for the log line reported a directory
                        # other than the one the image was written to.
                        target = str(get_file_name()) + file_name
                        urllib.request.urlretrieve(src, target)
                        print(target + "保存成功")
                    except Exception as e:
                        # Best-effort crawl: report the failing page and move on.
                        print("异常" + str(e))
            except Exception as e:
                # Best-effort crawl: report the failing album and move on.
                print("异常" + str(e))

      

  • 相关阅读:
    usaco PROB Checker Challenge 搜索
    usaco Superprime Rib 搜索
    hdu_1056_HangOver_201311071354
    hdu_1049_Climbing Worm_201311061331
    hdu_1048_The Hardest Problem Ever_201311052052
    hdu_1041_Computer Transformation_201311051648
    hdu_1039_Easier Done Than Said_201311051511
    hdu_1038_Biker's Trip Odometer_201311021643
    hdu_1037_Keep on Truckin'_201311021600
    hdu_1036_Average is not Fast Enough_201311021335
  • 原文地址:https://www.cnblogs.com/brady-wang/p/8370574.html
Copyright © 2011-2022 走看看