zoukankan      html  css  js  c++  java
  • python 爬虫 糗百成人

    import urllib
    from time import sleep
    
    import requests
    from lxml import etree
    
    
    
    def _save_image(img, dir_path="/www/spider/images/", timeout=10):
        """Download a single image and store it under ``dir_path``.

        The file name is the last ``/``-separated segment of ``img``.
        Failures are reported on stdout and never raised, so one broken
        image cannot stop the crawl.

        Args:
            img: Absolute http(s) URL of the image.
            dir_path: Target directory, including the trailing separator.
            timeout: Seconds to wait for the server before giving up.
        """
        file_name = img.split('/')[-1]
        try:
            file_content = requests.get(img, timeout=timeout)
            if file_content.status_code != 200:
                print(img, "下载失败")
                return
            with open(dir_path + file_name, "wb") as f:
                f.write(file_content.content)
            print("保存图片" + dir_path + file_name + "成功")
        except (requests.RequestException, OSError) as ee:
            # Best effort: report network / file-system errors and move on.
            print(str(ee))


    def all_links(url, page, last_page=990, timeout=10):
        """Crawl the numbered pages ``page`` .. ``last_page`` and save their images.

        Each page address is built as ``url + str(n) + ".html"``. The pages are
        visited in a plain loop; the previous implementation called itself for
        every following page without a stop condition, so it could only end by
        exhausting the recursion limit, and it switched to a hard-coded host
        instead of reusing ``url``.

        Args:
            url: Base address the page number is appended to.
            page: First page number to fetch.
            last_page: Last page number to fetch (inclusive). The default is
                the upper bound the old driver loop iterated to.
            timeout: Seconds to wait for each HTTP response.
        """
        for current in range(page, last_page + 1):
            if current != page:
                sleep(1)  # pause between consecutive page requests
            page_url = url + str(current) + ".html"
            try:
                response = requests.get(page_url, timeout=timeout)
            except requests.RequestException as ee:
                # A single unreachable page should not end the whole crawl.
                print(str(ee))
                continue
            print(page_url, response.status_code)
            if response.status_code != 200:
                # Do not try to parse error pages.
                continue
            # The pages are decoded as GBK; undecodable bytes are replaced
            # instead of raising and aborting the run.
            html = etree.HTML(response.content.decode('gbk', errors='replace'))
            if html is None:
                # NOTE(review): guards against a parser result without a root
                # element (e.g. an empty body) - confirm against lxml behaviour.
                continue
            # Collect the image URLs of the page and save each image.
            imgs = html.xpath('.//div[@id="wrapper"]//div[@class="ui-module"]//img/@src')
            for img in imgs:
                if img.startswith(('http://', 'https://')):
                    _save_image(img)
                else:
                    # Relative or otherwise non-http(s) sources are only reported.
                    print("错误图片" + img)


    try:
        # One call walks every page from 354 up to the default last page.
        all_links("http://www.qiubaichengren.net/", 354)
    except Exception as e:
        # Top-level boundary of the script: report anything unexpected.
        print(str(e))
    

     循环的版本(沿“下一页”链接逐页抓取,到 100.html 结束)

    import urllib
    from time import sleep

    import requests
    from lxml import etree



    def all_links(url):
        """Follow the "next page" links starting at ``url`` and save all images.

        For every visited page the image sources below
        ``div#wrapper div.ui-module`` are downloaded into ``dir_path``. The
        crawl ends when the current address contains ``100.html`` or when a
        page has no "下一页" link.

        Args:
            url: Address of the first page to fetch.

        Returns:
            None.
        """
        # A bare ``import urllib`` does not guarantee that the ``request``
        # submodule is loaded, so it is imported explicitly here.
        import urllib.request
        from urllib.parse import urljoin

        host = 'http://www.qiubaichengren.net/'
        # Backslashes are escaped: the unescaped form ended in ``\"`` and so
        # never terminated the string literal.
        dir_path = "d:\\www\\spider\\images\\"

        while True:
            if "100.html" in url:
                print("结束")
                return None
            response = requests.get(url, timeout=10)
            print(url, response.status_code)
            # Undecodable bytes are replaced instead of aborting the crawl.
            html = etree.HTML(response.content.decode('gbk', errors='replace'))
            if html is None:
                # NOTE(review): guards against a parser result without a root
                # element (e.g. an empty body) - confirm against lxml behaviour.
                return None
            # Collect the image URLs of the page and save each image.
            imgs = html.xpath('.//div[@id="wrapper"]//div[@class="ui-module"]//img/@src')
            for img in imgs:
                file_name = img.split('/')[-1]
                if not img.startswith(('http://', 'https://')):
                    print("错误图片" + img)
                    continue
                try:
                    urllib.request.urlretrieve(img, dir_path + file_name)
                except OSError as ee:
                    # URLError derives from OSError; one failed download
                    # should not stop the remaining images or pages.
                    print(str(ee))
                    continue
                print("保存图片" + dir_path + file_name + "成功")
            links = html.xpath('.//div[@class="page"]//a[contains(text(),"下一页")]/@href')
            print(links)
            if not links:
                return None
            sleep(5)
            # urljoin copes with relative, root-relative and absolute hrefs,
            # where plain concatenation could yield a malformed address.
            url = urljoin(host, links[0])


    try:
        all_links("http://www.qiubaichengren.net/8.html")
    except Exception as e:
        # Top-level boundary of the script: report anything unexpected.
        print(str(e))

  • 相关阅读:
    Go 函数
    Go 基础
    Emmet使用详解
    Linux系统安装7.4
    NTP时间服务
    部署Java和Tomcat
    Linux用户管理
    Linux定时任务
    Linux正则详解
    Linux目录结构
  • 原文地址:https://www.cnblogs.com/brady-wang/p/8919958.html
Copyright © 2011-2022 走看看