zoukankan      html  css  js  c++  java
  • 彼岸网爬虫

    import requests
    from PIL import Image
    from io import BytesIO
    import re
    from requests.exceptions import HTTPError
    
    # URL template shared by every listing page except the first one;
    # the "%d" placeholder is replaced by the page number.
    root = "http://pic.netbian.com/index_%d.html"

    # URL prefix prepended to detail-page paths and full-size image paths.
    uni = "http://pic.netbian.com"

    # URLs of the listing pages to crawl.
    AllPage = []

    # Partial URLs of the detail pages behind each thumbnail,
    # in the form /tupian/.*.html
    AllImgHTML = []

    # Partial URL of each full-size image, in the form /uploads/allimg/.*.jpg
    AllImgURL = []

    
    def GetPageURL(root, Start, counts):
        """Append the URLs of ``counts`` consecutive listing pages to ``AllPage``.

        Args:
            root: URL template used for every listing page except the first;
                its ``%d`` placeholder is replaced by the page number.
            Start: Number of the first listing page to collect.
            counts: How many consecutive pages to collect. Zero or a negative
                value collects nothing.

        Returns:
            None. The URLs are appended to the module-level ``AllPage`` list.
        """
        home_page = "http://pic.netbian.com/index.html"
        for page_number in range(Start, Start + counts):
            if page_number == 1:
                # The first page does not follow the index_%d.html scheme.
                AllPage.append(home_page)
            else:
                AllPage.append(root.replace("%d", str(page_number)))
    
    
    def GetImgHTML(AllPage):
        """Collect the detail-page paths referenced by every listing page.

        Each URL in ``AllPage`` is downloaded, decoded as GBK and split on
        double quotes. Every quoted fragment of the form ``/tupian/....html``
        is appended to the module-level ``AllImgHTML`` list. A page that cannot
        be fetched is reported on stdout and skipped.

        Args:
            AllPage: Iterable of listing-page URLs to scan.

        Returns:
            None. Matches are appended to the module-level ``AllImgHTML`` list.
        """
        detail_path = re.compile(r"/tupian/.*\.html")

        for PageURL in AllPage:
            try:
                res = requests.get(PageURL, timeout=10)
                res.raise_for_status()
            except HTTPError:
                print("HTTP Error!")
                continue
            except requests.exceptions.RequestException:
                # requests raises its own ConnectionError/Timeout classes; the
                # builtin ConnectionError does not catch them.
                print("Failed to connect!")
                continue

            # Decode the raw bytes as GBK directly instead of writing the text
            # to a scratch file as ISO-8859-1 and reading it back as GBK.
            # Undecodable bytes are replaced rather than aborting the crawl.
            page_text = res.content.decode("gbk", errors="replace")

            # Attribute values sit between double quotes, so splitting on the
            # quote character isolates each candidate path.
            for fragment in page_text.split('"'):
                if detail_path.fullmatch(fragment):
                    AllImgHTML.append(fragment)
    
    
    def GetImgURL():
        """Resolve the full-size image URL of every collected detail page.

        For each path in the module-level ``AllImgHTML`` list, the detail page
        (``uni`` + path) is downloaded, decoded as GBK and split on double
        quotes. The first quoted fragment of the form
        ``/uploads/allimg/....jpg`` is appended to the module-level
        ``AllImgURL`` list. A detail page that cannot be fetched is reported on
        stdout and skipped.

        Returns:
            A list of downloadable image URLs, one per entry currently in
            ``AllImgURL``, each built as ``uni`` + partial path.
        """
        image_path = re.compile(r"/uploads/allimg/.*\.jpg")

        for detail_path in AllImgHTML:
            detail_url = uni + detail_path
            try:
                htmlres = requests.get(detail_url, timeout=10)
                htmlres.raise_for_status()
            except HTTPError:
                print("HTTP Error!")
                continue
            except requests.exceptions.RequestException:
                # requests raises its own ConnectionError/Timeout classes; the
                # builtin ConnectionError does not catch them.
                print("Failed to connect!")
                continue

            # Decode the raw bytes as GBK directly instead of round-tripping
            # the text through a scratch file. Undecodable bytes are replaced
            # rather than aborting the crawl.
            page_text = htmlres.content.decode("gbk", errors="replace")

            for fragment in page_text.split('"'):
                if image_path.fullmatch(fragment):
                    AllImgURL.append(fragment)
                    # Only the first full-size image on a detail page is kept.
                    break

        return [uni + partial_url for partial_url in AllImgURL]
    
    
    def DownloadWallpaper(url, path):
        """Download the image at ``url`` and save it to ``path``.

        The response body is opened with PIL and written with ``Image.save``.
        Failures are reported on stdout instead of being raised, so one bad
        image does not stop the remaining downloads.

        Args:
            url: Full URL of the image to download.
            path: Destination file path passed to ``Image.save``.

        Returns:
            None.
        """
        try:
            res = requests.get(url, timeout=10)
            res.raise_for_status()
        except HTTPError:
            print("HTTP Error!")
            return
        except requests.exceptions.RequestException:
            # requests raises its own ConnectionError/Timeout classes; the
            # builtin ConnectionError does not catch them.
            print("Failed to connect!")
            return

        try:
            with Image.open(BytesIO(res.content)) as MyImage:
                MyImage.save(path)
        except OSError as error:
            # Covers undecodable image data as well as file-system errors.
            print("Failed to save image:", error)
            return

        print("Done...")
    
    
    if __name__ == "__main__":
        import os

        # Directory that receives the downloaded wallpapers. The backslash is
        # escaped so the literal is a valid Windows path.
        save_dir = "F:\\1"
        os.makedirs(save_dir, exist_ok=True)

        # Crawl two listing pages starting from the first one.
        GetPageURL(root, 1, 2)
        GetImgHTML(AllPage)
        UsefulImgURL = GetImgURL()

        # Images are saved as 0.jpg, 1.jpg, ... in crawl order.
        for index, image_url in enumerate(UsefulImgURL):
            print(index, end=" ")
            DownloadWallpaper(image_url, os.path.join(save_dir, str(index) + ".jpg"))
        print("Task completed!")
    
  • 相关阅读:
    ubuntu 14.04 问题集合
    ubuntu grub 引导修复
    Ubuntu 下 glpk 的安装及使用
    ubuntu vim 7.4 编译安装
    ubuntu 12.04 clang 3.4 安装
    CMakeLists实战解读--YouCompleteMe
    Flume安装及部署
    SpringBoot整合kafka
    linux安装kafka
    Linux安装zookeeper
  • 原文地址:https://www.cnblogs.com/liyu8/p/14243771.html
Copyright © 2011-2022 走看看