zoukankan      html  css  js  c++  java
  • Learning note: crawling a small image dataset with multiple threads

    import os
    import re
    import threading
    import time

    import requests
    from lxml import etree
    
    
    all_img_urls = []    # shared list of {name: img_link} dicts scraped from the list pages

    g_lock = threading.Lock()      # guards concurrent access to the shared lists across threads

    # The Producer class (defined later) keeps fetching detail-page addresses
    # and appends them to the all_img_urls list.

    # url = "http://www.xiaohuar.com/"

    all_urls = []    # work queue of list-page URLs, filled by Spider.getUrls
    
    class Spider(object):
        """Builds the work queue of list-page URLs from a printf-style template."""

        def __init__(self, target_url, headers):
            # Keep the URL template and request headers for later use.
            self.target_url = target_url
            self.headers = headers

        def getUrls(self, start_page, page_num):
            """Append pages [start_page, page_num) to the module-level all_urls list."""
            all_urls.extend(self.target_url % page for page in range(start_page, page_num))
    
    
    if __name__ == '__main__':
        # Browser-like headers so the site does not reject the requests outright.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
            "Host": "eclick.baidu.com",
        }
        # printf-style template for the paginated list pages.
        target_url = "http://www.xiaohuar.com/list-1-%d.html"

        # Queue up list pages 0..13 for the producer threads.
        crawler = Spider(target_url, headers)
        crawler.getUrls(0, 14)
    
    
    class Producer(threading.Thread):    # worker that turns list pages into image entries
        """Pops list-page URLs from the shared all_urls queue, scrapes each page,
        and appends {name: img_link} dicts to the shared all_img_urls list.

        Exits when the queue is empty; several instances may run concurrently.
        """

        def run(self):
            # NOTE(review): the original also sent "Host": "eclick.baidu.com",
            # which is wrong for xiaohuar.com (and was never actually sent — see
            # the headers= fix below).  requests derives Host from the URL.
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
            }

            while True:
                # Check-and-pop must happen under the lock: with the check
                # outside (as before), another thread could empty the list
                # between the check and the pop and raise IndexError.
                g_lock.acquire()
                try:
                    if not all_urls:
                        break
                    url = all_urls.pop()
                finally:
                    g_lock.release()

                # BUG FIX: headers must be passed by keyword — the second
                # positional argument of requests.get() is `params`, so the
                # original never sent these headers at all.
                response = requests.get(url, headers=headers).text
                selector = etree.HTML(response)    # parse for xpath queries

                for item in selector.xpath("//div[@class='item_t']"):
                    links = item.xpath("div[@class='img']/a/img/@src")
                    names = item.xpath("div[@class='img']/span/text()")
                    if not links or not names:
                        continue  # malformed entry: skip instead of IndexError
                    # BUG FIX: dropped .encode("utf8") — under Python 3 it
                    # produced bytes, breaking startswith("/") below and the
                    # path formatting in DownPic.
                    name, img_link = names[0], links[0]

                    # Some links are site-relative; make them absolute.
                    if img_link.startswith("/"):
                        img_link = "http://www.xiaohuar.com" + img_link
                    # BUG FIX: the original appended relative-link entries twice
                    # (once inside the if, once after it).
                    all_img_urls.append({name: img_link})
    
    
    # Run the producers.  BUG FIX: the original called down.run(), which executes
    # the crawl sequentially in the main thread; Thread.start() is what actually
    # spawns a thread (which then invokes run() itself).
    producer_threads = []
    for _ in range(10):     # 10 threads to crawl the list pages
        worker = Producer()
        worker.start()
        producer_threads.append(worker)
    for worker in producer_threads:
        # Wait until every list page has been scraped before downloading starts.
        worker.join()
    
    
    
    
    class DownPic(threading.Thread):      # worker that downloads the scraped images
        """Pops {name: img_link} dicts from the shared all_img_urls list and
        saves each image to xiaohua/<name>.jpg, exiting when the list is empty.
        """

        def run(self):
            # The output directory may not exist on a fresh run.
            os.makedirs("xiaohua", exist_ok=True)

            while True:
                g_lock.acquire()
                if len(all_img_urls) == 0:
                    g_lock.release()
                    # BUG FIX: the original `continue`d here, spinning forever
                    # and never letting the process exit.  The producers have
                    # finished by the time the downloaders run, so an empty
                    # list means there is no more work.
                    break
                else:
                    img = all_img_urls.pop()
                    g_lock.release()
                    # Each entry is a single-pair dict: {name: image_url}.
                    for key, value in img.items():
                        # Producers may store the name as str or (legacy) utf-8
                        # bytes; accept both.
                        name = key.decode("utf8") if isinstance(key, bytes) else key
                        path = "xiaohua/%s.jpg" % name
                        response = requests.get(value)
                        # `with` closes the file; the original's explicit
                        # f.close() inside the block was redundant.
                        with open(path, "wb") as f:
                            f.write(response.content)
    # #
    #
    #
    # Run the downloaders.  BUG FIX: same as the producer loop — start() spawns
    # real threads, whereas the original run() call downloaded everything
    # sequentially in the main thread.
    download_threads = []
    for _ in range(10):     # 10 threads to download the images
        worker = DownPic()
        worker.start()
        download_threads.append(worker)
    for worker in download_threads:
        worker.join()
  • 相关阅读:
    输入输出重定向
    Tkinter程序屏幕居中
    从Web Controls到DHTML学习随想
    一个没暂时没有办法实现的问题和一个有意思的小问题!
    [学习笔记]几个英语短句(1)
    [读书笔记]My LifeBill Clinton
    [学习笔记]几个英语短句(2)
    结合MS Web Controls做文件上传的解决方案!
    IIS的一个莫名错误--Server Application Unavailable
    Google Sitemaps(测试版)帮助:使用 Sitemap 协议
  • 原文地址:https://www.cnblogs.com/stfei/p/10149120.html
Copyright © 2011-2022 走看看