zoukankan      html  css  js  c++  java
  • 线程池爬取好花网

    '''
    需要修改部分:
    1. style_list
    2. skudic['type']
    3.with open ('spulist_1.json','wt') as f00: 文件存储路径
    '''
    
    from selenium import webdriver
    from selenium.webdriver.common.by import By  # 通过什么
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC  #期望的条件达到
    from selenium.webdriver.support.wait import WebDriverWait  # 等待
    from selenium.webdriver.chrome.options import Options
    import uuid
    import json
    import requests
    
    # Headless mode (disabled): on a GUI-less Linux host, uncomment these
    # three lines so Chrome runs without a visible window.
    # chrome_options = Options()
    # chrome_options.add_argument('--headless')
    # driver = webdriver.Chrome(options=chrome_options)
    driver = webdriver.Chrome()
    # Category path segments on haohua.com; the full set is commented out and
    # only 'mg/' is crawled in this run (per the note at the top of the file).
    # style_list = ['mg/','bhh/','knx/','xrk/','zll/','mtx/']
    style_list = ['mg/']
    base_url = 'https://www.haohua.com/xianhua/'

    total_url = []   # one list of detail-page urls per category
    main_imgs = []   # main-image urls collected for later download
    big_lists = []   # raw rows from task(): [title, desc, price, sku_dic, spu_imgs]

    spulist = []       # SPU rows: title, detail, price, spu_main_img, uid
    skulist = []       # SKU rows: uid, type, name, price, img
    spu_pic_list = []  # SPU gallery rows: uid, img, index
    index = 0          # running global index over all SPU gallery images

    # spu needs: title, detail, spu_main_img, price (a uuid string ties it to its SKUs)
    # spuimgs needs: uid, main_imgs (the several gallery pictures)
    # sku needs: uid, type, name, price, img
    
    # For every category page, collect the href of each product card.
    for style in style_list:
        listing_url = base_url + style
        driver.get(listing_url)

        # Explicit wait: block until the product cards ("imghover") exist.
        WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "imghover")))

        # Smooth-scroll to the bottom of the page so lazily rendered
        # content gets a chance to load.
        page_height = driver.execute_script("return document.body.clientHeight")
        driver.execute_script("""
            window.scrollTo({
                top: %s,
                behavior:"smooth"
            });
            """ % page_height)

        # One detail-page url per product card anchor.
        anchors = driver.find_elements_by_class_name('imghover')
        total_url.append([anchor.get_attribute('href') for anchor in anchors])
    
    
    # 给定一个url, 进行爬取数据
    def task(single_url):
        """Scrape one product detail page.

        Returns ``[title, desc, price, sku_dic, spu_imgs]`` where ``sku_dic``
        maps SKU name -> image url and ``spu_imgs`` is the list of gallery
        image urls.  Side effect: appends the first gallery image url to the
        module-level ``main_imgs`` list.
        """
        driver.get(single_url)

        # Basic product fields.
        title = driver.find_element_by_class_name('shop-title').text
        desc = driver.find_element_by_class_name('shop-description').text
        price = driver.find_element_by_class_name('sell-val').text

        # SKU thumbnails live as <img title="..."> inside the specs block.
        sku_dic = {}
        for thumb in driver.find_element_by_class_name('specs-item').find_elements_by_tag_name('img'):
            sku_dic[thumb.get_attribute('title')] = thumb.get_attribute('src')

        # Gallery images: one <img> per <li> of the preview strip.
        preview_items = driver.find_element_by_class_name('shop-preview-item').find_elements_by_tag_name('li')
        spu_imgs = [item.find_element_by_tag_name('img').get_attribute('src') for item in preview_items]
        main_imgs.append(spu_imgs[0])

        return [title, desc, price, sku_dic, spu_imgs]
    
    
    from concurrent.futures import ThreadPoolExecutor
    import time

    # NOTE(review): the pool is effectively serial — main() calls .result()
    # immediately after each submit(), so only one task ever runs at a time.
    # That is in fact required here, because every task shares the single
    # module-level ``driver`` instance, which is not safe to drive from
    # several threads at once.
    pool = ThreadPoolExecutor(3)
    # Only the first category's url list is processed (style_list has a
    # single entry in this run).
    type_urls = total_url[0]
    def main():
        # Scrape every detail page and collect the raw result rows.
        for type_url in type_urls:
            done = pool.submit(task,type_url)
            lis = done.result()
            big_lists.append(lis)
            # NOTE(review): task() already appended the FIRST gallery image
            # to main_imgs; this appends the SECOND (lis[4][1]) — presumably
            # intentional, but verify; it raises IndexError on a page with
            # fewer than two gallery images.
            main_imgs.append(lis[4][1])
            print(lis)  # nested lists; main images are saved to a folder later

    if __name__ == '__main__':
        main()
        pool.shutdown(wait=True)
    
    
    
    # Pair every main-image url with its basename: [[file_name, url], ...].
    img_name_url = [[url.split('/')[-1], url] for url in main_imgs]
    
    
    sku_name_urls = []   # every SKU image url, collected for later download
    spu_name_urls = []   # every SPU gallery image url, collected for later download

    # Flatten each raw scrape row into the three output tables
    # (spulist / skulist / spu_pic_list), linked together by a shared uuid.
    for row in big_lists:
        title, detail, price, sku_map, gallery = row
        uid = str(uuid.uuid4())

        # One SPU row per product; its main image will live under main_img/.
        spulist.append({
            'title': title.split('-')[-1],
            'detail': detail,
            'price': price,
            'spu_main_img': 'main_img/' + gallery[0].split('/')[-1],
            'uid': uid,
        })

        # One SKU row per variant, e.g. {'12枝紫罗兰': img_url, ...};
        # the image itself will be stored under SKUimg/.
        for variant, variant_img in sku_map.items():
            sku_name_urls.append(variant_img)
            skulist.append({
                'uid': uid,
                'type': 1,
                'name': variant,
                'price': price,
                'img': 'SKUimg/' + variant_img.split('/')[-1],
            })

        # One gallery row per SPU picture, numbered by the global index.
        for pic in gallery:
            spu_name_urls.append(pic)
            index += 1
            spu_pic_list.append({
                'uid': uid,
                'img': 'SPUimg/' + pic.split('/')[-1],
                'index': index,
            })
    
    
    
    # 这里是一张主图 main_img
    # Download the single main image of every product into main_img/.
    for name_url in img_name_url:
        # timeout so a stalled server cannot hang the whole script.
        r = requests.get(name_url[1], timeout=30)
        save_url = 'F:/期中架构/practice2/main_img/' + name_url[0]

        if r.status_code == 200:
            # 'wb', not 'ab': append mode would concatenate a second copy of
            # the payload onto an existing file on re-runs, corrupting the image.
            with open(save_url, 'wb') as f:
                f.write(r.content)
    
    # 这里是sku图片  sku_name_urls = []
    # Download every SKU (variant) image into SKUimg/.
    for sku_name in sku_name_urls:
        # timeout so a stalled server cannot hang the whole script.
        r = requests.get(sku_name, timeout=30)
        save_url = 'F:/期中架构/practice2/SKUimg/' + sku_name.split('/')[-1]

        if r.status_code == 200:
            # 'wb', not 'ab': append mode would concatenate a second copy of
            # the payload onto an existing file on re-runs, corrupting the image.
            with open(save_url, 'wb') as f:
                f.write(r.content)
    
    # 这里是 spu的几张主图 spu_name_urls = []
    
    # Download every SPU gallery image into SPUimg/.
    for spu_name in spu_name_urls:
        # timeout so a stalled server cannot hang the whole script.
        r = requests.get(spu_name, timeout=30)
        save_url = 'F:/期中架构/practice2/SPUimg/' + spu_name.split('/')[-1]

        if r.status_code == 200:
            # 'wb', not 'ab': append mode would concatenate a second copy of
            # the payload onto an existing file on re-runs, corrupting the image.
            with open(save_url, 'wb') as f:
                f.write(r.content)
    
    
    # 这里是type = 5 的所有数据
    # spulist = [],skulist = [],spu_pic_list = []  写入三个文件中
    
    
    # Persist the three result tables, one JSON file each.
    for out_path, payload in (
        ('spulist_1.json', spulist),
        ('skulist_1.json', skulist),
        ('spu_pic_list_1.json', spu_pic_list),
    ):
        with open(out_path, 'wt') as out_file:
            json.dump(payload, out_file)
    
  • 相关阅读:
    谁是你心目中最优秀的ajax框架
    23种设计模式(1):单例模式
    23种设计模式(8):观察者模式
    设计模式六大原则(3):依赖倒置原则
    23种设计模式(2):工厂方法模式
    oracle中给表和字段添加注释
    单例模式讨论篇:单例模式与垃圾回收
    设计模式六大原则(6):开闭原则
    mysql命名锦集
    23种设计模式(3):抽象工厂模式
  • 原文地址:https://www.cnblogs.com/Afrafre/p/10767839.html
Copyright © 2011-2022 走看看