zoukankan      html  css  js  c++  java
  • lxml webdriver 抓取街拍

    案例

    import os
    from hashlib import md5
    from selenium import webdriver
    import requests
    from lxml import etree
    
    # 首页请求
    def get_response(url, timeout=10):
        """Fetch *url* with a desktop-browser User-Agent and return the Response.

        :param url: URL of the Toutiao search-API page to fetch.
        :param timeout: seconds before the request raises ``requests.Timeout``
            (added so a stalled connection cannot hang the crawl forever;
            default keeps the call backward-compatible).
        """
        headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"}
        return requests.get(url, headers=headers, timeout=timeout)
    
    # 从返回的json中解析出组图链接
    def get_article_title_url(text, i):
        """Return ``(article_url, title)`` for entry *i* of a search response.

        :param text: response object whose ``.json()`` yields the search
            payload, shaped ``{"data": [{"article_url": ..., "title": ...}, ...]}``.
        :param i: index into the ``data`` list.
        :raises KeyError/IndexError: when the entry lacks those fields
            (the caller treats each entry as best-effort).
        """
        # Parse the JSON body once instead of twice (one .json() call per field).
        item = text.json()["data"][i]
        return item['article_url'], item['title']
    
    # 从单个组图链接里解析出每张图片的URL地址-自上而下
    def parse_article_url(article_url):
        """Return the list of image ``src`` URLs found on one gallery page.

        The page is rendered through a real Chrome instance (Selenium) so the
        JavaScript-built gallery markup is present before parsing with lxml.

        NOTE(review): the chromedriver path literal below looks garbled
        (missing path separators) — confirm the real location on disk.

        :param article_url: URL of a single Toutiao gallery article.
        """
        driver = webdriver.Chrome(r"D:pythoncomzxsoftpythonchromedriver.exe")
        try:
            driver.get(article_url)
            html = etree.HTML(driver.page_source)
            # //@src collects the src attribute of every in-article image node.
            return html.xpath('//div[@class="article-content"]'
                              '//div[@class="pgc-img"]'
                              '//img[@class="syl-page-img"]//@src')
        finally:
            # quit() ends the whole browser session; the original close()
            # only closed the window and was skipped entirely on an
            # exception, leaking the chromedriver process.
            driver.quit()
    
    # 将每张图片保存在对应标题的本地文件夹下
    def save_jpg(title, href, timeout=10):
        """Download *href* and save it as ``<title>/<md5-of-content>.jpg``.

        Using the MD5 of the content as the file name also deduplicates
        identical images within a gallery.

        :param title: directory named after the gallery (created by the caller).
        :param href: image URL.
        :param timeout: seconds before the download raises ``requests.Timeout``
            (default keeps the call backward-compatible).
        :raises requests.HTTPError: on a 4xx/5xx response, instead of silently
            saving an HTML error page as a .jpg; the caller already wraps
            this call in a try/except.
        """
        res = requests.get(href, timeout=timeout)
        res.raise_for_status()
        file_path = '{}/{}.{}'.format(title, md5(res.content).hexdigest(), 'jpg')
        with open(file_path, 'wb') as f:
            f.write(res.content)
    
    os.chdir(r"E:/ntmssFile/nv/")
    # Walk 20 pages of the Toutiao search API (20 gallery entries per page).
    for page in range(20):
        url = 'https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab'.format(page * 20)
        r = get_response(url)
        # The original named this data_length, but it is the data list itself;
        # the original also reused `i` for this loop, shadowing the page index.
        data_items = r.json()["data"]
        for idx in range(len(data_items)):
            # Not every entry carries a gallery title/link, so treat each
            # entry as best-effort and skip the ones that fail to parse.
            try:
                article_url, title_text = get_article_title_url(r, idx)
                if not os.path.exists(title_text):
                    os.makedirs(title_text)
                for href in parse_article_url(article_url):
                    save_jpg(title_text, href)
            except Exception:
                # `except Exception`, not a bare except, so Ctrl-C
                # (KeyboardInterrupt) can still stop the crawl.
                continue
    import requests
    from lxml import etree
    from hashlib import md5
    import re
    import os
    import redis
    
    # --- 3gbizhi.com wallpaper crawler: downloads full-size images per
    # --- category and records each saved file's local HTTP URL in Redis.
    r = redis.StrictRedis(host='172.16.xx.xx', port=6379, db=2, decode_responses=True)
    headers = {
        'Cookie': 'Hm_lvt_c8263f264e5db13b29b03baeb1840f60=1632291839,1632373348; Hm_lpvt_c8263f264e5db13b29b03baeb1840f60=1632373697',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    # NOTE(review): the original path literal was garbled by the copy (a raw
    # string broken across two lines); the separators below are a best guess
    # reconstructing r"E:\home\webContainer\ntmssResource" — confirm on disk.
    os.chdir(r"E:\home\webContainer\ntmssResource")
    urlRes = requests.get("https://www.3gbizhi.com/meinv/", headers=headers)
    index_html = etree.HTML(urlRes.text)
    category_hrefs = index_html.xpath('//div[@class="cl r"]//ul[@class="cl"]/li/a/@href')
    # Compile the category-slug pattern once, outside the loop (the original
    # recompiled it on every iteration).
    href_pattern = re.compile(r'https://www.3gbizhi.com/meinv/(.*?).html', flags=re.DOTALL)
    for category_href in category_hrefs:
        # Extract the short category slug, used as the local directory name.
        short = href_pattern.search(category_href).group(1)
        if not os.path.exists(short):
            os.makedirs(short)
        # Only page 2 of each category is fetched (range(2, 3)), as in the
        # original — widen the range to crawl more pages.
        for page_no in range(2, 3):
            page_url = 'https://www.3gbizhi.com/meinv/' + short + '_' + str(page_no) + '.html'
            response = requests.get(page_url, headers=headers)
            # Fresh local names: the original rebound href_list/html here,
            # shadowing the outer loop's variables.
            page_html = etree.HTML(response.text)
            detail_hrefs = page_html.xpath('//div[@class="contlistw mtw"]//ul[@class="cl"]/li/a/@href')
            detail_titles = page_html.xpath('//div[@class="contlistw mtw"]//ul[@class="cl"]/li/a/@title')
            for detail_href, title in zip(detail_hrefs, detail_titles):
                res = requests.get(detail_href, headers=headers)
                detail_html = etree.HTML(res.text)
                img_url_list = detail_html.xpath('//div[@class="picimglist pos"]/ul/li/a/img/@src')
                for img_url in img_url_list:
                    # Drop the 'thumb_200_0_' marker to get the full-size URL.
                    img_url = ''.join(img_url.split('thumb_200_0_'))
                    result = requests.get(img_url, headers=headers).content
                    # MD5 of the bytes as the file name deduplicates images.
                    file_path = '{}/{}.{}'.format(short, md5(result).hexdigest(), 'jpg')
                    url_local = "http://192.168.31.155:8889/ntmssResource/" + file_path
                    r.sadd("img_list", url_local)
                    print(f'正在下载  {title} {url_local}!!!!')
                    with open(file_path, 'wb') as f:
                        f.write(result)
    故乡明
  • 相关阅读:
    linux-Redhat7 windows物理机与虚拟机设置共享目录
    解决Vue-cli3.0下scss文件编译过慢、卡顿问题
    CSS进阶——div的宽度和高度是由什么决定的?
    在线图片资源转换成Base64格式
    浅析libuv源码-node事件轮询解析(4)
    MaxCompute Studio使用心得系列7——作业对比
    from _sqlite3 import * ImportError: DLL load failed: 找不到指定的模块。
    Java高并发程序设计学习笔记(九):锁的优化和注意事项
    模块:摘要算法,hashlib
    面向对象:类的内置方法
  • 原文地址:https://www.cnblogs.com/luweiweicode/p/14335595.html
Copyright © 2011-2022 走看看