  • Web scraper implementation examples

    import requests
    from lxml import etree
    from 爬虫.old_boy.p3 import get_code_text  # captcha-recognition helper from the author's own package
    
    session = requests.session()
    # A session object works almost exactly like the requests module: requests are
    # sent the same way, but any cookies set by a response are stored automatically
    # and reused on later requests made with the same session.
    
    url = 'http://www.renren.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'Connection': 'close',
    }
    # fetch the login page and locate the captcha image URL
    response = session.get(url=url, headers=headers).content
    xpath_data = etree.HTML(response)
    pic_url = xpath_data.xpath('//*[@id="verifyPic_login"]/@src')[0]
    print(pic_url)
    # fetch the captcha with the same session so any captcha cookie stays consistent
    pic = session.get(url=pic_url, headers=headers).content
    
    with open('pic.jpg', 'wb') as fp:
        fp.write(pic)
    
    # run OCR on the saved captcha image to get its text
    result = get_code_text('pic.jpg')
    # print(result)
    
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019331853198'
    data = {
        'captcha_type':    'web_login',
        'domain':    'renren.com',
        'email':    '18744585483',
        'f': 'http%3A%2F%2Fwww.renren.com%2F970459497',
        'icode': result,
        'key_id': '1',
        'origURL': 'http://www.renren.com/home',
        'password':    '9722733e821526e5879a37d439f40666e1af794712cad1fce23d83f7b2f57041',
        'rkey':    '0de33e22f20835059cb6b28da4bffdc9'
    }
    
    # log in by posting the form data (including the recognized captcha)
    response = session.post(url=login_url, headers=headers, data=data)
    
    # request the profile page of the now logged-in user
    detail_url = 'http://www.renren.com/970459497'
    
    # this request reuses the session object, so the login cookies are sent automatically
    ren_response = session.get(url=detail_url, headers=headers).content
    with open('./renren.html', 'wb') as fp:
        fp.write(ren_response)
    人人网 (Renren) login
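    The get_code_text helper imported from 爬虫.old_boy.p3 is not shown in the post; it takes the path of the saved captcha image and returns the recognized text, presumably via a captcha-recognition service. A minimal local stand-in, assuming Tesseract OCR is installed together with the pytesseract and Pillow packages (none of which appear in the original), could look like this sketch; its accuracy on real captchas is not guaranteed:

    # hypothetical replacement for 爬虫.old_boy.p3.get_code_text,
    # assuming a local Tesseract install plus pytesseract and Pillow
    from PIL import Image
    import pytesseract

    def get_code_text(img_path):
        # open the saved captcha and convert it to grayscale for a cleaner OCR input
        img = Image.open(img_path).convert('L')
        return pytesseract.image_to_string(img).strip()

    In both login examples the helper is then called as result = get_code_text('pic.jpg') right after the captcha image has been written to disk.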
    import requests
    from lxml import etree
    from 爬虫.old_boy.p3 import get_code_text
    
    url = 'https://so.gushiwen.org/user/login.aspx?from='
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
    
    # sample HTTP proxy from the post; replace with a live proxy or drop the proxies argument
    proxies = {
        'http': '193.68.135.125:59278'
    }
    
    session = requests.session()
    response = session.get(url=url, headers=headers, verify=False, proxies=proxies).content
    xpath_data = etree.HTML(response)
    pic_src = 'https://so.gushiwen.org' + str(xpath_data.xpath('//*[@id="imgCode"]/@src')[0])
    # print(pic_src)
    pic = session.get(url=pic_src, headers=headers, verify=False, proxies=proxies).content
    with open('pic.jpg', 'wb') as fp:
        fp.write(pic)
    
    code = get_code_text('pic.jpg')
    print(code)
    
    post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
    data = {
        "__VIEWSTATE": "ahdYeAQW0HtfdBdmYQKvu1cIOsMVQy6b8+Tl3fFmuwmB//7WZsi1kJXIrAcqfvRP5UVTbb74NTJ389/H6FgBc60xjuUtXmCu6V15vp7reQ3DjcBq01LPXOubOG8=",
        "__VIEWSTATEGENERATOR": "C93BE1AE",
        "from:": "http://so.gushiwen.org/user/collect.aspx",
        "email": "862032955@qq.com",
        "pwd": "123456",
        "code": code,
        "denglu": "登录",
    }
    session.post(url=post_url, headers=headers, data=data, verify=False, proxies=proxies)
    
    detail_url = 'https://so.gushiwen.org/user/collect.aspx'
    d_response = session.get(url=detail_url, verify=False, headers=headers, proxies=proxies).content
    with open('古诗文.html', 'wb') as fp:
        fp.write(d_response)
    古诗网 (Gushiwen) login
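    The __VIEWSTATE and __VIEWSTATEGENERATOR values in the form data above are hard-coded hidden fields copied from one rendering of the ASP.NET login page, so they eventually go stale. A hypothetical helper that reads them from the live page before posting could look like the sketch below; it reuses the session, headers and proxies from the example and assumes the hidden inputs keep their standard ASP.NET ids:

    from lxml import etree

    # hypothetical helper: refresh the ASP.NET hidden form fields from the live login page
    def get_hidden_fields(session, login_url, headers, proxies):
        page = session.get(url=login_url, headers=headers, verify=False, proxies=proxies).content
        tree = etree.HTML(page)
        return {
            '__VIEWSTATE': tree.xpath('//input[@id="__VIEWSTATE"]/@value')[0],
            '__VIEWSTATEGENERATOR': tree.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')[0],
        }

    # usage: data.update(get_hidden_fields(session, url, headers, proxies)) before session.post(...)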
    import requests
    from lxml import etree
    import re
    from multiprocessing.dummy import Pool
    import random
    
    url = 'https://www.pearvideo.com/category_8'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
    
    response = requests.get(url=url, headers=headers, verify=False).content.decode()
    xpath_data = etree.HTML(response)
    li_list = xpath_data.xpath('//*[@id="listvideoListUl"]/li')
    
    # collect the direct video URLs so they can be downloaded concurrently
    video_url_list = []
    
    for li in li_list:
        # print(li)
        v_href = 'https://www.pearvideo.com/' + li.xpath('.//div[@class="vervideo-bd"]/a/@href')[0]
        # print(v_href)
        d_response = requests.get(url=v_href, headers=headers).content.decode()
        video_url = re.findall('srcUrl="(.*?)",', d_response, re.S)[0]
        video_url_list.append(video_url)
        # print(video_url)
    
    # create a thread pool with 5 worker threads
    pool = Pool(5)
    downloadVideo = lambda link: requests.get(url=link, headers=headers).content
    # the list returned by map holds the downloaded binary data of each video
    video_data_list = pool.map(downloadVideo, video_url_list)
    
    def save_video(data):
        # name each file with a random number (collisions are possible); assumes a video/ directory exists
        i = random.randint(1, 1000)
        video_name = 'video/' + str(i) + '.mp4'
        # i = i + 1
        with open(video_name, 'wb') as fp:
            fp.write(data)
    
    pool.map(save_video, video_data_list)
    
    pool.close()
    pool.join()
    Multithreaded download
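    As written, the thread pool first downloads every video into memory and only then writes the files, and random.randint(1, 1000) can produce duplicate file names. An alternative sketch downloads and saves inside one worker using a standard-library thread pool instead of multiprocessing.dummy; it assumes the same headers dict and the video_url_list of direct video URLs collected in the example above:

    import os
    import requests
    from concurrent.futures import ThreadPoolExecutor

    def fetch_and_save(link):
        # derive the file name from the URL so two downloads cannot collide
        name = os.path.join('video', link.rsplit('/', 1)[-1])
        with requests.get(link, headers=headers, stream=True) as resp:
            with open(name, 'wb') as fp:
                # stream the body in 1 MB chunks instead of holding the whole video in memory
                for chunk in resp.iter_content(chunk_size=1024 * 1024):
                    fp.write(chunk)

    os.makedirs('video', exist_ok=True)
    with ThreadPoolExecutor(max_workers=5) as executor:
        # 5 workers mirror the Pool(5) used in the original example
        list(executor.map(fetch_and_save, video_url_list))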
  • Original post: https://www.cnblogs.com/abc23/p/10733229.html