zoukankan      html  css  js  c++  java
  • Python学习26

    python—简单数据抓取日常三(IP地址代理)

     


    学习内容:

    1、简单IP地址代理
    2、利用蘑菇代理实现IP地址代理刷新本地ip地址
    3、利用蘑菇代理实现IP地址代理抓取安居客信息并实现多进程(multiprocessing 进程池)


    1、简单IP地址代理

    import requests
    from lxml import etree
    
    # Proxy mapping handed to requests: URL scheme -> "proxy-ip:port".
    # The value below is a placeholder and must be replaced with a real proxy.
    proxy = {"http": "代理ip:端口号"}
    
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    
    # requests accepts the proxy mapping through the `proxies` keyword; the
    # previous `proxy=` keyword is not a valid argument and raised TypeError
    # before any request was sent.
    source = requests.get('http://2021.ip138.com/', headers=headers, proxies=proxy).text
    # Parse the response once and reuse the tree for both XPath queries.
    page_tree = etree.HTML(source)
    demo = page_tree.xpath('/html/body/p[1]/a/text()')
    content = page_tree.xpath('/html/body/p[1]/text()[2]')
    print(demo)
    print(content)
    

    2、利用蘑菇代理实现IP地址代理刷新本地ip地址

    import requests
    from lxml import etree
    
    
    # Mogu proxy tunnel order key; it is sent as the Basic credential in the
    # Proxy-Authorization header below.
    # NOTE(review): the key is embedded in the source — consider loading it from
    # the environment instead.
    appKey = "Nk1WTVBqODJDMlVmOWdkRDp5cGY2SWo0RGJzZGYzNnow"
    # Mogu tunnel proxy server address (host:port).
    ip_port = 'secondtransfer.moguproxy.com:9001'
    # Route both http and https traffic through the tunnel.
    proxy = {scheme: scheme + "://" + ip_port for scheme in ("http", "https")}
    
    headers = {
      "Proxy-Authorization": f"Basic {appKey}",
      "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
      "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"
    }
    
    # Fetch the IP-lookup page through the proxy without certificate
    # verification and without following redirects.
    response = requests.get(
        'http://2021.ip138.com/',
        headers=headers,
        proxies=proxy,
        verify=False,
        allow_redirects=False,
    )
    source = response.text
    # Build the element tree once; both XPath queries read from it.
    document = etree.HTML(source)
    demo = document.xpath('/html/body/p[1]/a/text()')
    content = document.xpath('/html/body/p[1]/text()[2]')
    for extracted in (demo, content):
        print(extracted)
    多次运行的输出结果不同(每次请求经过的代理IP不同):
    ['106.35.173.120']
    ['] 来自:中国内蒙古包头 电信\n']
    ['223.242.246.60']
    ['] 来自:中国安徽淮南田家庵区 电信\n']

    3、利用蘑菇代理实现IP地址代理抓取安居客信息并实现多进程(multiprocessing 进程池)

    import requests
    from lxml import etree
    from multiprocessing import Pool
    import re
    # Mogu proxy tunnel order key; used below as the Basic credential of the
    # Proxy-Authorization header.
    appKey = "Nk1WTVBqODJDMlVmOWdkRDp5cGY2SWo0RGJzZGYzNnow"
    # Mogu tunnel proxy server address (host:port).
    ip_port = 'secondtransfer.moguproxy.com:9001'
    # Proxy mapping (URL scheme -> proxy URL) passed to requests via `proxies=`.
    proxy = {"http": "http://" + ip_port, "https": "https://" + ip_port}
    
    # Request headers sent with every page fetch, including the proxy credential.
    headers = {
      "Proxy-Authorization": 'Basic ' + appKey,
      "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
      "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"
    }
    
    # District slugs that index() inserts into the listing URL.
    lists = ['xiqing', 'tanggu', 'nankai', 'jinnan', 'wuqing', 'hedong', 'hexi', 'dongli']
    
    def index(page):
        """Fetch one result page for every district in ``lists`` and print its listing fields.

        The function used to be defined inside ``for i in range(8)``. Every pass
        rebound the same name and the body read the global ``i`` only when it
        was called, so after the loop finished just the last district was ever
        requested. Iterating over the districts inside the function covers all
        of them while keeping the ``index(page)`` signature.

        Args:
            page: Result page number placed in the ``/p<page>/`` URL segment.
        """
        for district in lists:
            # NOTE(review): the path keeps the hard-coded 'jinnan' segment ahead
            # of the district slug exactly as before — confirm this is the
            # intended listing URL.
            url = 'https://tianjin.anjuke.com/sale/jinnan/' + str(district) + '/p' + str(page) + '/?from=SearchBar'
            source = requests.get(url, headers=headers, proxies=proxy, verify=False, allow_redirects=False).text
            # Parse the response once; all four XPath queries share the tree.
            tree = etree.HTML(source)
            name = tree.xpath('//*[@id="__layout"]/div/section/section[3]/section[1]/section[2]/div/a/div[2]/div[1]/div[1]/h3/text()')
            content1 = "".join(tree.xpath('//*[@id="__layout"]/div/section/section[3]/section[1]/section[2]/div/a/div[2]/div[1]/section/div[1]/p[1]/span/ text()'))
            content2 = tree.xpath('// *[ @ id = "__layout"]/div/section/section[3]/section[1]/section[2]/div/a/div[2]/div[1]/section/div[2]/p/text()')
            content3 = tree.xpath('//*[@id="__layout"]/div/section/section[3]/section[1]/section[2]/div/a/div[2]/div[1]/section/div[3]/span/text()')
            print(district)
            print(name)
            # Insert a comma after each '卫' in the joined text, as the output did before.
            print(content1.replace('卫', '卫,'))
            print(content2)
            print(content3)
            print('===========当前在第' + str(page) + '页=================')
    
    
    if __name__ == '__main__':
        def report_failure(error):
            """Print an exception raised by a page task.

            apply_async discards worker exceptions unless the result is read or
            an error_callback is supplied, so without this a failed page would
            vanish silently.
            """
            print('Page task failed: ' + repr(error))

        # A process pool with a single worker: page tasks run one after another.
        p = Pool(1)
        # Queue result pages 1 through 50.
        for page in range(1, 51):
            p.apply_async(index, args=(page,), error_callback=report_failure)
        print('Waiting for all subprocesses done...')
        # No more tasks will be submitted; wait for the queued ones to finish.
        p.close()
        p.join()
        print('All subprocesses done.')
  • 相关阅读:
    Linux基础命令—网卡
    SHOW SLAVE STATUS解读
    perf工具crash的问题
    python学习之-requests模块基础
    DELL IDRAC API接口开发文档翻译及client模块
    cobbler ks文件解释--转载
    django学习之- 动态验证码学习
    django学习之- Ajax
    django学习之- modelForm
    django学习之- json序列化
  • 原文地址:https://www.cnblogs.com/tangmf/p/14331259.html
Copyright © 2011-2022 走看看