zoukankan      html  css  js  c++  java
  • 【爬虫】抓取xicidaili可用代理ip

    # coding=utf-8
    import requests
    from lxml import etree
    # Module-level accumulator: run() appends one "protocol://ip:port" string
    # per scraped table row, and the script entry point writes it to disk.
    ips=[]
    def run(page, timeout=10):
        """Scrape one xicidaili listing page and append its proxies to ``ips``.

        Args:
            page: Page number substituted into the ``/nn/{}`` listing URL.
            timeout: Seconds to wait for the HTTP response. Without a timeout
                ``requests.get`` may block indefinitely on a stalled server.

        Every data row of the ``ip_list`` table contributes one
        ``protocol://ip:port`` string, taken from the 6th, 2nd and 3rd cells.
        Rows that lack an IP or a port are skipped instead of producing a
        malformed entry. The accumulated module-level ``ips`` list is printed
        before returning.
        """
        url = "https://www.xicidaili.com/nn/{}"
        # NOTE(review): 'Postman-Token' and 'Cookie' look like values captured
        # from a single Postman session -- confirm they are still required.
        # The duplicate lowercase 'cache-control' entry was dropped: header
        # names are case-insensitive, so it repeated 'Cache-Control'.
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36,PostmanRuntime/7.16.3",
            'Accept': "*/*",
            'Cache-Control': "no-cache",
            'Postman-Token': "e17c0361-c140-4e67-b4d7-1d4297b6876d,2da41bb3-79f5-40fd-a5a7-63c0acbd4442",
            'Host': "www.xicidaili.com",
            'Accept-Encoding': "gzip, deflate",
            'Cookie': "_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWYyNTA3YjBmOWFjNDAxOWJhYWEzNDg4YWQ0OTU5ZjYyBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUkxQnBlMzlsNmR3bExnWHltNklaWjFIdDJyNkdiVzE0cXUwR094TlErczQ9BjsARg%3D%3D--108c1be9a4e23604bde585654cfee79143f53fb6",
        }
        r = requests.get(url.format(page), headers=headers, timeout=timeout)

        selector = etree.HTML(r.text)
        rows = selector.xpath('//table[@id="ip_list"]//tr')
        # The first <tr> is skipped, as in the original (presumably the table
        # header -- verify against the page markup).
        for info in rows[1:]:
            ip = ''.join(info.xpath('./td[2]/text()')).strip()
            port = ''.join(info.xpath('./td[3]/text()')).strip()
            protocol = ''.join(info.xpath('./td[6]/text()')).strip()
            if not ip or not port:
                # Row without address cells: skip rather than store "://:".
                continue
            ips.append(protocol + "://" + ip + ":" + port)
        print(ips)

    # Store the collected proxies in a text file.
    def write_to_txt(lists, path='ips.txt'):
        """Write the given strings to a UTF-8 text file, separated by spaces.

        Args:
            lists: Iterable of strings to store (e.g. proxy URLs).
            path: Destination file. Defaults to ``'ips.txt'`` in the current
                working directory, matching the previous hard-coded name.

        The file is opened in ``'w'`` mode, so existing content is replaced.
        An empty iterable produces an empty file.
        """
        with open(path, 'w', encoding='utf-8') as f:
            f.write(" ".join(lists))


    if __name__=="__main__":
        # Scrape listing pages 1 through 4; each run() call appends its results
        # to the module-level ips list.
        for i in range(1,5):
            print("==================同步第{}页=====================".format(i))
            run(i)
        # Persist once after all pages are collected, then report the total.
        # The file is opened in 'w' mode, so a single final write yields the
        # same file as rewriting it after every page.
        write_to_txt(ips)
        print(len(ips))

  • 相关阅读:
    deepin之创建快捷idea启动方式
    python-docx读取doc,docx文档
    Jenkins节点配置-K8S云节点
    K8S创建用户RBAC授权
    在K8S中部署禅道zentao
    yum常用操作
    Git常用命令及方法大全
    rocket mq 1
    基于SpringBoot+LayUI+Freemarker+Mybatis的通用后台管理系统
    Struts+Servlet+JDBC网上手机销售系统
  • 原文地址:https://www.cnblogs.com/winstonsias/p/11528021.html
Copyright © 2011-2022 走看看