  • Scraping free Xici proxies with Python

    #!/usr/local/bin/python3.7
    
    """
    @File    :   xicidaili.py
    @Time    :   2020/06/02
    @Author  :   Mozili
    
    """
    
    import urllib.request
    import urllib.parse
    from lxml import etree
    import random
    import time
    
    def handler_request(url):
        # Request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
        }
        # Build the request
        req = urllib.request.Request(url=url, headers=headers)
        # Send the request
        res = urllib.request.urlopen(req)
        # Read and decode the response body
        cot = res.read().decode()
        return cot
    
    def preserve_data(ips, ports, types):
        # Pair each IP with its port and protocol type, then append one line per proxy
        with open('Reptile/daili.txt', 'a', encoding='utf-8') as fp:
            for ip, port, proxy_type in zip(ips, ports, types):
                line = proxy_type + ' ' + ip + ':' + port + '\n'
                fp.write(line)
            
    def download_content(tree):
        # Extract IP addresses (only table rows with class='odd' are matched)
        ips = tree.xpath("//tr[@class='odd']/td[2]/text()")
        # Extract ports
        ports = tree.xpath("//tr[@class='odd']/td[3]/text()")
        # Extract protocol types (HTTP/HTTPS)
        types = tree.xpath("//tr[@class='odd']/td[6]/text()")
        # Save the data to a text file
        preserve_data(ips, ports, types)
    
    
    if __name__ == "__main__":
        # Read the page range to crawl
        start_page = int(input('Enter the start page: '))
        end_page = int(input('Enter the end page: '))
        # Listing URLs for each proxy category (each is paginated)
        url_list = [
            'https://www.xicidaili.com/nn/',
            'https://www.xicidaili.com/nt/',
            'https://www.xicidaili.com/wn/',
            'https://www.xicidaili.com/wt/',
            'https://www.xicidaili.com/qq/'
            ]
        for url in url_list:
            for page in range(start_page, end_page + 1):
                new_url = url + str(page)
                # Fetch the page HTML
                content = handler_request(new_url)
                # Throttle requests a little
                time.sleep(1)
                # Parse the HTML into an lxml tree
                tree = etree.HTML(content)
                # Extract and save the proxies on this page
                download_content(tree)
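
Once Reptile/daili.txt has been written, its entries can be fed back into urllib as proxies. The following is a minimal sketch of one way to do that, assuming the "TYPE ip:port" line format produced by preserve_data above; the test URL http://httpbin.org/ip is only an illustration and is not part of the original script.

    import random
    import urllib.request

    # Load the proxies written by preserve_data(); each line looks like "HTTP 1.2.3.4:8080"
    with open('Reptile/daili.txt', 'r', encoding='utf-8') as fp:
        proxies = [line.strip().split(' ', 1) for line in fp if line.strip()]

    # Pick one entry at random and build a {scheme: 'ip:port'} mapping for ProxyHandler
    proxy_type, address = random.choice(proxies)
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({proxy_type.lower(): address}))

    # Free proxies fail often, so keep a timeout and catch errors
    try:
        with opener.open('http://httpbin.org/ip', timeout=5) as res:
            print(res.read().decode())
    except Exception as exc:
        print('Proxy check failed:', exc)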
        
  • Original post: https://www.cnblogs.com/lxmtx/p/13031894.html