  • Python 3: batch extraction of all domains linked from gambling site pages

    The existing domain information: a seed list of domains, stored one per line in 1.txt.
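
    A hypothetical illustration of the 1.txt format (bare hostnames, one per line, without a scheme, since the script prepends "http://" itself; the domains below are placeholders):

    example-lottery.com
    bet.example.net
    www.example-casino.org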

    The detailed implementation is as follows:

    #!/usr/bin/env python 
    # -*- coding:utf-8 -*-
    import requests
    from bs4 import BeautifulSoup as Bs4
    from urllib.parse import urlparse
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }
    
    #Open the domain file 1.txt
    def new_url():
        url_list = []
        with open("1.txt", "r") as bo: #close the file automatically when done
            for i in bo:
                url_list.append(i.strip()) #drop the trailing newline
        return url_list
    
    
    #Process the data: fetch each page and pull out the domains it links to
    def get_url():
        head_url = new_url()
        num = 0
        for i in head_url: #iterate over the seed domains one by one
            num = num + 1
            print("***********************************" + i + "***********************************")
            # head_url = "https://www.tkcp.hk/"
            try:
                response = requests.get(url="http://" + i, headers=headers)
                response.encoding = 'gb2312' #force GB encoding for the Chinese-language target pages
                soup = Bs4(response.text, "lxml")
                # print(soup)
                htmls = soup.find_all("a") #collect every <a> tag on the page
                # print(htmls)
                urls = []
                new_urls = []
                for html in htmls:
                    url = html.get("href") #read the href attribute of each tag (may be None)
                    if url:
                        urls.append(url.strip())
                qc_urls = set(urls) #deduplicate the raw links
                for url in qc_urls: #process the data: keep absolute URLs and extract the domain
                    if "http" in url: #substring match, so https links pass too
                        res = urlparse(url)
                        # print("parse result:", res)
                        # print("domain:", res.netloc)
                        domain = res.netloc
                        new_urls.append(domain)
                qc_new_urls = set(new_urls) #deduplicate the extracted domains
                #print("***********************************" + str(num) + "***********************************")
                print(qc_new_urls)
                for j in qc_new_urls:
                    # print(j)
                    with open("url_v1.txt", "a+", encoding="utf-8") as f:
                        f.write(j + "\n")
            except Exception as e:
                print("Link unreachable:", i)
        result_list = []
        with open("./url_v1.txt", "r") as result:
            for r in result.readlines():
                result_list.append(r.strip())
        for x in set(result_list): #second pass over the output file: remove duplicate data
            with open("url_end_V.txt", "a+", encoding="utf-8") as f:
                print(x)
                f.write(x + "\n")
    
    if __name__=="__main__":
        get_url()
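
    The domain extraction itself hinges on urllib.parse.urlparse, which splits a URL into components and exposes the host part as the netloc attribute. A minimal standalone sketch of that step (the relative URL is illustrative):

    from urllib.parse import urlparse

    # netloc is only populated when the URL carries a scheme,
    # which is why the script filters for "http" before parsing
    print(urlparse("https://www.tkcp.hk/index.html").netloc)  # www.tkcp.hk
    print(urlparse("relative/path.html").netloc)              # '' (empty string)

    With 1.txt in the working directory, running the script writes the per-page domains to url_v1.txt and the final deduplicated list to url_end_V.txt.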
    
  • Original article: https://www.cnblogs.com/dddjh/p/11806085.html