  • Python3: batch-extract all the domains linked from gambling site pages

    The domain information already on hand (the seed list read from 1.txt)
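    For illustration only, 1.txt is assumed to hold one bare domain per line; the entries below are placeholders, not the actual list:

        example-lottery-a.com
        example-lottery-b.net
        example-bet-c.org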

    The detailed implementation is as follows: read the seed domains, fetch each page, pull the domain out of every link on it, and write the deduplicated results to a file.

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import requests
    from bs4 import BeautifulSoup as Bs4
    from urllib.parse import urlparse
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }
    
    # Read the seed domains from 1.txt, one per line
    def new_url():
        url_list = []
        with open("1.txt", "r") as bo:
            for i in bo:
                url_list.append(i.replace("\n", ""))
        return url_list
    
    
    # Fetch each seed page and collect the domains of every link on it
    def get_url():
        head_url = new_url()
        num = 0
        for i in head_url:  # walk the seed list line by line
            num = num + 1
            print("***********************************" + i + "***********************************")
            # example seed: "https://www.tkcp.hk/"
            try:
                response = requests.get(url="http://" + i, headers=headers)
                response.encoding = 'gb2312'  # these pages are served as GB2312
                soup = Bs4(response.text, "lxml")
                htmls = soup.find_all("a")  # every <a> tag on the page
                urls = []
                new_urls = []
                for html in htmls:
                    url = html.get("href")  # href attribute; None when absent
                    if url:
                        urls.append(url.replace('\n', ''))
                qc_urls = set(urls)  # dedupe the raw hrefs
                for url in qc_urls:  # keep absolute links and reduce them to domains
                    if "http" in url:
                        res = urlparse(url)
                        domain = res.netloc
                        new_urls.append(domain)
                qc_new_urls = set(new_urls)  # dedupe the domains
                print(qc_new_urls)
                for j in qc_new_urls:
                    with open("url_v1.txt", "a+", encoding="utf-8") as f:
                        f.write(j + "\n")
            except Exception as e:
                print("Link unreachable:", e)
        # second pass: dedupe across all pages into the final file
        result_list = []
        with open("./url_v1.txt", "r") as result:
            for r in result.readlines():
                result_list.append(r.replace("\n", ""))
        for x in set(result_list):
            with open("url_end_V.txt", "a+", encoding="utf-8") as f:
                print(x)
                f.write(x + "\n")
    
    if __name__ == "__main__":
        get_url()
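    The core of the extraction is urlparse(): it splits a URL into components, and .netloc is the host part that the script writes out. A minimal sketch (the path in the URL below is made up for illustration):

    from urllib.parse import urlparse

    # urlparse returns a ParseResult with scheme, netloc, path, etc.
    res = urlparse("https://www.tkcp.hk/hk/index.html")
    print(res.scheme)  # https
    print(res.netloc)  # www.tkcp.hk -- the value appended to new_urls

    Because only the netloc is kept, links to different pages on the same host collapse to a single entry once the results pass through set().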
    
  • Original article: https://www.cnblogs.com/dddjh/p/11806085.html