zoukankan      html  css  js  c++  java
  • python获取数据网页数据并创建文件夹保存(基于python3.6)

    from urllib.parse import urljoin
    import urllib.request
    
    from bs4 import BeautifulSoup
    
    
    import os
    import datetime
    import re
    import errno
    
    
    def mkdir_p(path):
        """Create ``path`` together with any missing parent directories.

        Works like ``mkdir -p``: if the directory already exists the call is a
        no-op.

        Args:
            path: Directory path to create.

        Raises:
            OSError: If the directory cannot be created, including the case
                where ``path`` already exists but is not a directory.
        """
        # exist_ok=True (Python 3.2+) replaces the manual errno.EEXIST check.
        os.makedirs(path, exist_ok=True)
    
    def get_link(page):
        """Collect the ``href`` of every ``<a>`` found inside a ``<td>`` cell.

        Args:
            page: Parsed document exposing ``find_all('td')``; each returned
                cell must expose ``select("a")`` and each anchor ``get('href')``
                (``gain`` passes a BeautifulSoup object).

        Returns:
            list: The href values in the order they are encountered. Anchors
            that have no ``href`` attribute are skipped, so the list never
            contains ``None``.
        """
        hrefs = []
        for cell in page.find_all('td'):
            for anchor in cell.select("a"):
                href = anchor.get('href')
                # An <a> without href yields None, which callers cannot split.
                if href is not None:
                    hrefs.append(href)
        return hrefs
    
    
    def gain(url):
        """Download ``url`` and return the link targets found in its table cells.

        Args:
            url: Address of the page to fetch.

        Returns:
            list: Whatever ``get_link`` extracts from the parsed page.

        Raises:
            urllib.error.URLError: If the page cannot be fetched.
        """
        # Close the HTTP response deterministically instead of leaving the
        # connection open until garbage collection.
        with urllib.request.urlopen(url) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'lxml')  # parse with the lxml backend
        return get_link(soup)
    
    
    def main():
        """Download one ``.js`` data file per link found on the China index page.

        For every href returned by ``gain``, the third ``/``-separated component
        is used as the file stem: ``<stem>.js`` is fetched from the ``/tides/``
        base URL and stored under ``D:\\TideData\\China``. Files that already
        exist locally are not downloaded again. The elapsed time of each
        download is printed.
        """
        url = 'https://www.tide-forecast.com/countries/China'
        url_tide = 'https://www.tide-forecast.com/tides/'
        # Raw string: '\T' and '\C' are invalid escape sequences in a normal
        # literal and only work by accident (with a warning).
        out_dir = r'D:\TideData\China'

        links = gain(url)
        mkdir_p(out_dir)  # loop-invariant, so create the target folder once

        for href in links:
            if not href:
                continue
            parts = href.split("/")
            if len(parts) < 3:
                # No third path component to name the file after.
                # presumably hrefs look like /<section>/<name>/... — verify
                # against the live page.
                continue
            file_name = parts[2] + '.js'
            connet = urljoin(url_tide, file_name)       # full download URL
            target = os.path.join(out_dir, file_name)   # full local file path
            print(connet)

            # Test the file itself; testing the directory is never true for
            # isfile(), which made every run re-download everything.
            if os.path.isfile(target):
                print('文件已存在')
                continue

            start = datetime.datetime.now().replace(microsecond=0)

            # Context managers close the response and the file even on errors.
            with urllib.request.urlopen(connet) as response:
                content = response.read()
            with open(target, "wb") as fp:
                fp.write(content)

            finished = datetime.datetime.now().replace(microsecond=0)
            print("用时: ", end='')
            print(finished - start)
    
    
    
    
    
    # Run the download only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        main()
    
    
  • 相关阅读:
    linux samba 配置
    实例解读 linux 网卡驱动
    Linux操作系统的安全管理设置
    找回Linux/Unix下各系统的密码
    CF441E Valera and Number
    CF1175F The Number of Subpermutations 题解
    CF1553 比赛记录
    P5618 [SDOI2015]道路修建 题解
    CF 1530 比赛记录
    AT2063 [AGC005E] Sugigma: The Showdown 题解
  • 原文地址:https://www.cnblogs.com/jpfss/p/9235193.html
Copyright © 2011-2022 走看看