zoukankan      html  css  js  c++  java
  • Python获取网页数据并创建文件夹保存(基于Python 3.6)

    from urllib.parse import urljoin
    import urllib.request
    from bs4 import BeautifulSoup
    import os
    import datetime
    import re
    import errno
    
    def mkdir_p(path):
        """Create directory *path*, including any missing parent directories.

        Does nothing when *path* already exists as a directory.

        Args:
            path: Directory path to create.

        Raises:
            OSError: If *path* exists but is not a directory, or if the
                directory cannot be created for any other reason.
        """
        # exist_ok=True tolerates an existing directory but still raises
        # FileExistsError (an OSError) when the path is a non-directory, which
        # is the same contract as a manual errno.EEXIST / isdir() check.
        os.makedirs(path, exist_ok=True)
    
    def get_link(page):
        """Collect the ``href`` of every ``<a>`` element inside ``<td>`` cells.

        Args:
            page: Parsed document exposing ``find_all``; each returned cell
                must expose ``select`` and each anchor must expose ``get``
                (``gain`` passes a BeautifulSoup object).

        Returns:
            A list of href values in document order. Anchors that carry no
            ``href`` attribute are skipped, so the list never contains ``None``.
        """
        hrefs = []
        # A distinct loop name keeps the ``page`` argument from being shadowed.
        for cell in page.find_all('td'):
            for anchor in cell.select("a"):
                href = anchor.get('href')
                # ``get`` yields None for anchors without an href (for example
                # named anchors); callers split the value as a string.
                if href is not None:
                    hrefs.append(href)
        return hrefs
    
    def gain(url):
        """Download *url* and return the link hrefs found in its table cells.

        Args:
            url: Address of the HTML page to fetch.

        Returns:
            The list produced by ``get_link`` for the parsed page.
        """
        # The context manager closes the HTTP response even if read() raises.
        with urllib.request.urlopen(url) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'lxml')  # parse with the lxml backend
        return get_link(soup)
    def main():
        """Download the ``.js`` tide data file for every link on the China page.

        For each href returned by ``gain``, the third ``/``-separated segment
        is used as the file stem (presumably the location name for hrefs
        shaped like ``/locations/<name>/...`` -- confirm against the live
        page). The file is fetched from the ``/tides/`` path and stored in the
        local data directory. Files that already exist locally are not
        downloaded again.
        """
        url = 'https://www.tide-forecast.com/countries/China'
        url_Tide = 'https://www.tide-forecast.com/tides/'
        # Raw string: '\T' and '\C' are not valid escape sequences, so the
        # plain literal only worked by accident and triggers a warning.
        save_dir = r'D:\TideData\China'
        mkdir_p(save_dir)  # loop-invariant, so create the folder once

        for href in gain(url):
            # Split on '/' and take the third element as the file stem.
            parts = href.split("/") if href else []
            if len(parts) < 3 or not parts[2]:
                # Too short to contain the expected segment; skip it rather
                # than fail with IndexError.
                continue
            file_name = parts[2] + '.js'
            connet = urljoin(url_Tide, file_name)  # build the download URL
            target = os.path.join(save_dir, file_name)  # build the local path
            print(connet)

            # Check the target file itself; testing the directory with
            # isfile() is always False, so nothing would ever be skipped.
            if os.path.isfile(target):
                print('文件已存在')
                continue

            start = datetime.datetime.now().replace(microsecond=0)  # timer
            with urllib.request.urlopen(connet) as response:
                content = response.read()
            with open(target, "wb") as fp:
                fp.write(content)
            finish = datetime.datetime.now().replace(microsecond=0)
            print("用时: ", end='')
            print(finish - start)

    if __name__ == '__main__':
        main()

    来源于:https://www.cnblogs.com/setname/p/7453778.html

  • 相关阅读:
    redis发布订阅
    redis学习笔记(面试题)
    redis安全 (error) NOAUTH Authentication required
    HDU3001 Travelling —— 状压DP(三进制)
    POJ3616 Milking Time —— DP
    POJ3186 Treats for the Cows —— DP
    HDU1074 Doing Homework —— 状压DP
    POJ1661 Help Jimmy —— DP
    HDU1260 Tickets —— DP
    HDU1176 免费馅饼 —— DP
  • 原文地址:https://www.cnblogs.com/hankleo/p/10649952.html
Copyright © 2011-2022 走看看