zoukankan html css js c++ java

小爬虫-从PhysioNet上下载MIT-BIH Arrhythmia Database的ECG数据


import urllib.request
import os


def url_open(url):
    """Open *url* and return the raw response body as bytes.

    A browser-like User-Agent header is attached to the request.
    Errors raised by ``urllib.request.urlopen`` propagate to the caller.
    """
    req = urllib.request.Request(url)
    # Adjacent string literals are concatenated by the parser; this keeps
    # the long header value on several lines without breaking the literal.
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Safari/537.36',
    )
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()


def save_file(file_url):
    """Download *file_url* and save it in the current working directory.

    The local file name is the last '/'-separated component of the URL.
    Errors from the download or from writing the file propagate to the
    caller.
    """
    # get file name
    filename = file_url.split('/')[-1]
    # Download BEFORE opening the output file: opening with 'wb' creates or
    # truncates the file, so a failed download would otherwise leave an
    # empty file behind (or wipe a previously saved good copy).
    content = url_open(file_url)
    # write file to local
    with open(filename, 'wb') as f:
        f.write(content)


def download_file(folder="files"):
    """Download the ECG header files ('*.hea') of record numbers 100..234.

    *folder* is created if it does not exist and becomes the current
    working directory (the change persists after this call), because
    save_file() writes into the current directory. A record number whose
    download fails is skipped. Afterwards, any zero-byte '<number>.hea'
    file left in the folder is removed.
    """
    # Local import: only needed for the exception type caught below.
    import http.client

    # build the folder if it doesn't exist (exist_ok avoids the
    # check-then-create race)
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    # base url
    url = "https://physionet.org/physiobank/database/mitdb/"

    # file names of the ECG signal header files '*.hea'
    names = [str(number) + '.hea' for number in range(100, 235)]

    for name in names:
        try:
            save_file(url + name)
        except (OSError, http.client.HTTPException):
            # Best effort: network/HTTP errors (urllib's URLError and
            # HTTPError are OSError subclasses) and file errors skip this
            # record. KeyboardInterrupt is deliberately NOT caught, so the
            # loop can be interrupted.
            continue

    # discard the empty files among the names this function handles;
    # unrelated files in the folder are left untouched
    for name in names:
        if os.path.isfile(name) and os.path.getsize(name) <= 0:
            os.remove(name)


# Script entry point: run the download only when executed directly,
# not when this module is imported.
if "__main__" == __name__:
    download_file()

查看全文

相关阅读:
Spring IoC
常见切入点表达式的例子（aop execution 表达式）
数据结构与算法（2）栈、中缀表达式、递归
 数据结构与算法（1）稀疏数组、队列、链表
 airflow实践
 head first 设计模式笔记13-与设计模式相处，剩下的模式，模式的分类
 head first 设计模式笔记12-复合模式
 head first 设计模式笔记11-代理模式
 head first 设计模式笔记10-状态模式
 WebDriver自动化测试常用处理方法

原文地址：https://www.cnblogs.com/siucaan/p/9623186.html

最新文章
改变vs2012背景（不透明）
Compute By
中断异常
 软中断
 MVC模式
 MapReduce
30类CSS选择器
 乔姆斯基范式
 PSPACE
使用web scraper抓取分页内容