zoukankan      html  css  js  c++  java
  • 数据分析进阶 数据清洗

    练习一

    要求:使用csv模块,从给定的csv文件中获取name和data

      name的值在表头的第二列,data就是表数据

    import csv
    import os
    
    def parse_file(datafile):
        name=''
        data = []
        #以读的模式打开csv文件
        with open(datafile,'rb') as f:
           #将文件读进缓存中
            csv_data = csv.reader(f)
            #使用next方法获默认获取表头的标题,[1]表示获取第2个单元格的值
            name = csv_data.next()[1]
            #跳过表头,下移一行来获取表格的内容  
            csv_data.next()
            #循环数据,将每一行添加到data中
            for line in csv_data:
                data.append(line)
        return (name,data)

    测试函数

    def test():
    #如果执行过程中不报错,即为正确 datafile
    = os.path.join(DATADIR,DATAFILE) name,data = parse_file(datafile) assert name == "MOUNTAIN VIEW MOFFETT FLD NAS" assert data[0][1] == "01:00" assert data[2][0] == "01/01/2005" assert data[2][5] == "2" if __name__ == '__main__': test()

    2.Excel至csv

     要求:从excel中将COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST的最大的负载量和所对应的时间取出,

     然后把这些值存到新的excel表中

    import xlrd
    import csv
    import os
    #源文件
    datafile = '2013_ERCOT_Hourly_Load_Data.xls'
    #目标文件
    outfile = '2013_Max_loads.xls'
    def parse_file(datafile):
        #打开excel文件
        workbook = xlrd.open_workbook(datafile)
        #获取sheet页
        sheet = workbook.sheet_by_index(0)
        data = {}#存放结果集
        #循环需要获取的列名的下标,一共是8个
        for i in range(1,9):
            #获取列的名称,作为字典的key,第一个值是row,第二个值是column
            station = sheet.cell_value(0,i)
            #获取[1,8]的列值
            cv = sheet.col_values(i,start_rowx=1,end_rowx=None)
            #获取该列的最大值
            maxval = max(cv)
            #获取最大值的索引,因为索引从0开始计数,所以要+1
            maxpos = cv.index(maxval)+1
            #获取最大值所对应的时间
            maxtime = sheet.cell_value(maxpos,0)
            #将excel的时间转为python中的时间
            realtime = xlrd.xldate_as_tuple(maxtime,0)
            #将数据封装成字典,key是列头的名称,每一个key的value包含最大值和最大值的时间
            data[station] = {
                'maxval':maxval,
                'maxtime':realtime
            }
        return data
    def save_file(data,filename):
        #以写文件的状态打开csv文件
        with open(filename,'w') as f:
            #写文件,打开文件句柄,字段与字段之间用'|'分割
            w = csv.writer(f,delimiter='|')
            #写表头
            w.writerow(['Station','Year','Month','Day','Hour','Max Load'])
            #根据传入的data写内容
            for line in data:
    #获取时间的值 year,month,day,hour,_,_
    =data[line]['maxtime']
    #写数据 w.writerow([line,year,month,day,hour,data[line][
    'maxval']])

    测试函数

    def test():
        data = parse_file(datafile)
        save_file(data, outfile)
    
        number_of_rows = 0
        stations = []
    
        ans = {'FAR_WEST': {'Max Load': '2281.2722140000024',
                            'Year': '2013',
                            'Month': '6',
                            'Day': '26',
                            'Hour': '17'}}
        correct_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                            'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
        fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load']
    
        with open(outfile) as of:
            csvfile = csv.DictReader(of, delimiter="|")
            for line in csvfile:
                station = line['Station']
                if station == 'FAR_WEST':
                    for field in fields:
                        # Check if 'Max Load' is within .1 of answer
                        if field == 'Max Load':
                            max_answer = round(float(ans[station][field]), 1)
                            max_line = round(float(line[field]), 1)
                            assert max_answer == max_line
    
                        # Otherwise check for equality
                        else:
                            assert ans[station][field] == line[field]
    
                number_of_rows += 1
                stations.append(station)
    
            # Output should be 8 lines not including header
            assert number_of_rows == 8
    
            # Check Station Names
            assert set(stations) == set(correct_stations)
    if __name__ == "__main__":
        test()

    如果测试通过会生成'2013_Max_loads.xls'的excel文件同时函数的值不会报错

    3.读取json

    要求:从json文件中提取出相应的值,提取文章的标题和url,标题按照模块分组,url按照标准格式分组

    import json
    import codecs
    
    
    def get_from_file(kind, period):
       '''返回json格式的数据'''
       #获取json文件名
        filename = "popular-{0}-{1}.json".format(kind, period)
        #以读的形式打开此文件,同时转为json数据格式
        with open(filename, "r") as f:
            return json.loads(f.read())
    
    
    def article_overview(kind,period):
        #获取json格式的数据
        data = get_from_file(kind, period)
        titles = [] #存放需获取的标题
        urls = [] #存放需要获取的url
        for article in data:
            title = article['title'] #根据key来获取title
            section = article['section'] #根据key获取section
            #封装titles,key为section,value为title
            titles.append({section:title})
            #找media下的metadata下的format是标准格式的url
            if 'media' in article:
                for media in article['media']:#找到media
                    for metadata in media['media-metadata']:#找到media下的media-metadata
                        if metadata['format'] == 'Standard Thumbnail':#判断是否是标准格式
                            urls.append(metadata['url'])#是就加入到列表中最后返回
        return (titles,urls)

    测试函数

    def test():
    #如果执行过程中不报错,即为正确 titles, urls
    = article_overview("viewed", 1) assert len(titles) == 20 assert len(urls) == 30 assert titles[2] == {'Opinion': 'Professors, We Need You!'} assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg' if __name__ == "__main__": test()
  • 相关阅读:
    WPF ViewModel 调用任意前台控件的方法
    xxxx
    modelsim一些error(warning)的原因
    [verilog] inout端口处理
    [c语言]指针数组和数组指针
    电机控制术语
    MAC和PHY关系
    IAR map文件说明
    [corterm3]汇编语法
    TI 2802x系列中断系统及应用
  • 原文地址:https://www.cnblogs.com/luhuajun/p/7879802.html
Copyright © 2011-2022 走看看