zoukankan html css js c++ java

数据处理

# Author kevin_hou

def _read_times(filename):
    """Return the comma-separated fields from the first line of *filename*.

    Only the first line of the file is read.  Surrounding whitespace
    (including the trailing newline) is stripped from that line before it
    is split on commas; the individual fields are returned as unconverted
    strings.
    """
    with open(filename) as time_file:
        first_line = time_file.readline()
    return first_line.strip().split(',')


# One list of raw time strings per athlete file.
james = _read_times('james.txt')
julie = _read_times('julie.txt')
mikey = _read_times('mikey.txt')
sarah = _read_times('sarah.txt')

# print(james)
# print(julie)
# print(mikey)
# print(sarah)

'''
['2:34', '3:21', '2:34', '2.45', '3.01', '2:01', '2:01', '3:10', '2:22']
['2.59', '2.11', '2:11', '2:23', '3:10', '2:23', '3:10', '3:21', '3-21']
['2:22', '3.01', '3:01', '3.02', '3:02', '3.02', '3:22', '2.49', '2:38']
['2:58', '2.58', '2:39', '2-25', '2-25', '2:54', '2.18', '2:55', '2:55']
'''

# data = [1,9,4,2,6,7,0]
# print(data) #[1, 9, 4, 2, 6, 7, 0]

# data.sort() #原地排序[0, 1, 2, 4, 6, 7, 9]
# print(data)

# data2 = sorted(data)
# print(data) # sorted() leaves the original list unchanged: [1, 9, 4, 2, 6, 7, 0]
# print(data2)    #复制排序[0, 1, 2, 4, 6, 7, 9]


def sanitize(time_string):
    """Return *time_string* with its minutes/seconds separator set to '.'.

    A string containing '-' is split on '-'; otherwise a string containing
    ':' is split on ':'.  The two resulting parts are re-joined with '.'.
    A string containing neither separator is returned unchanged.
    """
    for splitter in ('-', ':'):  # '-' takes precedence over ':'
        if splitter in time_string:
            # The split must yield exactly two parts: minutes and seconds.
            mins, secs = time_string.split(splitter)
            return mins + '.' + secs
    return time_string  # nothing to clean up


# Pair each raw list with the new, initially empty list that receives its
# sanitized values, then fill the new lists one item at a time.
clean_james, clean_julie, clean_mikey, clean_sarah = [], [], [], []
for raw_times, cleaned in (
    (james, clean_james),
    (julie, clean_julie),
    (mikey, clean_mikey),
    (sarah, clean_sarah),
):
    for each_t in raw_times:
        cleaned.append(sanitize(each_t))


# Print a sorted copy of each cleaned list (sorted() defaults to ascending).
for cleaned in (clean_james, clean_julie, clean_mikey, clean_sarah):
    print(sorted(cleaned))

'''
['2.01', '2.01', '2.22', '2.34', '2.34', '2.45', '3.01', '3.10', '3.21']
['2.11', '2.11', '2.23', '2.23', '2.59', '3.10', '3.10', '3.21', '3.21']
['2.22', '2.38', '2.49', '3.01', '3.01', '3.02', '3.02', '3.02', '3.22']
['2.18', '2.25', '2.25', '2.39', '2.54', '2.55', '2.55', '2.58', '2.58']
'''
#默认的，sort（）方法和sorted()  BIF都会按升序对数据排序。
# 要以降序对数据排序，需向sort（）或sorted（）传入参数reverse=True，python会负责具体处理

# Build the cleaned list in one step: apply sanitize() to every entry of mikey.
clean_mikey = [sanitize(raw_time) for raw_time in mikey]

# Multiply every entry of mins by 60.
mins = [1, 2, 3]
secs = [minute * 60 for minute in mins]
print(secs)  # [60, 120, 180]

# Multiply every entry of meters by 3.281.
meters = [1, 10, 3]
feet = [length * 3.281 for length in meters]
print(feet)  # [3.281, 32.81, 9.843]

# Upper-case every word.
lower = ["I", "don't", "like", "span"]
upper = [word.upper() for word in lower]
print(upper)  # ['I', "DON'T", 'LIKE', 'SPAN']

# The same time written with three different separators.
dirty = ['2-22', '2:22', '2.22']
clean = [sanitize(t) for t in dirty]
print(clean)    # ['2.22', '2.22', '2.22']

# Convert the normalized strings to floats.
clean = [float(s) for s in clean]
print(clean)    # [2.22, 2.22, 2.22]

# Normalize and convert in a single comprehension.  float() must wrap only
# the sanitize(t) call: wrapping the whole "... for t in ..." clause would
# hand float() a generator object and raise TypeError.
clean = [float(sanitize(t)) for t in ['2-22', '3:33', '4.44']]
print(clean)    # [2.22, 3.33, 4.44]

查看全文

相关阅读:
Andrew Ng机器学习公开课笔记–Principal Components Analysis (PCA)
Python For Data Analysis -- Pandas
Python For Data Analysis -- NumPy
Python For Data Analysis -- IPython
Andrew Ng机器学习公开课笔记 – Factor Analysis
Andrew Ng机器学习公开课笔记 -- Mixtures of Gaussians and the EM algorithm
Andrew Ng机器学习公开课笔记 -- Online Learning
Machine Learning in Action -- Support Vector Machines
HDU-1090-A+B for Input-Output Practice (II)(骗訪问量的)
五种内部类形式将线程隐藏于类中

原文地址：https://www.cnblogs.com/kevin-hou1991/p/13636200.html