# Process the New York (NYC) check-in dataset with pandas.
import pandas as pd
import numpy as np
import datetime
# Column labels applied to the header-less input file, in file order.
names = [
    'User_id',
    'Venue_id',
    'Venue_category_id',
    'Venue_name',
    'Latitude',
    'Longitude',
    'Timezone_offset',
    'UTC_time',
]

# Raw input files and the CSV each one is converted to.
# NOTE(review): the input paths look as if a directory separator was lost
# (e.g. "data/dataset_..."); left unchanged here -- confirm the real path.
file01 = 'datadataset_TSMC2014_NYC.txt'
file02 = 'datadataset_TSMC2014_TKY.txt'
save_file01 = 'nyc_data.csv'
save_file02 = 'tky_data.csv'

# 1 selects the NYC input/output pair; any other value selects the TKY pair.
select = 1
if select == 1:
    file, save_file = file01, save_file01
else:
    file, save_file = file02, save_file02

# The fields must be split on tabs, not on single spaces: the UTC_time field
# is later parsed with the format '%a %b %d %H:%M:%S %z %Y', i.e. it contains
# spaces itself, so a space delimiter would break every row into too many
# columns and misalign them against `names`.
# NOTE(review): if this raises UnicodeDecodeError the file is probably not
# UTF-8; passing encoding='latin-1' may be needed -- confirm.
papa = pd.read_csv(file, sep='\t', header=None, names=names)
def get_stamp(df):
    """Return the Unix timestamp (whole seconds) for one record's UTC time.

    Args:
        df: A single record supporting ``df['UTC_time']`` lookup (for
            example a DataFrame row). The value must be a string shaped
            like ``'Tue Apr 03 18:00:09 +0000 2012'``.

    Returns:
        The number of seconds since the Unix epoch, as an ``int``.

    Raises:
        ValueError: If the string does not match the expected format.
    """
    # Weekday, month, day, clock time, UTC offset, year.
    layout = '%a %b %d %H:%M:%S %z %Y'
    moment = datetime.datetime.strptime(df['UTC_time'], layout)
    # %z makes the datetime timezone-aware, so timestamp() honours the
    # offset given in the string instead of the machine's local zone.
    seconds = moment.timestamp()
    return int(seconds)
# Derive an integer 'Timestamp' column by running get_stamp on every row.
papa['Timestamp'] = papa.apply(get_stamp, axis=1)

# The raw time columns are not written to the output once 'Timestamp' exists.
papa = papa.drop(columns=['Timezone_offset', 'UTC_time'])

# Order the rows by user, then by time within each user, and renumber
# the index from zero so it follows the new order.
ordered = papa.sort_values(by=['User_id', 'Timestamp'], axis=0, ascending=True)
papa = ordered.reset_index(drop=True)

# Write the result with a header row and no index column; missing values
# are written as the literal text NULL.
papa.to_csv(save_file, index=False, header=True, na_rep="NULL")

# Read the saved file back and print a preview plus summary statistics.
data = pd.read_csv(save_file, sep=',')
print(data.head())
print(data.describe())