1 import pandas as pd 2 import numpy as np 3 import codecs 4 import math 5 6 doc1 = codecs.open('dataset_TIST2015/dataset_TIST2015_Checkins.txt','rU','latin-1') 7 checkins = pd.read_csv(doc1, delimiter=' ') 8 checkins.columns=['userId', 'venueId', 'timeUTC', 'timeOffset'] 9 10 len(checkins['userId'].unique()) 11 len(checkins['venueId'].unique()) 12 13 from sklearn.preprocessing import LabelEncoder 14 from scipy.sparse import csr_matrix 15 16 venueIdencoder = LabelEncoder().fit(checkins['venueId']) 17 userIdencoder = LabelEncoder().fit(checkins['userId']) 18 19 checkins['venueIdencoded'] = venueIdencoder.transform(checkins['venueId']) 20 n_venues = len(venueIdencoder.classes_) 21 22 from sklearn.cross_validation import train_test_split 23 24 train_df, test_df = train_test_split(checkins, train_size = 0.8) 25 26 train = csr_matrix((np.ones(train_df.shape[0]), (train_df.userId, train_df.venueIdencoded)), shape=((train_df.userId.max()+1),n_venues)) 27 28 test = csr_matrix((np.ones(test_df.shape[0]), (test_df.userId, test_df.venueIdencoded)), shape=((test_df.userId.max()+1),n_venues)) 29 30 #print(test.nnz) 31 #print(train.nnz) 32 33 #print(test.max()) 34 #print(train.max()) 35 36 from lightfm import LightFM 37 from lightfm.evaluation import auc_score 38 39 NUM_THREADS = 1 40 NUM_COMPONENTS = 30 41 NUM_EPOCHS = 1 42 ITEM_ALPHA = math.exp(-6) 43 44 model = LightFM(loss='warp', 45 item_alpha=ITEM_ALPHA, 46 no_components=NUM_COMPONENTS) 47 48 model.fit(train,epochs=NUM_EPOCHS,num_threads=NUM_THREADS) 49 50 51 train_auc = auc_score(model, train,num_threads=NUM_THREADS).mean() 52 test_auc = auc_score(model, test,train_interactions=train,num_threads=NUM_THREADS).mean() 53 54 print("Train_auc is %f" %train_auc) 55 print("Test_aus is %f" %test_auc)
Some problems :
Expect to get a binary marix but no...
Here is the code in console:
1 train 2 Out[6]: 3 <266910x3680125 sparse matrix of type '<class 'numpy.float64'>' 4 with 12774460 stored elements in Compressed Sparse Row format> 5 train.data.max() 6 Out[7]: 520.0 7 train.data.min() 8 Out[8]: 1.0 9 test.data.max() 10 Out[9]: 140.0 11 test.data.mean() 12 Out[10]: 1.533210711390105 13 test.data.min() 14 Out[11]: 1.0
and Running on cluster for one night but got no results showed...