参加淘宝大数据比赛,让我重新审视并提升了自己的思维方式。
插件和代码前前后后写了六个版本,但最好的结果其实来自我的第一、第二版代码,这让我很惊讶,
但它也说明了一个问题:当你对编程语言已经比较熟悉时,你缺少的其实是其他方面的知识:
- 首先是我的数学知识。在分析用户行为时,我们知道浏览次数和购买次数是有一定规律的,这个方面找了数学系的同学问了一些,得到的结论是:可以进行线性拟合。这是最简单的,可是得到的结果不一定真实,于是他们推荐我使用高斯分布来做。可是由于自己单枪匹马,所以选了比较简单的线性拟合。
- 其次是心理学。我们能够从数据中发现:那些常常在淘宝买东西的人,假设每隔一段时间就购买同一种商品,那说明这个人属于死宅之类,因为这些东西一般我们身边就能买到。还有就是浏览次数和购买之间的关系、多天浏览和购买的关系,可以用数学来解答,用心理学来分析。
先意淫这些吧,接下来贴上三个版本的代码:
第一版,简单推测:浏览十五次或购买过一次的品牌,用户会再次购买:
"""Version 1: recommend a brand once it was browsed 15 times or bought once.

Reads ``t_alibaba_data.csv`` (rows of ``user,brand,type,date``) and appends
one entry per user to ``taobao.txt`` in the form
``<user> <brand>,<brand>, `` (no newlines, exactly as the first draft did).

Fixes relative to the first draft:

* the scan loop had no exit condition and only stopped on ``IndexError``;
* the reader stored a phantom empty record when it hit end-of-file;
* the last row of every (user, brand) run was never counted, the last brand
  of every user was never evaluated, and its counters leaked into the next
  user's first brand;
* users with a single row never had their id written;
* input and output files were never closed.
"""
import itertools
import time
from collections import namedtuple

DATA_PATH = 't_alibaba_data.csv'
OUTPUT_PATH = 'taobao.txt'

# One parsed CSV row.  ``action`` is the integer from the third column:
# 0 is a browse and 1 a purchase (that is how the selection rule uses them);
# 2 and anything larger are tallied separately but unused by this version.
Record = namedtuple('Record', 'user brand action date')


def parse_line(line):
    """Parse one ``user,brand,type,date`` line.

    Returns a ``Record``, or ``None`` when the line is not a data row
    (blank line, header, wrong number of fields, non-numeric type).
    """
    fields = line.rstrip('\r\n').split(',')
    if len(fields) != 4 or not fields[2].isdigit():
        return None
    user, brand, action, date = fields
    return Record(user, brand, int(action), date)


def load_records(lines):
    """Return the ``Record`` list for an iterable of text lines."""
    parsed = (parse_line(line) for line in lines)
    return [rec for rec in parsed if rec is not None]


def count_actions(rows):
    """Return ``(type0, type1, type2, other)`` tallies for *rows*.

    Every type code other than 0, 1 and 2 lands in the last slot, mirroring
    the ``else`` branch of the original counter chain.
    """
    tallies = [0, 0, 0, 0]
    for rec in rows:
        tallies[min(rec.action, 3)] += 1
    return tuple(tallies)


def iter_brand_groups(records):
    """Yield ``(user, brand, tallies)`` per run of consecutive rows.

    Like the original scan, this assumes rows for one user and brand are
    adjacent in the input; a brand that reappears later forms a new run.
    """
    runs = itertools.groupby(records, key=lambda rec: (rec.user, rec.brand))
    for (user, brand), rows in runs:
        yield user, brand, count_actions(rows)


def render_predictions(groups, min_clicks=15, min_buys=1):
    """Build the text appended to the output file.

    *groups* is an iterable of ``(user, brand, tallies)`` as produced by
    ``iter_brand_groups``.  A brand is kept when it has at least
    *min_clicks* type-0 rows or at least *min_buys* type-1 rows.
    """
    pieces = []
    for user, user_groups in itertools.groupby(groups, key=lambda g: g[0]):
        pieces.append(user + ' ')
        for _, brand, (clicks, buys, _type2, _other) in user_groups:
            if clicks >= min_clicks or buys >= min_buys:
                pieces.append(brand + ',')
        # Single space closes the user's entry, as in the original output.
        pieces.append(' ')
    return ''.join(pieces)


def main():
    """Run the version-1 prediction over ``DATA_PATH``."""
    with open(DATA_PATH, 'r') as source:
        records = load_records(source)

    # Debug output kept from the original script: every date, then one
    # line of tallies per (user, brand) run.
    print([rec.date for rec in records])
    groups = list(iter_brand_groups(records))
    for user, brand, tallies in groups:
        print("%s %s %d %d %d %d" % ((user, brand) + tallies))

    with open(OUTPUT_PATH, 'a') as output:
        output.write(render_predictions(groups))


if __name__ == '__main__':
    main()
第二版,观察时间和购买行为:
#coding:utf-8
"""Version 2: combine browse counts, distinct days and purchases.

Reads ``t_alibaba_data.csv`` (rows of ``user,brand,type,date``) and appends
one entry per user to ``taobao.txt`` in the form
``<user> <brand>,<brand>, `` (no newlines).

Fixes relative to the original draft:

* the scan loop had no exit condition and only stopped on ``IndexError``;
* the reader stored a phantom empty record when it hit end-of-file;
* the "first row of this user" flag was never reset, so only the very first
  user's id was ever written;
* the day-change counter sat inside the ``elif`` chain, so it only fired on
  rows that were not type 0/1/2 and stole them from the last tally; it is
  now counted independently (as the third version of the script does);
* the last row of every (user, brand) run was never counted and the last
  brand of every user was never evaluated;
* a brand matching several rules was written several times;
* input and output files were never closed.
"""
import itertools
import time
from collections import namedtuple

DATA_PATH = 't_alibaba_data.csv'
OUTPUT_PATH = 'taobao.txt'

# One parsed CSV row.  ``action`` is the integer from the third column:
# 0 = browse, 1 = purchase; the third version's comments describe 2 as
# "favourite" and the remaining bucket as "shopping cart".
Record = namedtuple('Record', 'user brand action date')


def parse_line(line):
    """Parse one ``user,brand,type,date`` line.

    Returns a ``Record``, or ``None`` when the line is not a data row
    (blank line, header, wrong number of fields, non-numeric type).
    """
    fields = line.rstrip('\r\n').split(',')
    if len(fields) != 4 or not fields[2].isdigit():
        return None
    user, brand, action, date = fields
    return Record(user, brand, int(action), date)


def load_records(lines):
    """Return the ``Record`` list for an iterable of text lines."""
    parsed = (parse_line(line) for line in lines)
    return [rec for rec in parsed if rec is not None]


def count_actions(rows):
    """Return ``(type0, type1, type2, other)`` tallies for *rows*."""
    tallies = [0, 0, 0, 0]
    for rec in rows:
        tallies[min(rec.action, 3)] += 1
    return tuple(tallies)


def count_day_changes(rows):
    """Return how many adjacent pairs in *rows* carry different dates."""
    return sum(1 for prev, cur in zip(rows, rows[1:]) if prev.date != cur.date)


def iter_brand_groups(records):
    """Yield ``(user, brand, tallies, day_changes)`` per consecutive run.

    Like the original scan, this assumes rows for one user and brand are
    adjacent in the input.
    """
    runs = itertools.groupby(records, key=lambda rec: (rec.user, rec.brand))
    for (user, brand), rows in runs:
        rows = list(rows)
        yield user, brand, count_actions(rows), count_day_changes(rows)


def is_candidate(tallies, day_changes):
    """Apply the version-2 selection rules to one (user, brand) run.

    Without a purchase the brand is kept when it was browsed at least 15
    times, seen across at least two date changes, or has a type-2 row.
    With purchases it is kept only for more than 15 browses and at least
    two purchases.
    """
    clicks, buys, favourites, _other = tallies
    if buys == 0:
        return clicks >= 15 or day_changes >= 2 or favourites >= 1
    return clicks > 15 and buys >= 2


def render_predictions(groups, total_records):
    """Build the text appended to the output file.

    *groups* is an iterable of ``(user, brand, tallies, day_changes)``.
    *total_records* is the number of rows in the whole input.
    """
    # NOTE(review): the original tested ``len(b_id) <= 3`` -- the size of the
    # whole file -- and then kept every brand.  A per-user brand count was
    # probably intended (a ``b_num1 <= 3`` variant is commented out next to
    # it); the literal behaviour is preserved until that is confirmed.
    keep_everything = total_records <= 3

    pieces = []
    for user, user_groups in itertools.groupby(groups, key=lambda g: g[0]):
        pieces.append(user + ' ')
        for _, brand, tallies, day_changes in user_groups:
            if keep_everything or is_candidate(tallies, day_changes):
                pieces.append(brand + ',')
        # Single space closes the user's entry, as in the original output.
        pieces.append(' ')
    return ''.join(pieces)


def main():
    """Run the version-2 prediction over ``DATA_PATH``."""
    with open(DATA_PATH, 'r') as source:
        records = load_records(source)

    groups = list(iter_brand_groups(records))
    # Debug output kept from the original: one line per (user, brand) run.
    for user, brand, tallies, day_changes in groups:
        print("%s %s %d %d %d %d %d"
              % ((user, brand) + tallies + (day_changes,)))

    with open(OUTPUT_PATH, 'a') as output:
        output.write(render_predictions(groups, len(records)))


if __name__ == '__main__':
    main()
第三版,使用数学分析:
#coding:utf-8
"""Version 3: least-squares analysis of per-brand behaviour counts.

Reads ``t_alibaba_data.csv`` (rows of ``user,brand,type,date``), tallies the
rows of every consecutive (user, brand) run and fits straight lines between
the tallies, printing the fitted parameters.

Fixes relative to the original draft:

* the reader stored a phantom empty record when it hit end-of-file;
* the last row of every (user, brand) run was never counted, the last brand
  of every user was never recorded, and its counters leaked into the next
  user's first brand;
* ``residuals`` was redefined four times, each closing over different
  globals; there is now one ``fit_line`` helper;
* fitting fewer than two points failed with an obscure MINPACK error;
* with all selection rules commented out, the only thing appended to
  ``taobao.txt`` was the first user's id; that stray write is gone;
* the input file was never closed.
"""
import itertools
import time
from collections import namedtuple
from math import sqrt

import numpy as np
from scipy import optimize

DATA_PATH = 't_alibaba_data.csv'

# One parsed CSV row.  ``action`` is the integer from the third column; the
# original comments describe 0 as browse, 1 as purchase, 2 as favourite and
# the remaining bucket as shopping cart.
Record = namedtuple('Record', 'user brand action date')

# (statistics column used as x, slope label, offset label) for each fit.
# The labels are the exact strings the original script printed.
FITS = (
    ('buys', 'K=', 'b='),
    ('favorites', 'Kt_num22=', 'b22='),
    ('carts', 'kt_num33=', 'b33='),
    ('day_changes', 'ktime=', 'b44='),
)


def parse_line(line):
    """Parse one ``user,brand,type,date`` line.

    Returns a ``Record``, or ``None`` when the line is not a data row
    (blank line, header, wrong number of fields, non-numeric type).
    """
    fields = line.rstrip('\r\n').split(',')
    if len(fields) != 4 or not fields[2].isdigit():
        return None
    user, brand, action, date = fields
    return Record(user, brand, int(action), date)


def load_records(lines):
    """Return the ``Record`` list for an iterable of text lines."""
    parsed = (parse_line(line) for line in lines)
    return [rec for rec in parsed if rec is not None]


def collect_group_stats(records):
    """Tally every consecutive (user, brand) run in *records*.

    Returns a dict of equally long lists keyed ``clicks``, ``buys``,
    ``favorites``, ``carts`` and ``day_changes`` with one entry per run.
    ``carts`` also absorbs any type code above 3, like the original
    ``else`` branch; ``day_changes`` counts adjacent rows whose dates differ.
    Rows of one user and brand are assumed to be adjacent in the input.
    """
    columns = ('clicks', 'buys', 'favorites', 'carts')
    stats = dict((name, []) for name in columns + ('day_changes',))
    runs = itertools.groupby(records, key=lambda rec: (rec.user, rec.brand))
    for _, rows in runs:
        rows = list(rows)
        tallies = [0, 0, 0, 0]
        for rec in rows:
            tallies[min(rec.action, 3)] += 1
        for name, value in zip(columns, tallies):
            stats[name].append(value)
        stats['day_changes'].append(
            sum(1 for prev, cur in zip(rows, rows[1:]) if prev.date != cur.date)
        )
    return stats


def fit_line(x, y):
    """Least-squares fit of ``y = k * x - b``; returns ``(k, b)``.

    The offset is *subtracted*, exactly as in the original residual
    function, so ``b`` is the negated intercept of the fitted line.

    Raises ``ValueError`` when *x* and *y* differ in length or hold fewer
    than two points.
    """
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)
    if xs.shape != ys.shape or xs.size < 2:
        raise ValueError('fit_line needs two equally long series of >= 2 points')

    def residuals(params):
        k, b = params
        return ys - (k * xs - b)

    solution = optimize.leastsq(residuals, [1.0, 0.0])
    k, b = solution[0]
    return k, b


def main():
    """Print the fitted parameters for every pairing listed in ``FITS``."""
    with open(DATA_PATH, 'r') as source:
        records = load_records(source)
    stats = collect_group_stats(records)

    # NOTE(review): the original comments describe these as relations
    # "with purchases", yet the dependent variable was always the click
    # count (``t_num00``).  Kept as written -- confirm which was intended.
    for column, slope_label, offset_label in FITS:
        slope, offset = fit_line(stats[column], stats['clicks'])
        print("%s %s %s %s" % (slope_label, slope, offset_label, offset))

    # The original ended with ``print pp``; the only increment of ``pp`` was
    # commented out, so it always printed 0.  Kept to leave stdout unchanged.
    print(0)


if __name__ == '__main__':
    main()