zoukankan      html  css  js  c++  java
  • python淘宝购物数据分析(2)

    参加淘宝大数据比赛,让我重新提升了自己的思维方式。

    插件和代码前前后后写了六个版本,但成绩最好的其实是我的第一、第二版代码,这让我很惊讶,

    但这也说明了一个问题:当你对编程语言已经比较熟悉时,你缺少的其实是其他方面的知识:

    1. 首先是数学知识。在分析用户行为时,我们知道浏览次数和购买次数之间是有一定规律的。这方面我找数学系的同学问了一些,得到的结论是:可以进行线性拟合,这是最简单的方法,可是得到的结果不一定真实,于是同学推荐我使用高斯分布来做。可是由于自己单枪匹马,所以选了比较简单的线性拟合。
    2. 其次是心理学。我们能够从数据中发现:那些常常在淘宝买东西的人,假如每隔一段时间就买同一种商品,那说明这个人属于"死宅"之类的,因为这些东西一般我们身边就能买到。还有就是浏览次数和购买之间的关系、多天浏览和购买之间的关系,可以用数学来解答,用心理学来分析。
    先意淫这些吧,下面贴上三个版本的代码:
    第一版,简单推测浏览十五次就会购买一次:
    # Version 1: predict the brands each user will buy from raw action counts.
    #
    # Input : t_alibaba_data.csv, one "user_id,brand_id,type,time" row per line.
    # Output: one line per user appended to taobao.txt -- the user id, three
    #         spaces, then every predicted brand id followed by a comma.
    # Rule  : a brand is predicted when the user viewed it (type 0) at least
    #         15 times or bought it (type 1) at least once.
    #
    # Rows of one user, and of one brand within that user, are assumed to be
    # adjacent in the input -- TODO confirm the file is sorted that way.
    import time
    from itertools import groupby

    INPUT_PATH = 't_alibaba_data.csv'
    OUTPUT_PATH = 'taobao.txt'
    MIN_VIEWS = 15


    def read_rows(path):
        """Return the data rows of *path* as [user_id, brand_id, type, time] lists.

        Lines that do not have exactly four comma-separated fields, or whose
        type field is not numeric (for example a header line), are skipped.
        """
        rows = []
        with open(path, 'r') as source:
            for line in source:
                fields = line.rstrip('\r\n').split(',')
                if len(fields) == 4 and fields[2].isdigit():
                    rows.append(fields)
        return rows


    def count_actions(brand_rows):
        """Count the rows of one user/brand run per action type.

        Returns [views, buys, favorites, others]; every type code other than
        0, 1 or 2 lands in the last bucket.
        """
        counts = [0, 0, 0, 0]
        for row in brand_rows:
            counts[min(int(row[2]), 3)] += 1
        return counts


    def predict_brands(in_path=INPUT_PATH, out_path=OUTPUT_PATH,
                       min_views=MIN_VIEWS):
        """Append one prediction line per user to *out_path*.

        Every row of a user/brand run is counted, and every run (including the
        last one of each user) is evaluated with counters that start at zero.
        """
        rows = read_rows(in_path)
        with open(out_path, 'a') as output:
            for user, user_rows in groupby(rows, key=lambda row: row[0]):
                output.write(user)
                output.write('   ')
                for brand, brand_rows in groupby(user_rows,
                                                 key=lambda row: row[1]):
                    views, buys, favorites, others = count_actions(brand_rows)
                    # Progress trace, one line per user/brand run.
                    print('%s %s %d %d %d %d'
                          % (user, brand, views, buys, favorites, others))
                    if views >= min_views or buys >= 1:
                        output.write(brand)
                        output.write(',')
                output.write('\n')


    if __name__ == '__main__':
        predict_brands()
        



    第二版,观察时间和购买行为:
    #coding:utf-8
    # Version 2: predict brands from action counts plus how often the time
    # field changes inside a user/brand run.
    #
    # Input : t_alibaba_data.csv, one "user_id,brand_id,type,time" row per line.
    # Output: one line per user appended to taobao.txt -- the user id, three
    #         spaces, then every predicted brand id followed by a comma.
    #
    # Rows of one user, and of one brand within that user, are assumed to be
    # adjacent in the input -- TODO confirm the file is sorted that way.
    import time
    from itertools import groupby

    INPUT_PATH = 't_alibaba_data.csv'
    OUTPUT_PATH = 'taobao.txt'


    def read_rows(path):
        """Return the data rows of *path* as [user_id, brand_id, type, time] lists.

        Lines that do not have exactly four comma-separated fields, or whose
        type field is not numeric (for example a header line), are skipped.
        """
        rows = []
        with open(path, 'r') as source:
            for line in source:
                fields = line.rstrip('\r\n').split(',')
                if len(fields) == 4 and fields[2].isdigit():
                    rows.append(fields)
        return rows


    def summarize(brand_rows):
        """Summarize one user/brand run.

        Returns ([views, buys, favorites, others], time_changes) where
        time_changes is the number of adjacent row pairs in the run whose time
        field differs.  Type codes other than 0, 1 or 2 land in ``others``.
        """
        counts = [0, 0, 0, 0]
        time_changes = 0
        previous_time = None
        for row in brand_rows:
            counts[min(int(row[2]), 3)] += 1
            if previous_time is not None and row[3] != previous_time:
                time_changes += 1
            previous_time = row[3]
        return counts, time_changes


    def is_predicted(views, buys, favorites, time_changes, total_rows):
        """Return True when any of the version-2 rules selects the brand."""
        rules = (
            views >= 15 and buys == 0,        # viewed a lot, never bought
            time_changes >= 2 and buys == 0,  # came back at other times, never bought
            views > 15 and buys >= 2,         # heavy viewer and repeat buyer
            favorites >= 1 and buys == 0,     # favorited, never bought
            # NOTE(review): this compares the size of the whole data set, so
            # it only fires for tiny inputs; a per-user brand count may have
            # been intended -- confirm before relying on it.
            total_rows <= 3,
        )
        return any(rules)


    def predict_brands(in_path=INPUT_PATH, out_path=OUTPUT_PATH):
        """Append one prediction line per user to *out_path*.

        Each brand is written at most once per user, even when several rules
        match it.
        """
        rows = read_rows(in_path)
        total_rows = len(rows)
        with open(out_path, 'a') as output:
            for user, user_rows in groupby(rows, key=lambda row: row[0]):
                output.write(user)
                output.write('   ')
                for brand, brand_rows in groupby(user_rows,
                                                 key=lambda row: row[1]):
                    counts, time_changes = summarize(brand_rows)
                    views, buys, favorites, others = counts
                    # Progress trace, one line per user/brand run.
                    print('%s %s %d %d %d %d %d'
                          % (user, brand, views, buys, favorites, others,
                             time_changes))
                    if is_predicted(views, buys, favorites, time_changes,
                                    total_rows):
                        output.write(brand)
                        output.write(',')
                output.write('\n')


    if __name__ == '__main__':
        predict_brands()
    



    第三版,使用数学分析:
    #coding:utf-8
    # Version 3: fit straight lines between the per-brand action counts.
    #
    # Input : t_alibaba_data.csv, one "user_id,brand_id,type,time" row per line.
    # Output: the fitted coefficients, printed to stdout.
    #
    # For every run of adjacent rows sharing user and brand the script counts
    # views (type 0), buys (type 1), favorites (type 2), cart adds (any other
    # type) and the number of time changes, then fits y = k*x - b by least
    # squares with the view count as y.
    #
    # The original also carried disabled output rules (views >= 10, time
    # changes >= 3, favorites >= 1, cart adds >= 1, buys >= 1); they are not
    # part of this analysis.
    import time
    import numpy as np
    from itertools import groupby
    from scipy import optimize
    from math import sqrt

    INPUT_PATH = 't_alibaba_data.csv'


    def read_rows(path):
        """Return the data rows of *path* as [user_id, brand_id, type, time] lists.

        Lines that do not have exactly four comma-separated fields, or whose
        type field is not numeric (for example a header line), are skipped.
        """
        rows = []
        with open(path, 'r') as source:
            for line in source:
                fields = line.rstrip('\r\n').split(',')
                if len(fields) == 4 and fields[2].isdigit():
                    rows.append(fields)
        return rows


    def collect_counts(rows):
        """Return five parallel lists with one entry per user/brand run.

        The lists are, in order: views, buys, favorites, cart adds and the
        number of adjacent row pairs in the run whose time field differs.
        """
        views, buys, favorites, carts, time_changes = [], [], [], [], []
        for _, run in groupby(rows, key=lambda row: (row[0], row[1])):
            counts = [0, 0, 0, 0]
            changes = 0
            previous_time = None
            for row in run:
                counts[min(int(row[2]), 3)] += 1
                if previous_time is not None and row[3] != previous_time:
                    changes += 1
                previous_time = row[3]
            views.append(counts[0])
            buys.append(counts[1])
            favorites.append(counts[2])
            carts.append(counts[3])
            time_changes.append(changes)
        return views, buys, favorites, carts, time_changes


    def fit_line(x, y):
        """Least-squares fit of ``y = k*x - b``; returns (k, b).

        Note the sign convention: b is subtracted, so the intercept is -b.
        """
        def residuals(params):
            k, b = params
            return y - (k * x - b)

        solution = optimize.leastsq(residuals, [1, 0])
        return solution[0]


    def main(in_path=INPUT_PATH):
        """Read the data, fit the four lines and print their coefficients."""
        views, buys, favorites, carts, time_changes = collect_counts(
            read_rows(in_path))
        if len(views) < 2:
            # leastsq needs at least as many residuals as parameters.
            print('need at least two user/brand runs to fit a line')
            return

        # NOTE(review): every fit uses the view count as the dependent
        # variable, although the original comments on the last three fits
        # speak of the relationship to purchases -- confirm which was meant.
        y = np.array(views)

        # views against buys
        k, b = fit_line(np.array(buys), y)
        print('K= %s b= %s' % (k, b))

        # views against favorites
        k22, b22 = fit_line(np.array(favorites), y)
        print('Kt_num22= %s b22= %s' % (k22, b22))

        # views against cart adds
        k33, b33 = fit_line(np.array(carts), y)
        print('kt_num33= %s b33= %s' % (k33, b33))

        # views against the number of time changes
        k44, b44 = fit_line(np.array(time_changes), y)
        print('ktime= %s b44= %s' % (k44, b44))


    if __name__ == '__main__':
        main()
    #def sim_pearson()
    



  • 相关阅读:
    JS之事件及冒泡
    DOM读取和修改内联样式
    dom查询与修改的一些常用方法
    js修改this指向的三种方法(call,bind,apply)
    JS原型概念
    JS创建对象
    JS的this(谁调用就指向谁)
    变量声明提前与函数声明提前
    JS对象创建
    正则应用之数据采集房屋网站信息
  • 原文地址:https://www.cnblogs.com/mengfanrong/p/4605388.html
Copyright © 2011-2022 走看看