zoukankan      html  css  js  c++  java
  • 计算指标/哑变量

    from pandas import DataFrame,Series
    import pandas as pd
    import numpy as np
    
    # If a DataFrame column holds K distinct values, it can be expanded into a
    # K-column indicator (dummy) matrix with one 0/1 column per distinct value.
    df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data': range(6)})
    print(df)
    # Expected output:
    #   key  data
    # 0   b     0
    # 1   b     1
    # 2   a     2
    # 3   c     3
    # 4   a     4
    # 5   b     5

    # dtype=int is passed explicitly: pandas >= 2.0 produces boolean columns by
    # default, which would print True/False instead of the 0/1 shown below.
    dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)
    print(dummies)
    # Expected output:
    #    key_a  key_b  key_c
    # 0      0      1      0
    # 1      0      1      0
    # 2      1      0      0
    # 3      0      0      1
    # 4      1      0      0
    # 5      0      1      0

    # Keep only the 'data' column and attach the indicator columns by index.
    df_with_dummies = df[['data']].join(dummies)
    print(df_with_dummies)
    # Expected output:
    #    data  key_a  key_b  key_c
    # 0     0      0      1      0
    # 1     1      0      1      0
    # 2     2      1      0      0
    # 3     3      0      0      1
    # 4     4      1      0      0
    # 5     5      0      1      0
    from pandas import DataFrame,Series
    import pandas as pd
    import numpy as np
    
    # Load the movie table; fields in movies.dat are separated by '::'.
    mnames = ['movie_id', 'title', 'genres']
    movies = pd.read_table(
        'movies.dat',
        sep='::',
        header=None,
        names=mnames,
        encoding='ISO-8859-1',
        engine='python',
    )

    # Split every '|'-delimited genre string once and reuse the pieces below.
    genre_lists = [entry.split('|') for entry in movies.genres]

    # Union of all per-movie genre sets, so each genre appears exactly once.
    genres = sorted(set.union(*map(set, genre_lists)))

    # Start from an all-zero (movies x genres) matrix and set a 1 wherever a
    # movie carries that genre.
    genre_index = pd.Index(genres)
    flags = np.zeros((len(movies), len(genres)))
    for row, parts in enumerate(genre_lists):
        # get_indexer maps each genre name to its column position.
        flags[row, genre_index.get_indexer(parts)] = 1
    dummies = DataFrame(flags, columns=genres)

    movies_windic = movies.join(dummies.add_prefix('Genre_'))
    # Show the first 10 fields of the first row.
    print(movies_windic.iloc[0].head(10))
    import pandas as pd
    import numpy as np
    
    # Draw 10 uniform samples from [0, 1). The generator is not seeded, so the
    # numbers (and the table below) differ from run to run.
    values = np.random.rand(10)
    print(values)
    # Sample output:
    # [0.07858525 0.87300262 0.35604229 0.93110966 0.79934318 0.08215684
    #  0.96897297 0.3661382  0.22688337 0.50674505]

    # Bin edges; pd.cut produces the right-closed intervals (0, 0.2], (0.2, 0.4], ...
    bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

    # One indicator column per bin. dtype=int is passed explicitly: pandas >= 2.0
    # produces boolean columns by default, which would print True/False instead
    # of the 0/1 shown below.
    bin_dummies = pd.get_dummies(pd.cut(values, bins), dtype=int)
    print(bin_dummies)
    # Sample output (for the values above):
    #    (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
    # 0           1           0           0           0           0
    # 1           0           0           0           0           1
    # 2           0           1           0           0           0
    # 3           0           0           0           0           1
    # 4           0           0           0           1           0
    # 5           1           0           0           0           0
    # 6           0           0           0           0           1
    # 7           0           1           0           0           0
    # 8           0           1           0           0           0
    # 9           0           0           1           0           0
  • 相关阅读:
    为什么我会被淘汰?
    2017-3-27日碎碎念
    (原创)我对未来的人类的发展,以及AI技术发展的一些思考。
    八大排序算法图文讲解
    PE病毒初探——向exe注入代码
    [转]Patch文件结构详解
    芝麻信用商家接入指南
    如何成为一名好的程序员的一些个人经验
    .NET CoreCLR开发人员指南(上)
    七牛云:ckeditor JS SDK 结合 C#实现多图片上传。
  • 原文地址:https://www.cnblogs.com/nicole-zhang/p/15194668.html
Copyright © 2011-2022 走看看