• Python_sklearn机器学习库学习笔记(一)_Feature Extraction and Preprocessing(特征提取与预处理)


      # Extracting features from categorical variables

    # One-hot encode a categorical variable: every distinct 'city' value
    # becomes its own binary feature column.
    from sklearn.feature_extraction import DictVectorizer
    onehot_encoder = DictVectorizer()
    instance = [
        {'city': 'New York'},
        {'city': 'San Francisco'},
        {'city': 'Chapel Hill'},
    ]
    # Single-argument print() produces the same output on Python 2 and 3.
    print(onehot_encoder.fit_transform(instance).toarray())
    输出结果:
    [[ 0.  1.  0.]
     [ 0.  0.  1.]
     [ 1.  0.  0.]]

      # Extracting features from text文字特征提取

      ## The bag-of-words representation

    # Bag-of-words model: represent each document by its token counts.
    # CountVectorizer tokenizes with a regular expression and, by default,
    # keeps lowercased tokens of two or more word characters.
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = [
        'UNC played Duke in basketball',
        'Duke lost the basketball game',
    ]
    vectorizer = CountVectorizer()
    # todense() expands the sparse result into a full feature matrix.
    print(vectorizer.fit_transform(corpus).todense())
    print(vectorizer.vocabulary_)

      输出结果:

    [[1 1 0 1 0 1 0 1]
    [1 1 1 0 1 0 1 0]]
    {u'duke': 1, u'basketball': 0, u'lost': 4, u'played': 5, u'game': 2, u'unc': 7, u'in': 3, u'the': 6}

    # Extend the corpus with a third document and refit the vocabulary.
    corpus = [
        'UNC played Duke in basketball',
        'Duke lost the basketball game',
        'I ate a sandwich',
    ]
    vectorizer = CountVectorizer()
    term_matrix = vectorizer.fit_transform(corpus)
    print(term_matrix.todense())
    print(vectorizer.vocabulary_)

      输出结果:

    [[0 1 1 0 1 0 1 0 0 1]
    [0 1 1 1 0 1 0 0 1 0]
    [1 0 0 0 0 0 0 1 0 0]]
    {u'duke': 2, u'basketball': 1, u'lost': 5, u'played': 6, u'in': 4, u'game': 3, u'sandwich': 7, u'unc': 9, u'ate': 0, u'the': 8}

      scikit-learn里面的euclidean_distances函数可以计算若干向量之间的欧氏距离;
    语义越相似的两个文档,其向量在空间中也越接近。

      

    from sklearn.metrics.pairwise import euclidean_distances
    # Term-count vectors, one row per document.
    count = [[0, 1, 1, 0, 0, 1, 0, 1],
             [0, 1, 1, 1, 1, 0, 0, 0],
             [1, 0, 0, 0, 0, 0, 1, 0]]
    # euclidean_distances expects 2-D (n_samples, n_features) inputs, so each
    # document vector is wrapped in a list to form a one-row matrix.
    dist = euclidean_distances([count[0]], [count[1]])
    # Formatting into one string keeps the output identical on Python 2 and 3.
    print('Distance between 1st and 2nd documents: {}'.format(dist))

     输出结果:Distance between 1st and 2nd documents: [[ 2.]]

    # Print the Euclidean distance for every pair of documents via str.format.
    # The count matrix is the same for every pair, so it is built once.
    count = [[0, 1, 1, 0, 0, 1, 0, 1],
             [0, 1, 1, 1, 1, 0, 0, 0],
             [1, 0, 0, 0, 0, 0, 1, 0]]
    for x, y in [(0, 1), (0, 2), (1, 2)]:
        # Wrap each vector in a list: euclidean_distances requires 2-D inputs.
        dist = euclidean_distances([count[x]], [count[y]])
        print('文档{}文档{}文档{}'.format(x, y, dist))

      输出结果:

    文档0文档1文档[[ 2.]]
    文档0文档2文档[[ 2.44948974]]
    文档1文档2文档[[ 2.44948974]]

    ## Stop-word filtering 停用词过滤
    CountVectorizer类可以通过设置stop_words参数过滤停用词;该参数默认值为None(不过滤),设置为'english'时使用内置的英语常用停用词表。
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = [
        'UNC played Duke in basketball',
        'Duke lost the basketball game',
        'I ate a sandwich',
    ]
    # stop_words='english' drops common English words before counting.
    vectorizer = CountVectorizer(stop_words='english')
    print(vectorizer.fit_transform(corpus).todense())
    print(vectorizer.vocabulary_)

      输出结果:

    [[0 1 1 0 0 1 0 1]
     [0 1 1 1 1 0 0 0]
     [1 0 0 0 0 0 1 0]]
    {u'duke': 2, u'basketball': 1, u'lost': 4, u'played': 5, u'game': 3, u'sandwich': 6, u'unc': 7, u'ate': 0}

      # Stemming and lemmatization  词根还原和词形还原 

    from sklearn.feature_extraction.text import CountVectorizer
    corpus = [
        'He ate the sandwiches',
        'Every sandwich was eaten by him',
    ]
    # binary=True records only whether a term occurs, not how often.
    # Without stemming or lemmatization, inflected forms such as
    # 'sandwich' / 'sandwiches' remain separate features.
    vectorizer = CountVectorizer(binary=True, stop_words='english')
    print(vectorizer.fit_transform(corpus).todense())
    print(vectorizer.vocabulary_)

      输出结果:

      [[1 0 0 1]
      [0 1 1 0]]
      {u'sandwich': 2, u'ate': 0, u'sandwiches': 3, u'eaten': 1}

      ### 让我们分析一下单词gathering的词形还原:

      

    # Two sentences using 'gathering' once as a verb and once as a noun.
    # NOTE(review): this list is reassigned below before anything reads it.
    corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.'
    ]
    import nltk
    # NOTE(review): called without arguments — presumably opens NLTK's
    # interactive downloader; confirm, and consider naming the resources.
    nltk.download()
    
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk import word_tokenize
    from nltk.stem import PorterStemmer
    # NOTE(review): duplicate of the WordNetLemmatizer import three lines up.
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk import pos_tag
    # NOTE(review): wordnet_tags is not referenced anywhere in the visible code.
    wordnet_tags = ['n', 'v']
    corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
    ] 
    stemmer = PorterStemmer()
    # Tokenize each document and apply the Porter stemmer to every token.
    print('Stemmed:', [[stemmer.stem(token) for token in word_tokenize(document)] for document in corpus])

      输出结果:

      ('Stemmed:', [[u'He', u'ate', u'the', u'sandwich'], [u'Everi', u'sandwich', u'wa', u'eaten', u'by', u'him']])

      

    def lemmatize(token, tag):
        """Return the lemma of ``token`` for noun/verb tags, else ``token`` as is.

        token: the word to reduce.
        tag: part-of-speech tag; only its first character, lowercased, is used.
        """
        pos = tag[0].lower()
        # Anything other than a noun ('n') or verb ('v') passes through untouched.
        if pos not in ['n', 'v']:
            return token
        return lemmatizer.lemmatize(token, pos)
    lemmatizer = WordNetLemmatizer()
    # Tokenize each document and attach a part-of-speech tag to every token.
    tagged_corpus = [pos_tag(word_tokenize(text)) for text in corpus]
    print('Lemmatized:', [
        [lemmatize(word, pos) for word, pos in tagged_doc]
        for tagged_doc in tagged_corpus
    ])

      输出结果:

      ('Lemmatized:', [['He', u'eat', 'the', u'sandwich'], ['Every', 'sandwich', u'be', u'eat', 'by', 'him']])

      ## 带TF-IDF权重的扩展词库

    from sklearn.feature_extraction.text import CountVectorizer
    # A single document in which some words occur several times.
    corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
    vectorizer = CountVectorizer(stop_words='english')
    print(vectorizer.fit_transform(corpus).todense())
    print(vectorizer.vocabulary_)

      输出结果:

      [[2 1 3 1 1]]
      {u'sandwich': 2, u'wizard': 4, u'dog': 1, u'transfigured': 3, u'ate': 0}

      

    # TF-IDF: TfidfVectorizer yields TF-IDF-weighted features instead of raw counts.
    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = [
        'The dog ate a sandwich and I ate a sandwich',
        'The wizard transfigured a sandwich',
    ]
    vectorizer = TfidfVectorizer(stop_words='english')
    print(vectorizer.fit_transform(corpus).todense())
    print(vectorizer.vocabulary_)

      输出结果:

      [[ 0.75458397  0.37729199  0.53689271  0.          0.        ]
      [ 0.          0.          0.44943642  0.6316672   0.6316672 ]]
      {u'sandwich': 2, u'wizard': 4, u'dog': 1, u'transfigured': 3, u'ate': 0}

      ## 通过哈希技巧实现特征向量

      

    from sklearn.feature_extraction.text import HashingVectorizer
    corpus = ['the', 'ate', 'bacon', 'cat']
    # No fit step: transform() is called directly on the new vectorizer.
    vectorizer = HashingVectorizer(n_features=6)
    hashed = vectorizer.transform(corpus)
    print(hashed.todense())

      输出结果:

    [[-1.  0.  0.  0.  0.  0.]
     [ 0.  0.  0.  1.  0.  0.]
     [ 0.  0.  0.  0. -1.  0.]
     [ 0.  1.  0.  0.  0.  0.]]
    n_features设置成6只是为了演示(默认值为2**20)。另外,注意有些特征值是负数。由于Hash碰撞可能发生,所以HashingVectorizer用有符号哈希函数(signed hash function)。特征值和它的词块的哈希值带同样符号:
    如果cats出现过两次,被哈希成-3,文档特征向量的第四个元素要减去2;如果dogs出现过两次,被哈希成3,文档特征向量的第四个元素要加上2。

    ## 图片特征提取
    #通过像素值提取特征
    scikit-learn的digits数字集包括1797张0-9的手写数字图像。每个图像都由8x8个像素构成。每
    个像素的值是0-16,白色是0,黑色是16。如下图所示:
    %matplotlib inline
    from sklearn import datasets
    import matplotlib.pyplot as plt
    # Load the bundled handwritten-digit images and inspect the first one.
    digits = datasets.load_digits()
    # Formatting into one string keeps the output identical on Python 2 and 3.
    print('Digit: {}'.format(digits.target[0]))
    print(digits.images[0])
    # gray_r is the reversed gray colormap: low values light, high values dark.
    plt.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')
    plt.show()

    输出结果:

      Digit: 0
    [[  0.   0.   5.  13.   9.   1.   0.   0.]
    [  0.   0.  13.  15.  10.  15.   5.   0.]
    [  0.   3.  15.   2.   0.  11.   8.   0.]
    [  0.   4.  12.   0.   0.   8.   8.   0.]
    [  0.   5.   8.   0.   0.   9.   8.   0.]
    [  0.   4.  11.   0.   1.  12.   7.   0.]
    [  0.   2.  14.   5.  10.  12.   0.   0.]
    [  0.   0.   6.  13.  10.   0.   0.   0.]]

    digits = datasets.load_digits()
    # Flatten the first image into a single 64-element feature row.
    # The label's line break is written as an escape so the literal stays on one line.
    print('Feature vector:\n', digits.images[0].reshape(-1, 64))

    输出结果:

      ('Feature vector: ', array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.,   0.,   0.,  13.,
             15.,  10.,  15.,   5.,   0.,   0.,   3.,  15.,   2.,   0.,  11.,
              8.,   0.,   0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.,   0.,
              5.,   8.,   0.,   0.,   9.,   8.,   0.,   0.,   4.,  11.,   0.,
              1.,  12.,   7.,   0.,   0.,   2.,  14.,   5.,  10.,  12.,   0.,
              0.,   0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]]))

      

    %matplotlib inline
    import numpy as np
    from skimage.feature import corner_harris,corner_peaks
    from skimage.color import rgb2gray
    import matplotlib.pyplot as plt
    import skimage.io as io
    from skimage.exposure import equalize_hist
    
    def show_corners(corners, image):
        """Show ``image`` in grayscale with ``corners`` overlaid as red dots.

        corners: sized sequence of (row, column) coordinate pairs; may be empty.
        image: 2-D array; its shape sets the axis limits.
        """
        # Unpacking zip(*corners) raises on an empty set, so fall back to
        # empty coordinate lists and still display the image.
        y_corner, x_corner = zip(*corners) if len(corners) else ((), ())
        fig = plt.figure()
        plt.gray()
        plt.imshow(image)
        plt.plot(x_corner, y_corner, 'or')
        plt.xlim(0, image.shape[1])
        # Upper limit first: the y axis runs downward, row 0 at the top.
        plt.ylim(image.shape[0], 0)
        # Enlarge the figure to 1.5x its current size.
        fig.set_size_inches(np.array(fig.get_size_inches()) * 1.5)
        plt.show()
    # Load the image, convert it to grayscale, and equalize its histogram.
    mandrill = io.imread('1.jpg')
    mandrill = rgb2gray(mandrill)
    mandrill = equalize_hist(mandrill)
    # Pick peaks of the Harris corner response and plot them on the image.
    harris_response = corner_harris(mandrill)
    corners = corner_peaks(harris_response, min_distance=2)
    show_corners(corners, mandrill)

     

      ### SIFT和SURF

      

    import mahotas as mh
    from mahotas.features import surf
    image = mh.imread('2.jpg', as_grey=True)
    # Compute the descriptors once and reuse them for both printouts.
    descriptors = surf.surf(image)
    # Line breaks are written as escapes so the literal stays on one line.
    print('第一个SURF描述符:\n{}\n'.format(descriptors[0]))
    print('抽取了%s个SURF描述符' % len(descriptors))

      输出结果:

    第一个SURF描述符:
    [  4.40526550e+02   2.82058666e+02   1.80770206e+00   2.56869094e+02
       1.00000000e+00   1.91360320e+00  -6.59236825e-04  -2.96877983e-04
       1.09769833e-03   3.67625424e-04  -1.90927908e-03  -9.72986820e-04
       2.86457301e-03   9.74479580e-04  -2.15057079e-04  -1.42831161e-04
       2.23010810e-04   1.42831161e-04   3.37184432e-06   1.74527115e-06
       3.37184454e-06   1.74527136e-06   3.90064757e-02   3.58161210e-03
       3.90511371e-02   4.40730516e-03   4.41527246e-01   2.71798365e-02
       4.41527246e-01   8.70393902e-02   4.56954581e-01  -2.29019329e-02
       4.56954581e-01   9.63314021e-02   6.29652613e-02   1.77485267e-02
       6.29652613e-02   2.13300792e-02   2.23341915e-03  -7.45940061e-04
       6.30745845e-03   5.05762292e-03  -1.57216338e-02   7.64635174e-02
       1.43149320e-01   3.04822002e-01  -2.48229831e-02  -1.02886168e-01
       8.65904522e-02   1.43815811e-01  -6.32987455e-03  -5.59536669e-03
       2.03817407e-02   1.31338762e-02   6.68332753e-04   4.10704922e-05
       1.25106500e-03   1.20076608e-03   5.65924789e-03  -9.40465975e-03
       2.08687062e-02   4.03695676e-02   3.18301424e-03  -1.22350925e-02
       1.59209535e-02   1.88643296e-02   1.13586147e-03   4.11031770e-04
       1.96554689e-03   1.16562736e-03]
    
    抽取了826个SURF描述符

      ## 数据标准化
      
    # Standardize the explanatory variables with scikit-learn's scale function:
    # subtract the mean, then divide by the standard deviation.
    from sklearn import preprocessing
    import numpy as np
    X = np.array([
        [0., 0., 5., 13., 9., 1.],
        [0., 0., 13., 15., 10., 15.],
        [0., 3., 15., 2., 0., 11.],
    ])
    X_scaled = preprocessing.scale(X)
    print(X_scaled)

      输出结果:

      [[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
      [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
      [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]

      

  • 相关阅读:
    ShellExecuteEX打开iqy文件导致excel hang的原因分析
    3种方法遍历商人访问5个城市问题
    用CRT查找内存泄漏
    同步异步和阻塞5-异步非阻塞
    同步异步和阻塞4-同步非阻塞
    同步异步和阻塞3-同步阻塞
    同步异步和阻塞2-测试小项目
    同步异步和阻塞1
    IDA分析脱壳后丢失导入表的PE
    计算机中补码的数学运算逻辑及证明
  • 原文地址:https://www.cnblogs.com/wuchuanying/p/6231912.html
走看看 - 开发者的网上家园