Dictionary preprocessing (dictionary feature extraction)
from sklearn.feature_extraction import DictVectorizer


def dict_vec():
    # Instantiate DictVectorizer; sparse=False returns a dense ndarray
    # instead of a scipy sparse matrix
    dict_vectorizer = DictVectorizer(sparse=False)
    # Call fit_transform on a list of dicts: categorical fields are
    # one-hot encoded, numeric fields are kept as-is
    data = dict_vectorizer.fit_transform([
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ])
    # Print the name of each output column
    # (use get_feature_names() on sklearn < 1.0)
    print(dict_vectorizer.get_feature_names_out())
    print(data)
    return None


if __name__ == '__main__':
    dict_vec()
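For reference, a minimal sketch (not part of the original notes) showing that the fitted DictVectorizer can also map the encoded matrix back to dict-like rows with inverse_transform; the exact column order of the one-hot city features may vary across sklearn versions:

from sklearn.feature_extraction import DictVectorizer

measurements = [
    {'city': '北京', 'temperature': 100},
    {'city': '上海', 'temperature': 60},
]
dv = DictVectorizer(sparse=False)
encoded = dv.fit_transform(measurements)
# Each distinct 'city' value becomes its own 0/1 column; 'temperature' stays numeric
print(dv.get_feature_names_out())   # e.g. ['city=上海' 'city=北京' 'temperature']
print(encoded)
# inverse_transform recovers dictionaries from the encoded rows
print(dv.inverse_transform(encoded))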
Text preprocessing
from sklearn.feature_extraction.text import CountVectorizer


def countvec():
    # Instantiate CountVectorizer
    count = CountVectorizer()
    # Extract features from two short "documents"; CountVectorizer splits
    # on whitespace, so Chinese text must already be segmented into tokens
    data = count.fit_transform([
        "人生 人生 苦短, 我 喜欢 Python",
        "生活 太长久, 我 不喜欢 Python",
    ])
    # Vocabulary: one entry per column of the count matrix
    # (use get_feature_names() on sklearn < 1.0)
    print(count.get_feature_names_out())
    # toarray() converts the sparse count matrix to a dense ndarray
    print(data.toarray())
    return None


if __name__ == '__main__':
    countvec()
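Because CountVectorizer only splits on whitespace, unsegmented Chinese text produces poor tokens. The original imports include jieba and TfidfVectorizer, so here is a hedged sketch (the sentences and function names are illustrative, not from the original notes) of segmenting with jieba first and then weighting the tokens with TF-IDF:

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer


def cut_words(texts):
    # Segment each Chinese sentence with jieba and re-join the tokens
    # with spaces so the vectorizer can split on whitespace
    return [" ".join(jieba.cut(text)) for text in texts]


def tfidf_vec():
    # Illustrative sentences (assumed for this sketch)
    raw = ["人生苦短，我喜欢Python", "生活太长久，我不喜欢Python"]
    segmented = cut_words(raw)
    tfidf = TfidfVectorizer()
    data = tfidf.fit_transform(segmented)
    # Vocabulary learned from the segmented text
    # (use get_feature_names() on sklearn < 1.0)
    print(tfidf.get_feature_names_out())
    # TF-IDF weights instead of raw counts
    print(data.toarray())
    return None


if __name__ == '__main__':
    tfidf_vec()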