from sklearn.feature_extraction.text import CountVectorizer
import nltk
# nltk.download("punkt")
# nltk.download('averaged_perceptron_tagger')

'''
Extract text features with the bag-of-words model and with the nltk
natural language processing toolkit, respectively.
'''
9
sent1 = "The cat is walking in the bedroom."
sent2 = "A dog was running across the kitchen."
# Use the bag-of-words model to turn the text into feature vectors
count_vec = CountVectorizer()
sentences = [sent1, sent2]
# Print the transformed feature vectors
# print(count_vec.fit_transform(sentences).toarray())
'''
[[0 1 1 0 1 1 0 0 2 1 0]
 [1 0 0 1 0 0 1 1 1 0 1]]
'''
# Print what each feature dimension means
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
# print(count_vec.get_feature_names_out())
'''
['across' 'bedroom' 'cat' 'dog' 'in' 'is' 'kitchen' 'running' 'the'
 'walking' 'was']
'''
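
# A minimal sketch of where the column order above comes from: after fitting,
# CountVectorizer stores the learned token-to-column mapping in its
# vocabulary_ attribute (the fit call here repeats what fit_transform does).
# count_vec.fit(sentences)
# print(sorted(count_vec.vocabulary_.items(), key=lambda kv: kv[1]))
'''
[('across', 0), ('bedroom', 1), ('cat', 2), ('dog', 3), ('in', 4), ('is', 5), ('kitchen', 6), ('running', 7), ('the', 8), ('walking', 9), ('was', 10)]
'''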
26
# Use nltk to analyze the text linguistically
# Tokenize and normalize the sentences: e.g. aren't is split into are and n't, I'm into I and 'm
tokens1 = nltk.word_tokenize(sent1)
tokens2 = nltk.word_tokenize(sent2)
# print(tokens1)
# print(tokens2)
'''
['The', 'cat', 'is', 'walking', 'in', 'the', 'bedroom', '.']
['A', 'dog', 'was', 'running', 'across', 'the', 'kitchen', '.']
'''
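
# A small illustration of the contraction splitting mentioned above;
# the sentence here is our own example, not part of the original data.
# print(nltk.word_tokenize("Aren't you ready? I'm leaving."))
'''
['Are', "n't", 'you', 'ready', '?', 'I', "'m", 'leaving', '.']
'''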
# Build each sentence's vocabulary, sorted in ASCII order
vocab_1 = sorted(set(tokens1))
vocab_2 = sorted(set(tokens2))
# print(vocab_1)
# print(vocab_2)
'''
['.', 'The', 'bedroom', 'cat', 'in', 'is', 'the', 'walking']
['.', 'A', 'across', 'dog', 'kitchen', 'running', 'the', 'was']
'''
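
# A minimal sketch: taking the union of the two per-sentence vocabularies
# gives the kind of shared vocabulary CountVectorizer builds internally
# (CountVectorizer also lowercases and, with its default token_pattern,
# drops punctuation and single-character tokens, hence the differences).
# print(sorted(set(vocab_1) | set(vocab_2)))
'''
['.', 'A', 'The', 'across', 'bedroom', 'cat', 'dog', 'in', 'is', 'kitchen', 'running', 'the', 'walking', 'was']
'''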
# Initialize the stemmer, which reduces each word to its root form
stemmer = nltk.stem.PorterStemmer()
stem_1 = [stemmer.stem(t) for t in tokens1]
stem_2 = [stemmer.stem(t) for t in tokens2]
# print(stem_1)
# print(stem_2)
'''
['the', 'cat', 'is', 'walk', 'in', 'the', 'bedroom', '.']
['A', 'dog', 'wa', 'run', 'across', 'the', 'kitchen', '.']
'''
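
# Stemming is a crude suffix-stripping heuristic ('was' -> 'wa' above); a
# sketch of the gentler alternative, nltk's WordNet lemmatizer, which maps
# inflected forms to dictionary entries (requires nltk.download('wordnet')).
# lemmatizer = nltk.stem.WordNetLemmatizer()
# print([lemmatizer.lemmatize(t, pos='v') for t in ['was', 'running', 'walking']])
'''
['be', 'run', 'walk']
'''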
# Use the part-of-speech tagger to annotate each token
pos_tag_1 = nltk.tag.pos_tag(tokens1)
pos_tag_2 = nltk.tag.pos_tag(tokens2)
# print(pos_tag_1)
# print(pos_tag_2)
'''
[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('walking', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('bedroom', 'NN'), ('.', '.')]
[('A', 'DT'), ('dog', 'NN'), ('was', 'VBD'), ('running', 'VBG'), ('across', 'IN'), ('the', 'DT'), ('kitchen', 'NN'), ('.', '.')]
'''
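
# A small follow-up sketch: the Penn Treebank tags above make simple
# filtering easy, e.g. keeping only the nouns (tags starting with 'NN').
# print([word for word, tag in pos_tag_1 if tag.startswith('NN')])
'''
['cat', 'bedroom']
'''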