Natural Language Processing with Python
Chapter 6.1
suffix_fdist处代码稍微改动。
import nltk
from nltk.corpus import brown


def common_suffixes_fun():
    """Return the 100 most frequent word endings in the Brown corpus.

    Every word is lower-cased, and its last one, two and three characters
    are each counted.  Endings are ranked by descending count, with ties
    broken alphabetically so the ranking is deterministic.

    Returns:
        A list of at most 100 suffix strings, most frequent first.
    """
    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        # NOTE: for words shorter than three characters these slices
        # overlap (e.g. "a"[-2:] == "a"), so the same ending is counted
        # more than once.  Kept as-is to preserve the published results.
        suffix_fdist[word[-1:]] += 1
        suffix_fdist[word[-2:]] += 1
        suffix_fdist[word[-3:]] += 1

    # Sort on (-count, suffix): highest count first, alphabetical on ties.
    ranked = sorted(suffix_fdist.items(),
                    key=lambda item: (-item[1], item[0]))
    return [suffix for suffix, _count in ranked[:100]]


# Computed once at import time; pos_features() reads this list.
common_suffixes = common_suffixes_fun()


def pos_features(word):
    """Build the suffix feature dictionary for a single word.

    Args:
        word: The token to describe; it is compared case-insensitively.

    Returns:
        A dict mapping 'endswith(<suffix>)' to True/False for every
        suffix in ``common_suffixes``.
    """
    # Lower-case once instead of once per suffix.
    lowered = word.lower()
    features = {}
    for su in common_suffixes:
        features['endswith(%s)' % su] = lowered.endswith(su)
    return features


def test_pos():
    """Train and evaluate a Naive Bayes classifier on suffix features.

    Uses the first 5000 tagged words of the Brown 'news' category.  The
    first 10% of the feature sets form the test set and the remainder the
    training set.  Prints the test accuracy, then the five most
    informative features.
    """
    tagged_words = brown.tagged_words(categories='news')[:5000]
    featuresets = [(pos_features(word), tag) for (word, tag) in tagged_words]

    size = int(len(tagged_words) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(5)
运行结果为:
0.652
Most Informative Features
endswith(o) = True TO : NN = 423.2 : 1.0
endswith(es) = True DOZ : NN = 319.5 : 1.0
endswith(om) = True WPO : NN = 319.5 : 1.0
endswith(as) = True BEDZ : IN = 303.3 : 1.0
endswith(s) = True BEDZ : IN = 303.3 : 1.0