"""Basic NLTK text-processing walkthrough.

Steps performed on a fixed sample paragraph:
    1. Lower-case and tokenize the text.
    2. Drop punctuation tokens and English stop words.
    3. Count token frequencies and print them as a NumPy array.
    4. Print part-of-speech tags for the remaining tokens.

NOTE(review): the NLTK tokenizer, stop-word corpus and POS tagger rely on
NLTK data packages that must already be installed (via ``nltk.download``);
confirm which packages your NLTK version expects.
"""
import re

import nltk
import numpy as np
from nltk.corpus import stopwords

# 1. Tokenization
text = (
    "Sentiment analysis is a challenging subject in machine learning. "
    "People express their emotions in language that is often obscured by "
    "sarcasm, ambiguity, and plays on words, all of which could be very "
    "misleading for both humans and computers. "
    "There's another Kaggle competition for movie review sentiment analysis. "
    "In this tutorial we explore how Word2Vec can be applied to a similar "
    "problem."
).lower()
text_list = nltk.word_tokenize(text)

# 2. Remove punctuation and stop words
# Punctuation tokens to discard; a set gives constant-time membership tests.
english_punctuations = {
    ',', '.', ':', ';', '?', '(', ')', '[', ']',
    '&', '!', '*', '@', '#', '$', '%',
}
text_list = [word for word in text_list if word not in english_punctuations]

# Stop words are likewise held in a set for fast lookups.
stops = set(stopwords.words("english"))
text_list = [word for word in text_list if word not in stops]

# 3. Count word frequencies
freq_dist = nltk.FreqDist(text_list)
num_words = len(freq_dist)  # number of distinct tokens

# One [word, count] row per distinct token, in the distribution's own
# iteration order. Iterating items() once avoids rebuilding the key and
# value lists on every step.
freq_list = [[word, count] for word, count in freq_dist.items()]

# Rows mix str and int, so NumPy stores every cell (counts included) as a
# string in the resulting array.
freqArr = np.array(freq_list)
print(freqArr)

# 4. Part-of-speech tagging
print(nltk.pos_tag(text_list))