import nltk
from nltk.corpus import stopwords

# Optional stemming step (currently disabled):
# from nltk.stem.lancaster import LancasterStemmer
# ls = LancasterStemmer()
# ls.stem(word)
from db_process import MyProcess

# Punctuation tokens that are dropped from the tokenised text.
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']

# Sample sentence used to exercise the tokenise-and-filter pipeline.
s = 'attention window eyes users: if you are using internet explorer 9 or 10, you may not be able to log in to the chase site or other internet sites., I went to facebook with my students.'

# Tokenise the sample text into words and punctuation marks.
words = nltk.word_tokenize(s)
# tags = nltk.pos_tag(words)  # part-of-speech tagging (currently disabled)

# Build the exclusion set once: loading the stop-word list per token is
# loop-invariant work, and a set gives constant-time membership tests.
_excluded_tokens = set(english_punctuations) | set(stopwords.words('english'))

# NOTE(review): matching is case-sensitive, exactly as in the original
# expression; capitalised tokens are only dropped if the stop-word list
# contains them in that exact form -- confirm whether lowercasing is wanted.
# A concrete list (rather than a lazy ``filter`` object) lets the result be
# iterated more than once.
filter_words = [token for token in words if token not in _excluded_tokens]