import nltk
import numpy as np
import re
from nltk.corpus import stopwords
# 1. Tokenization.
# BUG FIX: the original string literal spanned multiple physical lines with
# plain double quotes, which is a SyntaxError in Python. Implicit string
# concatenation inside parentheses rebuilds the intended single paragraph
# (with the inter-line spaces restored).
text = (
    "Sentiment analysis is a challenging subject in machine learning. "
    "People express their emotions in language that is often obscured by sarcasm, "
    "ambiguity, and plays on words, all of which could be very misleading for "
    "both humans and computers. There's another Kaggle competition for movie review "
    "sentiment analysis. In this tutorial we explore how Word2Vec can be applied to "
    "a similar problem."
).lower()
# Split the lowercased text into word/punctuation tokens.
text_list = nltk.word_tokenize(text)
# 2. Remove punctuation tokens and English stop words.
# Use sets for O(1) membership tests and filter both categories in one pass.
english_punctuations = {',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'}
stops = set(stopwords.words("english"))
# Combined drop-set: a token is kept only if it is neither punctuation nor a stop word.
_dropped = english_punctuations | stops
text_list = [word for word in text_list if word not in _dropped]
# 3. Count word frequencies.
freq_dist = nltk.FreqDist(text_list)
# Number of distinct words (FreqDist is a dict subclass, so len() works directly).
num_words = len(freq_dist)
# FIX: the original indexed list(freq_dist.keys())[i] / list(freq_dist.values())[i]
# inside a loop — quadratic, since both lists were rebuilt on every iteration,
# and pairing keys with values by position is fragile. items() yields the
# (word, count) pairs directly in a single pass.
freq_list = [[word, count] for word, count in freq_dist.items()]
# Array of shape (num_words, 2); note np.array coerces the counts to strings
# because each row mixes a str with an int.
freqArr = np.array(freq_list)
print(freqArr)
# 4. Part-of-speech tagging: annotate each remaining token with its POS tag.
tagged_words = nltk.pos_tag(text_list)
print(tagged_words)