nltk简要笔记

import nltk
from nltk.corpus import stopwords
# from nltk.stem.lancaster import LancasterStemmer  # 词干化
# ls = LancasterStemmer()  ls.stem(word)

from db_process import MyProcess

# Demo script: tokenize a sample sentence with NLTK, then drop punctuation
# tokens and English stopwords from the token stream.

# Punctuation tokens to discard after tokenization.
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']

# Sample input text for the demonstration.
s = 'attention window eyes users: if you are using internet explorer 9 or 10, you may not be able to log in to the chase site or other internet sites., I went to facebook with my students.'

words = nltk.word_tokenize(s)  # split the text into tokens

# tags = nltk.pos_tag(words)  # show the part-of-speech tag of each token

# Collect everything to exclude into one set, built a single time, so the
# stopword list is fetched once rather than once per token and each
# membership test is a hash lookup instead of two linear list scans.
_excluded_tokens = set(english_punctuations)
_excluded_tokens.update(stopwords.words('english'))

# Keep only tokens that are neither punctuation nor English stopwords.
# `filter` is kept (instead of a list comprehension) so the type of
# `filter_words` is the same as before on every Python version.
filter_words = filter(lambda x: x not in _excluded_tokens, words)

查看全文

相关阅读:
MinIO：入门
 JS中面向对象的多种继承方式
 点击按钮实现图片下载
 给大家推荐一个免费的云平台-阿贝云
 REPLACE
SUBSTRING_REGEXPR 截取字符串
 基本的git/linux/g++/ 等指令
 C++引用和指针&， *
go语言异常处理 error panic recover defer
django.db.utils.IntegrityError: (1048, "Column 'id' cannot be null")

原文地址：https://www.cnblogs.com/fuzzier/p/7363797.html