  • NLP (7): Information Extraction and Text Classification

    Original link: http://www.one2know.cn/nlp7/

    • Named Entities
      Proper nouns: names of people, places, and products
    Example sentence                                     Named entities
    Hampi is on the South Bank of Tungabhadra river      Hampi, Tungabhadra River
    Paris is famous for fashion                          Paris
    Burj Khalifa is one of the skyscrapers in Dubai      Burj Khalifa, Dubai
    Jeff Weiner is the CEO of LinkedIn                   Jeff Weiner, LinkedIn

    A named entity is a noun that names something unique in the world
    Typical categories: TIMEZONE, LOCATION, RIVERS, COSMETICS, CURRENCY, DATE, TIME, PERSON

    • Recognizing named entities with NLTK
      The data used here has already gone through the following preprocessing (covered in earlier posts); a short sketch of the pipeline follows the list:
      1. Split large documents into sentences
      2. Split the sentences into words
      3. POS-tag each sentence
      4. Extract chunks (phrases of consecutive, non-overlapping words) from each sentence
      5. Tag the words in those chunks with IOB labels
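      As a quick recap, here is a minimal sketch of steps 1-5 on a toy sentence (assuming the standard NLTK data packages punkt, averaged_perceptron_tagger, maxent_ne_chunker and words are available):
    import nltk
    
    text = "Jeff Weiner is the CEO of LinkedIn. Paris is famous for fashion."
    sentences = nltk.sent_tokenize(text)                # 1. document -> sentences
    words = [nltk.word_tokenize(s) for s in sentences]  # 2. sentences -> words
    tagged = [nltk.pos_tag(w) for w in words]           # 3. POS-tag each sentence
    tree = nltk.ne_chunk(tagged[0])                     # 4. chunk the first sentence
    print(nltk.chunk.tree2conlltags(tree))              # 5. (word, POS, IOB tag) triples
    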
      Analyzing the treebank corpus:
    import nltk
    
    def sampleNE():
        sent = nltk.corpus.treebank.tagged_sents()[0] # the first tagged sentence in the corpus
        print(nltk.ne_chunk(sent)) # nltk.ne_chunk() recognizes the named entities in a tagged sentence
    
    def sampleNE2():
        sent = nltk.corpus.treebank.tagged_sents()[0]
        print(nltk.ne_chunk(sent,binary=True))  # binary=True tags entities generically, without assigning a category
    
    if __name__ == "__main__":
        sampleNE()
        sampleNE2()
    

    输出:

    (S
      (PERSON Pierre/NNP)
      (ORGANIZATION Vinken/NNP)
      ,/,
      61/CD
      years/NNS
      old/JJ
      ,/,
      will/MD
      join/VB
      the/DT
      board/NN
      as/IN
      a/DT
      nonexecutive/JJ
      director/NN
      Nov./NNP
      29/CD
      ./.)
    (S
      (NE Pierre/NNP Vinken/NNP)
      ,/,
      61/CD
      years/NNS
      old/JJ
      ,/,
      will/MD
      join/VB
      the/DT
      board/NN
      as/IN
      a/DT
      nonexecutive/JJ
      director/NN
      Nov./NNP
      29/CD
      ./.)
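    
      To pull just the entity strings out of the tree returned by nltk.ne_chunk(), one can walk its subtrees. A minimal sketch (extractEntities is a hypothetical helper, not part of NLTK):
    import nltk
    
    def extractEntities(tree):
        # collect a (label, entity text) pair from every named-entity subtree
        entities = []
        for subtree in tree.subtrees(lambda t: t.label() != 'S'): # skip the root
            name = ' '.join(word for word,pos in subtree.leaves())
            entities.append((subtree.label(),name))
        return entities
    
    if __name__ == "__main__":
        sent = nltk.corpus.treebank.tagged_sents()[0]
        print(extractEntities(nltk.ne_chunk(sent))) # [('PERSON', 'Pierre'), ('ORGANIZATION', 'Vinken')]
    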
    
    • Building a dictionary and a reverse dictionary, and using them
      A dictionary is a one-to-one mapping: store each word with its POS tag once, and later lookups are fast
    import nltk
    
    class LearningDictionary:
        def __init__(self,sentence): # called on instantiation; builds both dictionaries
            self.words = nltk.word_tokenize(sentence)
            self.tagged = nltk.pos_tag(self.words)
            self.buildDictionary()
            self.buildReverseDictionary()
    
        # map each word to its POS tag
        def buildDictionary(self):
            self.dictionary = {}
            for (word,pos) in self.tagged:
                self.dictionary[word] = pos
    
        # build a second dictionary with keys and values swapped (POS => list of words)
        def buildReverseDictionary(self):
            self.rdictionary = {}
            for key in self.dictionary.keys():
                value = self.dictionary[key]
                if value not in self.rdictionary:
                    self.rdictionary[value] = [key]
                else:
                    self.rdictionary[value].append(key)
    
        # check whether a word is present in the dictionary
        def isWordPresent(self,word):
            return 'Yes' if word in self.dictionary else 'No'
    
        # word => POS
        def getPOSForWord(self,word):
            return self.dictionary[word] if word in self.dictionary else None
    
        # POS => words
        def getWordsForPOS(self,pos):
            return self.rdictionary[pos] if pos in self.rdictionary else None
    
    # test
    if __name__ == "__main__":
        # instantiate an object from the sentence
        sentence = 'All the flights got delayed due to bad weather'
        learning = LearningDictionary(sentence)
    
        words = ['chair','flights','delayed','pencil','weather']
        pos = ['NN','VBS','NNS']
        for word in words:
            status = learning.isWordPresent(word)
            print("It '{}' present in dictionary ? : '{}'".format(word,status))
            if status is 'Yes':
                print("	POS For '{}' is '{}'".format(word,learning.getPOSForWord(word)))
        for pword in pos:
            print("POS '{}' has '{}' words".format(pword,learning.getWordsForPOS(pword)))
    

    输出:

    Is 'chair' present in the dictionary? : 'No'
    Is 'flights' present in the dictionary? : 'Yes'
    	POS For 'flights' is 'NNS'
    Is 'delayed' present in the dictionary? : 'Yes'
    	POS For 'delayed' is 'VBN'
    Is 'pencil' present in the dictionary? : 'No'
    Is 'weather' present in the dictionary? : 'Yes'
    	POS For 'weather' is 'NN'
    Words with POS 'NN' : ['weather']
    Words with POS 'VBS' : None
    Words with POS 'NNS' : ['flights']
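    
      As a design note, Python's collections.defaultdict removes the need for the membership test in buildReverseDictionary(); a minimal equivalent sketch:
    import nltk
    from collections import defaultdict
    
    sentence = 'All the flights got delayed due to bad weather'
    rdictionary = defaultdict(list) # missing keys start out as empty lists
    for word,pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
        rdictionary[pos].append(word)
    print(dict(rdictionary))
    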
    
    • Choosing a feature set
      The example below classifies vehicle registration numbers into three classes ('rtc', 'gov', 'oth') and shows how the choice of features changes the classifier's predictions.
    import nltk
    import random
    
    sampledata = [
        ('KA-01-F 1034 A','rtc'),
        ('KA-02-F 1030 B','rtc'),
        ('KA-03-FA 1200 C','rtc'),
        ('KA-01-G 0001 A','gov'),
        ('KA-02-G 1004 A','gov'),
        ('KA-03-G 0204 A','gov'),
        ('KA-04-G 9230 A','gov'),
        ('KA-27 1290','oth')
    ]
    random.shuffle(sampledata) # shuffle the training data
    testdata = [
        'KA-01-G 0109',
        'KA-02-F 9020 AC',
        'KA-02-FA 0801',
        'KA-01 9129'
    ]
    
    def learnSimpleFeatures():
        def vehicleNumberFeature(vnumber):
            return {'vehicle_class':vnumber[6]} # the 7th character (index 6) as the only feature
        # list of (feature dict, class) tuples
        featuresets = [(vehicleNumberFeature(vn),cls) for (vn,cls) in sampledata]
        # train a naive Bayes classifier on the feature sets
        classifier = nltk.NaiveBayesClassifier.train(featuresets)
        # classify the test data
        for num in testdata:
            feature = vehicleNumberFeature(num)
            print('(simple) %s is type of %s'%(num,classifier.classify(feature)))
    
    def learnFeatures(): # use the 6th and 7th characters (indices 5 and 6) as features
        def vehicleNumberFeature(vnumber):
            return {
                'vehicle_class':vnumber[6],
                'vehicle_prev':vnumber[5],
            }
        featuresets = [(vehicleNumberFeature(vn),cls) for (vn,cls) in sampledata]
        classifier = nltk.NaiveBayesClassifier.train(featuresets)
        for num in testdata:
            feature = vehicleNumberFeature(num)
            print('(dual) %s is type of %s'%(num,classifier.classify(feature)))
    
    if __name__ == "__main__":
        learnSimpleFeatures()
        learnFeatures()
    

    输出:

    (simple) KA-01-G 0109 is type of gov
    (simple) KA-02-F 9020 AC is type of rtc
    (simple) KA-02-FA 0801 is type of rtc
    (simple) KA-01 9129 is type of gov
    (dual) KA-01-G 0109 is type of gov
    (dual) KA-02-F 9020 AC is type of rtc
    (dual) KA-02-FA 0801 is type of rtc
    (dual) KA-01 9129 is type of oth
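    
      Note the last test case: with only the 7th character as a feature, 'KA-01 9129' is wrongly assigned to gov, because '9' never occurs at that position in the training data and the classifier falls back to the most common class. Adding the 6th character, which is a space exactly when the number has no series letter (as in the 'oth' training example 'KA-27 1290'), lets the classifier assign it to oth correctly.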
    
    • Segmenting sentences with a classifier
      Heuristic: a sentence boundary is a '.' whose following word starts with an uppercase letter
    import nltk
    
    # feature extractor: returns (feature dict, label); the label is whether the word after the '.' starts with an uppercase letter
    def featureExtractor(words,i):
        return ({'current-word':words[i],'next-is-upper':words[i+1][0].isupper()},words[i+1][0].isupper())
    
    # build the feature sets
    def getFeaturesets(sentence):
        words = nltk.word_tokenize(sentence) # tokenize the sentence into words
        featuresets = [featureExtractor(words,i) for i in range(1,len(words)-1) if words[i] == '.']
        return featuresets
    
    # segment a text into sentences and print them
    def segmentTextAndPrintSentences(data):
        words = nltk.word_tokenize(data) # tokenize the whole text
        for i in range(0,len(words)-1):
            if words[i] == '.':
                if classifier.classify(featureExtractor(words,i)[0]) == True:
                    print(".")
                else:
                    print(words[i],end='')
            else:
                print("{} ".format(words[i]),end='')
        print(words[-1]) # print the final token
    traindata = "The train and test data consist of three columns separated by spaces.Each word has been put on a separate line and there is an empty line after each sentence. The first column contains the current word, the second its part-of-speech tag as derived by the Brill tagger and the third its chunk tag as derived from the WSJ corpus. The chunk tags contain the name of the chunk type, for example I-NP for noun phrase words and I-VP for verb phrase words. Most chunk types have two types of chunk tags, B-CHUNK for the first word of the chunk and I-CHUNK for each other word in the chunk. Here is an example of the file format."
    testdata = "The baseline result was obtained by selecting the chunk tag which was most frequently associated with the current part-of-speech tag. At the workshop, all 11 systems outperformed the baseline. Most of them (six of the eleven) obtained an F-score between 91.5 and 92.5. Two systems performed a lot better: Support Vector Machines used by Kudoh and Matsumoto [KM00] and Weighted Probability Distribution Voting used by Van Halteren [Hal00]. The papers associated with the participating systems can be found in the reference section below."
    traindataset = getFeaturesets(traindata)
    classifier = nltk.NaiveBayesClassifier.train(traindataset)
    segmentTextAndPrintSentences(testdata)
    

    输出:

    The baseline result was obtained by selecting the chunk tag which was most frequently associated with the current part-of-speech tag .
    At the workshop , all 11 systems outperformed the baseline .
    Most of them ( six of the eleven ) obtained an F-score between 91.5 and 92.5 .
    Two systems performed a lot better : Support Vector Machines used by Kudoh and Matsumoto [ KM00 ] and Weighted Probability Distribution Voting used by Van Halteren [ Hal00 ] .
    The papers associated with the participating systems can be found in the reference section below .
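    
      Because the boundary decision is learned from data rather than hard-coded, a classifier-based segmenter can in principle also learn exceptions, such as periods inside abbreviations, that a blunt split-at-every-period rule cannot handle.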
    
    • Text classification
      Example: classifying RSS (Rich Site Summary) feeds
    import nltk
    import random
    import feedparser
    
    # two Yahoo Sports RSS feeds
    urls = {
        'mlb':'http://sports.yahoo.com/mlb/rss.xml',
        'nfl':'http://sports.yahoo.com/nfl/rss.xml',
    }
    
    feedmap = {} # dictionary holding the parsed feeds
    stopwords = nltk.corpus.stopwords.words('english') # English stopwords
    
    # given a list of words, return a feature dict: each non-stopword maps to True
    def featureExtractor(words):
        features = {}
        for word in words:
            if word not in stopwords:
                features["word({})".format(word)] = True
        return features
    
    # empty list for the correctly labeled sentences
    sentences = []
    
    for category in urls.keys():
        feedmap[category] = feedparser.parse(urls[category]) # download and parse the feed into the feedmap dictionary
        print("downloading {}".format(urls[category]))
        for entry in feedmap[category]['entries']: # iterate over the feed's entries
            data = entry['summary']
            words = data.split()
            sentences.append((category,words)) # store the category and the words as a tuple
    
    # turn each (category, word list) pair into a (feature dict, category) pair
    featuresets = [(featureExtractor(words),category) for category,words in sentences]
    
    # shuffle, then use one half for training and the other for testing
    random.shuffle(featuresets)
    total = len(featuresets)
    off = int(total/2)
    trainset = featuresets[off:]
    testset = featuresets[:off]
    
    # build a classifier with NaiveBayesClassifier.train()
    classifier = nltk.NaiveBayesClassifier.train(trainset)
    
    # print the accuracy
    print(nltk.classify.accuracy(classifier,testset))
    
    # print the most informative features
    classifier.show_most_informative_features(5)
    
    for (i,entry) in enumerate(feedmap['nfl']['entries']):
        if i < 4: # 从nfl随机选取4个样本测试
            features = featureExtractor(entry['title'].split())
            category = classifier.classify(features)
            print('{} -> {}'.format(category,entry['summary']))
    

    输出:

    downloading http://sports.yahoo.com/mlb/rss.xml
    downloading http://sports.yahoo.com/nfl/rss.xml
    0.9148936170212766
    Most Informative Features
                   word(NFL) = True              nfl : mlb    =      8.6 : 1.0
           word(quarterback) = True              nfl : mlb    =      3.7 : 1.0
                  word(team) = True              nfl : mlb    =      2.9 : 1.0
                   word(two) = True              mlb : nfl    =      2.4 : 1.0
             word(Wednesday) = True              mlb : nfl    =      2.4 : 1.0
    nfl -> The Cowboys RB will not be suspended for his role in an incident in May in Las Vegas.
    nfl -> Giants defensive lineman Dexter Lawrence was 6 years old when Eli Manning began his NFL career. Manning is entering his 16th season, while Lawrence is arriving as a first-round draft pick. Age isn't always "just a number." "In the locker room, I feel their age," Manning said,
    nfl -> Hue Jackson compiled a 3-36-1 record in two-and-a-half seasons with the Cleveland Browns before later joining division rival the Cincinnati Bengals.
    nfl -> NFL Network's David Carr and free agent defensive lineman Andre Fluellen predict every game on the Minnesota Vikings' 2019 schedule.
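    
      Since the feeds are fetched live, the number of entries, the accuracy, and the most informative features will differ from run to run, and the URLs above may stop serving data at some point.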
    
    • POS tagging using context
    import nltk
    
    # example sentences in which 'address' and 'laugh' each occur with more than one part of speech
    sentences = [
        "What is your address when you're in Beijing?",
        "the president's address on the state of economy.",
        "He addressed his remarks to the lawyers in the audience.",
        "In order to address an assembly, we should be ready",
        "He laughed inwardly at the scene.",
        "After all the advance publicity, the prizefight turned out to be a laugh.",
        "We can learn to laugh a little at even our most serious foibles.",
    ]
    
    # POS-tag each sentence and collect the results into a two-dimensional list
    def getSentenceWords():
        sentwords = []
        for sentence in sentences:
            words = nltk.pos_tag(nltk.word_tokenize(sentence))
            sentwords.append(words)
        return sentwords
    
    # POS tagging without context
    def noContextTagger():
        # build a baseline unigram tagger
        tagger = nltk.UnigramTagger(getSentenceWords())
        print(tagger.tag('the little remarks towards assembly are laughable'.split()))
    
    # POS tagging with context
    def withContextTagger():
        # returns a dict of 4 features for the word at wordPosInSentence
        def wordFeatures(words,wordPosInSentence):
            # the last 1, 2 and 3 characters of the word as features
            endFeatures = {
                'last(1)':words[wordPosInSentence][-1],
                'last(2)':words[wordPosInSentence][-2:],
                'last(3)':words[wordPosInSentence][-3:],
            }
            # words at position 2 or later get the previous word as a feature; the first words get a placeholder
            if wordPosInSentence > 1:
                endFeatures['prev'] = words[wordPosInSentence - 1]
            else:
                endFeatures['prev'] = '|NONE|'
            return endFeatures
        allsentences = getSentenceWords() # the two-dimensional list built above
        featureddata = [] # will hold (feature dict, tag) tuples
        for sentence in allsentences:
            untaggedSentence = nltk.tag.untag(sentence)
            featuredsentence = [(wordFeatures(untaggedSentence,index),tag) for index,(word,tag) in enumerate(sentence)]
            featureddata.extend(featuredsentence)
        breakup = int(len(featureddata) * 0.5)
        traindata = featureddata[breakup:]
        testdata = featureddata[:breakup]
        classifier = nltk.NaiveBayesClassifier.train(traindata)
        print("分类器准确率 : {}".format(nltk.classify.accuracy(classifier,testdata)))
    
    if __name__ == "__main__":
        noContextTagger()
        withContextTagger()
    

    输出:

    [('the', 'DT'), ('little', 'JJ'), ('remarks', 'NNS'), ('towards', None), ('assembly', 'NN'), ('are', None), ('laughable', None)]
    Classifier accuracy : 0.38461538461538464
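    
      The unigram tagger returns None for every word it never saw during training ('towards', 'are', 'laughable' here), which is exactly the gap the context-based classifier tries to fill. A common NLTK remedy is to give the unigram tagger a backoff that supplies a default tag for unknown words; a minimal sketch, reusing getSentenceWords() from the listing above:
    import nltk
    
    # unknown words fall back to 'NN' instead of None
    tagger = nltk.UnigramTagger(getSentenceWords(),backoff=nltk.DefaultTagger('NN'))
    print(tagger.tag('the little remarks towards assembly are laughable'.split()))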
    
  • Original post: https://www.cnblogs.com/peng8098/p/nlp_7.html