Python 中的命名实体标注

import json
import re
import pandas as pd
import nltk
import save_csv

# nltk 命名实体案例：提取文本中的人名、地名、机构等

def parse_document(document):
    """Split a document into a list of whitespace-stripped sentences.

    Newlines are replaced with single spaces before sentence splitting, so a
    sentence that wraps across several lines is handed to the tokenizer as
    one piece of text.

    Args:
        document: The raw text to split.

    Returns:
        The sentences returned by ``nltk.sent_tokenize``, each with leading
        and trailing whitespace removed.

    Raises:
        ValueError: If ``document`` is not a ``str``.
    """
    # Validate first: operating on a non-string before this check would
    # raise TypeError and the documented ValueError would never be reached.
    if not isinstance(document, str):
        raise ValueError('Document is not string!')
    # The original pattern literal was split across two physical lines (a
    # syntax error); it is restored here as a plain newline replacement.
    document = document.replace('\n', ' ').strip()
    return [sentence.strip() for sentence in nltk.sent_tokenize(document)]


# sample document
# Read the input inside a context manager so the file handle is closed as
# soon as the content has been read. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open(r'test.json', "r", encoding="utf-8") as json_file:
    text = json_file.read()

# Concatenate the "text" field of every record, each preceded by one space.
# NOTE(review): presumably test.json holds a JSON array of objects that each
# carry a "text" key - confirm against the real input file.
text1 = "".join(" " + item["text"] for item in json.loads(text))
# print(text1)
# text = """
# FIFA was founded in 1904 to oversee international competition among the national associations of Belgium,
# Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its
# membership now comprises 211 national associations. Member countries must each also be members of one of
# the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America
# and the Caribbean, Oceania, and South America.
# """

# Split the combined text into sentences, then each sentence into word tokens.
sentences = parse_document(text1)
tokenized_sentences = list(map(nltk.word_tokenize, sentences))
# Attach a part-of-speech tag to every token, sentence by sentence.
tagged_sentences = list(map(nltk.pos_tag, tokenized_sentences))
# Run nltk's named-entity chunker over each tagged sentence.
ne_chunked_sents = list(map(nltk.ne_chunk, tagged_sentences))



# extract all named entities
# Keys of this dict are (entity name, entity type) pairs. Using a dict gives
# de-duplication for free while preserving first-seen order, so the result is
# the same from run to run. The previous code rebuilt list(set(...)) after
# every single append, which was quadratic and left the order arbitrary.
unique_entities = {}
for ne_tagged_sentence in ne_chunked_sents:
    for tagged_tree in ne_tagged_sentence:
        # extract only chunks having NE labels; anything without a
        # ``label`` attribute is skipped
        if not hasattr(tagged_tree, 'label'):
            continue
        # each leaf is indexed at [0] for its token text; join the tokens
        # with spaces to form the entity name
        entity_name = ' '.join(c[0] for c in tagged_tree.leaves())
        entity_type = tagged_tree.label()  # NE category
        unique_entities[(entity_name, entity_type)] = None

# unique named entities, in order of first appearance
named_entities = list(unique_entities)


# Show the raw entity list before it is tabulated.
print(named_entities)
# Build a two-column table: one row per (name, type) pair.
column_names = ['Entity Name', 'Entity Type']
entity_frame = pd.DataFrame(data=named_entities, columns=column_names)
# Persist the table as CSV using the utf_8_sig codec.
entity_frame.to_csv('data_df.csv', encoding='utf_8_sig')
# Show the tabulated result.
print(entity_frame)
# save_csv.save_csv_data(entity_frame)

查看全文

相关阅读:
Silverlight实用窍门系列：4.Silverlight 4.0添加鼠标右键菜单和Silverlight全屏模式的进入退出。【附带源码实例】
Silverlight实用窍门系列：13.基于Popup浮动窗体控件模拟ToolTip的实现【附带实例源码】
Silverlight实用窍门系列：5.绑定webService数据到DataGrid,设置DataGrid模板,模拟数据库数据的绑定【附带实例源码】
Silverlight实用窍门系列：25.Silverlight多线程技术Timer的应用，模拟心电图、模拟CPU、内存状态图【附带源码实例】
Silverlight实用窍门系列：10.动态生成DataGrid,动态绑定DataGrid模板列【附带实例源码】
Silverlight实用窍门系列：7.制作可拖动的自定义控件，获取拖拽后控件坐标【实例源码下载】
Silverlight实用窍门系列：23.Silverlight多线程技术Thread的应用，后台线程更新UI控件，向多线程传递参数【附带源码实例】
Silverlight实用窍门系列：11.Silverlight中为自定义控件添加鼠标双击属性，Silverlight模拟鼠标双击【附带源码实例】
Silverlight实用窍门系列：12.继承于某些固定控件(以Grid为例)的鼠标左键双击事件的实现【附带实例源码】
Silverlight实用窍门系列：6.Silverlight弹出窗口以及DataGrid分页【附带实例源码】

原文地址：https://www.cnblogs.com/lxz123/p/14349714.html