zoukankan      html  css  js  c++  java
  • python中命名实体标注

    import json
    import re
    import pandas as pd
    import nltk
    import save_csv
    
    # NLTK named-entity recognition example: extract person names, place names, organizations, etc. from text
    
    def parse_document(document):
        """Split a text document into a list of stripped sentences.

        Newlines are replaced with spaces before the text is handed to
        ``nltk.sent_tokenize``, so line breaks inside the input are not
        passed on to the tokenizer.

        Args:
            document: The raw text to split.

        Returns:
            A list of sentence strings with surrounding whitespace removed.

        Raises:
            ValueError: If ``document`` is not a ``str``.
        """
        # Validate before touching the value: running a string operation
        # first would fail with a different error on non-string input and
        # this ValueError would never be reached.
        if not isinstance(document, str):
            raise ValueError('Document is not string!')
        document = document.replace('\n', ' ').strip()
        sentences = nltk.sent_tokenize(document)
        return [sentence.strip() for sentence in sentences]
    
    
    # sample document
    # Read the whole input file; the context manager closes the handle even
    # if reading fails. JSON text is read as UTF-8 rather than with the
    # platform's locale default.
    with open(r'test.json', "r", encoding="utf-8") as json_file:
        text = json_file.read()

    # The parsed JSON is iterated and each item's "text" value is joined into
    # one string, every piece prefixed by a single space (so the result
    # starts with a space whenever there is at least one item).
    text1 = "".join(" " + item["text"] for item in json.loads(text))
    # print(text1)
    # text = """
    # FIFA was founded in 1904 to oversee international competition among the national associations of Belgium,
    # Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its
    # membership now comprises 211 national associations. Member countries must each also be members of one of
    # the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America
    # and the Caribbean, Oceania, and South America.
    # """
    
    # Split the combined text into sentences, then into word tokens.
    sentences = parse_document(text1)
    tokenized_sentences = list(map(nltk.word_tokenize, sentences))
    # Part-of-speech tag every tokenized sentence, then run NLTK's named
    # entity chunker over each tagged sentence.
    tagged_sentences = list(map(nltk.pos_tag, tokenized_sentences))
    ne_chunked_sents = list(map(nltk.ne_chunk, tagged_sentences))
    
    
    
    # extract all named entities
    # A dict keyed by (name, type) drops duplicates while keeping first-seen
    # order, so de-duplication happens once instead of rebuilding the whole
    # list through a set on every single match.
    seen_entities = {}
    for ne_tagged_sentence in ne_chunked_sents:
        for tagged_tree in ne_tagged_sentence:
            # Only nodes exposing a `label` attribute are treated as
            # named-entity chunks; every other item is skipped.
            if hasattr(tagged_tree, 'label'):
                entity_name = ' '.join(c[0] for c in tagged_tree.leaves())  # get NE name
                entity_type = tagged_tree.label()  # get NE category
                seen_entities[(entity_name, entity_type)] = None
    # unique named entities as a list of (name, type) tuples
    named_entities = list(seen_entities)
    
    
    # Show the raw (name, type) list before it is written to disk.
    print(named_entities)
    # Build a two-column data frame from the collected entities.
    entity_frame = pd.DataFrame(
        data=named_entities,
        columns=['Entity Name', 'Entity Type'],
    )
    # Write the frame out as CSV.
    entity_frame.to_csv(path_or_buf='data_df.csv', encoding='utf_8_sig')
    # Show the resulting table.
    print(entity_frame)
    # save_csv.save_csv_data(entity_frame)

  • 相关阅读:
    基于React 的audio音频播放组件
    React Context 的基本用法
    Video-React 视频播放组件的使用
    Html5 Canvas 使用
    React 中使用富文本编辑器 Braft Editor ,并集成上传图片功能
    ant design pro 项目实现路由级的动态加载按需加载
    确保代码仓库中包含 yarn.lock 文件
    ES6 对象解构赋值(浅拷贝 VS 深拷贝)
    JS 中判断数据类型是否为 null、undefined 或 NaN
    js中的数据类型及判断方法
  • 原文地址:https://www.cnblogs.com/lxz123/p/14349714.html
Copyright © 2011-2022 走看看