zoukankan      html  css  js  c++  java
  • (Code) Python implementation of phrase extraction from a sentence

    import os
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    import numpy as np 
    import string
    import nltk
    from nltk.tokenize import word_tokenize
    from textblob import TextBlob
    
    import pdb
    
    # Number of entries per row in the phrase-index file: the export loop
    # right-pads phrases that have fewer tokens than this with 0.
    max_phrase_length = 5

    # Dataset root. Every entry found here is handed to the export loop as a
    # video name; the vocabulary file word_list.txt lives here as well.
    basicPath = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/'

    path = basicPath
    files = os.listdir(path)
    print(path)

    # Derived from basicPath so the dataset location is only spelled out once.
    word_base_path = os.path.join(basicPath, 'word_list.txt')

    # Vocabulary: one word per line; a word's position in wordList is the
    # index later written to the phrase-index file.
    with open(word_base_path, 'r') as wordBase:
        # Drop the line terminator first, then any trailing periods.
        wordList = [line.rstrip('\n').rstrip('.') for line in wordBase]
    
    
    # Export the noun phrases of every video description.
    #
    # For each video directory under `path` this reads the first line of
    # language.txt, extracts its noun phrases with TextBlob, and writes:
    #   auto_extracted_Phrase.txt        one phrase per line
    #   autoExtracted_Phrase_Index.txt   one row per phrase: the wordList index
    #                                    of each token, each followed by ',',
    #                                    right-padded with '0,' up to
    #                                    max_phrase_length entries
    #
    # Raises ValueError if a phrase token is missing from wordList.

    # First-occurrence lookup table: same result as wordList.index(), without
    # rescanning the whole vocabulary for every token.
    word_to_index = {}
    for position, vocab_word in enumerate(wordList):
        word_to_index.setdefault(vocab_word, position)

    for videoName in files:
        video_dir = os.path.join(path, videoName)
        # os.listdir() also returns plain files (word_list.txt sits in this
        # directory); those cannot contain a language.txt, so skip them.
        if not os.path.isdir(video_dir):
            continue
        print(videoName)

        # Other datasets store the description as <videoName>/<videoName>.txt.
        langPath = os.path.join(video_dir, 'language.txt')
        with open(langPath, 'r') as f:
            # Only the first line of the description is used.
            language = f.readline()

        noun_phrases = TextBlob(language).noun_phrases
        print(noun_phrases)

        langPath_Phrase = os.path.join(video_dir, 'auto_extracted_Phrase.txt')
        langPath_PhraseIndex = os.path.join(video_dir, 'autoExtracted_Phrase_Index.txt')

        with open(langPath_Phrase, 'w') as f_phrase, \
                open(langPath_PhraseIndex, 'w') as f_phrase_Idx:
            for phrase in noun_phrases:
                f_phrase.write(phrase)
                f_phrase.write('\n')

                # len() counts characters here, so this only skips
                # single-character phrases; they get no index row.
                if len(phrase) <= 1:
                    continue

                indices = []
                for token in word_tokenize(phrase):
                    if token not in word_to_index:
                        raise ValueError(
                            'word %r of phrase %r (video %r) is not in the word list'
                            % (token, phrase, videoName))
                    indices.append(str(word_to_index[token]))

                # Right-pad short phrases with 0; longer phrases are kept whole
                # (a non-positive repeat count adds nothing).
                indices.extend(['0'] * (max_phrase_length - len(indices)))

                # Every entry, including the last one, is followed by a comma.
                f_phrase_Idx.write(''.join(entry + ',' for entry in indices))
                # Always terminate the row, so a phrase with max_phrase_length
                # or more tokens does not run into the next phrase's indices.
                f_phrase_Idx.write('\n')
  • 相关阅读:
    解决mybatis查询返回结果值串查
    MSSQL Export Excel
    Linux检测硬盘读取速度
    Linux修改用户密码
    Linux系统关闭防火墙端口
    Linux系统查看系统信息
    查找一个String中存储的多个数据
    编辑器vi命令
    去除一段文字最后一个符号
    替换Jar包中的一个文件 Replace a file in a JAR
  • 原文地址:https://www.cnblogs.com/wangxiaocvpr/p/10571212.html
Copyright © 2011-2022 走看看