zoukankan      html  css  js  c++  java
  • (Code) Python implementation of phrase extraction from sentences

    import os
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    import numpy as np 
    import string
    import nltk
    from nltk.tokenize import word_tokenize
    from textblob import TextBlob
    
    import pdb
    
    # Index rows written for each phrase are zero-padded up to this many entries.
    max_phrase_length = 5

    # Root directory of the dataset; every entry returned by os.listdir() below
    # is later treated as a per-video sub-directory name.
    basicPath = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/'

    path = basicPath
    files = os.listdir(path)
    print(path)

    # Vocabulary file. Each line becomes one entry of wordList, and a word's
    # position in wordList is the integer index written out for that word.
    word_base_path = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/word_list.txt'

    # Read the whole vocabulary and close the file right away instead of
    # leaving the handle open for the lifetime of the script.
    with open(word_base_path, 'r') as wordBase:
        lines = wordBase.readlines()

    # Drop the line terminator first, then any trailing '.' characters left on
    # the entry, so lookups compare against the bare word.
    wordList = [line.rstrip('\n').rstrip('.') for line in lines]
    
    
    # For every video directory: read the first line of its language.txt,
    # extract noun phrases with TextBlob, and write two files next to it:
    #   auto_extracted_Phrase.txt        - one noun phrase per line
    #   autoExtracted_Phrase_Index.txt   - one row of comma-terminated
    #                                      vocabulary indices per phrase,
    #                                      zero-padded to max_phrase_length

    # Word -> position of its first occurrence in wordList. setdefault keeps
    # the first occurrence, matching what wordList.index() would return, while
    # avoiding a linear scan of the vocabulary for every word.
    word_to_index = {}
    for position, known_word in enumerate(wordList):
        word_to_index.setdefault(known_word, position)

    for videoName in files:
        # os.listdir() also returns plain files that live in the dataset root
        # (e.g. word_list.txt); those have no language.txt inside, so skip them.
        if not os.path.isdir(os.path.join(path, videoName)):
            continue

        print(videoName)
        langPath = path + videoName + '/language.txt'        ## for other datset
        # Alternative layout where the description file is named after the video:
        # langPath = path + videoName + '/' + videoName+'.txt'

        # Only the first line of the description file is used.
        with open(langPath, 'r') as f:
            language = f.readline()

        blob = TextBlob(language)
        noun_phrases = blob.noun_phrases
        print(noun_phrases)

        langPath_Phrase = path + videoName + '/auto_extracted_Phrase.txt'
        langPath_PhraseIndex = path + videoName + '/autoExtracted_Phrase_Index.txt'

        # Context managers guarantee both outputs are flushed and closed even
        # if a phrase contains a word that is missing from the vocabulary.
        with open(langPath_Phrase, 'w') as f_phrase, \
                open(langPath_PhraseIndex, 'w') as f_phrase_Idx:
            for phrase in noun_phrases:
                f_phrase.write(phrase)
                f_phrase.write('\n')

                # NOTE(review): this tests the number of *characters*, so a
                # one-character phrase gets a line in the phrase file but no
                # row in the index file - confirm that is intended.
                if len(phrase) > 1:
                    row = []
                    for word in word_tokenize(phrase):
                        try:
                            row.append(str(word_to_index[word]))
                        except KeyError:
                            raise ValueError(
                                '%r (from phrase %r in %s) is not in the word list'
                                % (word, phrase, langPath))

                    # Pad short phrases with 0 up to max_phrase_length; longer
                    # phrases are written in full, not truncated.
                    row.extend(['0'] * (max_phrase_length - len(row)))

                    # Every entry is followed by a comma (including the last),
                    # and every row ends with a newline. The newline used to be
                    # written only when padding happened, which glued rows of
                    # phrases with max_phrase_length or more words together.
                    f_phrase_Idx.write(''.join(entry + ',' for entry in row))
                    f_phrase_Idx.write('\n')
  • 相关阅读:
    [Codeforces 933A]A Twisty Movement
    [Codeforces 100633J]Ceizenpok’s formula
    [HAOI 2011]向量
    [JSOI 2008]最大数
    [Codeforces 750E]New Year and Old Subsequence
    [BZOJ 3439]Kpm的MC密码
    [TJOI 2013]单词
    [SCOI 2011]糖果
    [BZOJ 2160]拉拉队排练
    [AtCoder arc090F]Number of Digits
  • 原文地址:https://www.cnblogs.com/wangxiaocvpr/p/10571212.html
Copyright © 2011-2022 走看看