zoukankan      html  css  js  c++  java
  • (Code) Python implementation of phrase extraction from a sentence

    import os
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    import numpy as np 
    import string
    import nltk
    from nltk.tokenize import word_tokenize
    from textblob import TextBlob
    
    import pdb
    
    # Number of columns in each row of the phrase-index file: rows with fewer
    # word indices are padded with '0' entries up to this length.
    max_phrase_length = 5

    # Dataset root. It is listed here and used as the prefix for every
    # per-entry path built later in the script (note the trailing slash).
    basicPath = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/'

    path = basicPath
    files = os.listdir(path)
    print(path)

    # Vocabulary file, one entry per line. The position of an entry in
    # wordList is the integer written to the phrase-index files.
    word_base_path =  '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/word_list.txt'
    wordList = []

    # The context manager guarantees the handle is closed even if reading fails.
    with open(word_base_path, 'r') as wordBase:
        lines = wordBase.readlines()

    for line in lines:
        # Drop the line terminator first, then any trailing periods.
        line_ = line.rstrip('\n').rstrip('.')
        wordList.append(line_)
    
    
    # For every sample directory under `path`:
    #   1. read the first line of <sample>/language.txt,
    #   2. extract its noun phrases with TextBlob,
    #   3. write one phrase per line to <sample>/auto_extracted_Phrase.txt,
    #   4. write one row of wordList positions per phrase to
    #      <sample>/autoExtracted_Phrase_Index.txt. Every entry is followed by
    #      a comma and short rows are padded with '0' entries up to
    #      max_phrase_length columns.
    for videoName in files:
        videoDir = os.path.join(path, videoName)

        # os.listdir(path) also returns plain files (word_list.txt lives in
        # this very directory); opening '<file>/language.txt' would raise, so
        # only real sample directories are processed.
        if not os.path.isdir(videoDir):
            continue

        print(videoName)

        # Alternative layout seen in the original script for another dataset:
        #   path + videoName + '/' + videoName + '.txt'
        langPath = os.path.join(videoDir, 'language.txt')
        with open(langPath, 'r') as f:
            # Only the first line of the description file is used.
            language = f.readline()

        blob = TextBlob(language)
        noun_phrases = blob.noun_phrases
        print(noun_phrases)

        langPath_Phrase = os.path.join(videoDir, 'auto_extracted_Phrase.txt')
        langPath_PhraseIndex = os.path.join(videoDir, 'autoExtracted_Phrase_Index.txt')

        # Both outputs are closed on exit, including when a word lookup fails.
        with open(langPath_Phrase, 'w') as f_phrase:
            with open(langPath_PhraseIndex, 'w') as f_phrase_Idx:
                for phrase in noun_phrases:
                    f_phrase.write(phrase)
                    f_phrase.write('\n')

                    # Phrases of at most one character get no index row, so
                    # the two output files can differ in line count.
                    if len(phrase) <= 1:
                        continue

                    # wordList.index raises ValueError for a token that is not
                    # in the vocabulary; that failure is deliberately left loud.
                    # NOTE(review): the padding value '0' is also the index of
                    # the first wordList entry - confirm consumers can tell
                    # them apart.
                    row = [str(wordList.index(token)) for token in word_tokenize(phrase)]

                    # Pad short rows; rows already at or above the limit are
                    # written unpadded and untruncated, as before.
                    missing = max_phrase_length - len(row)
                    if missing > 0:
                        row.extend(['0'] * missing)

                    # Terminate every row. Previously the newline was written
                    # only for padded rows, so a phrase with max_phrase_length
                    # or more tokens ran into the next phrase's row.
                    f_phrase_Idx.write(''.join(entry + ',' for entry in row))
                    f_phrase_Idx.write('\n')
  • 相关阅读:
    喜欢这效果
    jQuery.Validate 使用例子
    网站安全要略谨记
    asp.net发送邮件
    URL参数的加号等特殊字符的处理
    sqlhelper
    Python单元测试框架
    基于Selenium2与Python自动化测试环境搭建
    Robot Framework和Selenium 2 Grid集成指南
    APP性能测试(CPU)
  • 原文地址:https://www.cnblogs.com/wangxiaocvpr/p/10571212.html
Copyright © 2011-2022 走看看