  • (Code) Python implementation of phrase extraction from a sentence

    import os

    import nltk
    from nltk.tokenize import word_tokenize
    from textblob import TextBlob

    import pdb
    
    max_phrase_length = 5    # each phrase's word-index row is padded to this many entries
    
    basicPath = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/'
    
    path = basicPath
    files = os.listdir(path)    # one sub-directory per video
    print(path)
    
    word_base_path = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/word_list.txt'
    wordBase = open(word_base_path, 'r')
    wordList = []

    # build the vocabulary: one word per line, stripped of the trailing newline and period
    lines = wordBase.readlines()
    for line in lines:
        line_ = line.rstrip('\n').rstrip('.')
        # pdb.set_trace()
        wordList.append(line_)
    wordBase.close()
    
    
    for i in range(len(files)):
        videoName = files[i]
        print(videoName)
        langPath = path + videoName + '/language.txt'        ## for other dataset
        # langPath = path + videoName + '/' + videoName + '.txt'
        f = open(langPath, 'r')
        language = f.readline()
        f.close()
        words = word_tokenize(language)
        token_results = nltk.pos_tag(words)    # POS tags (not used below)
        blob = TextBlob(language)

        print(blob.noun_phrases)
    
        langPath_Phrase = path + videoName + '/auto_extracted_Phrase.txt'
        f_phrase = open(langPath_Phrase, 'w')
    
        langPath_PhraseIndex = path + videoName + '/autoExtracted_Phrase_Index.txt'
        f_phrase_Idx = open(langPath_PhraseIndex, 'w')    
    
        # pdb.set_trace()
    
        # write each noun phrase and its zero-padded word-index row
        for j in range(len(blob.noun_phrases)):
            phrase = blob.noun_phrases[j]
            f_phrase.write(phrase)
            f_phrase.write('\n')

            written_num = 0

            if len(phrase) > 1:
                word_ = word_tokenize(phrase)

                for phraseIndex in range(len(word_)):
                    # raises ValueError if a word is missing from word_list.txt
                    wordINDEX = wordList.index(word_[phraseIndex])
                    f_phrase_Idx.write(str(wordINDEX))
                    f_phrase_Idx.write(',')
                    written_num = written_num + 1

                # pad the row with 0 entries up to max_phrase_length, then end the row
                if written_num < max_phrase_length:
                    diff_num = max_phrase_length - written_num
                    for k in range(diff_num):
                        f_phrase_Idx.write('0')
                        f_phrase_Idx.write(',')
                f_phrase_Idx.write('\n')

        f_phrase.close()
        f_phrase_Idx.close()
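  • For reference, a minimal self-contained sketch of the two steps the script chains together: TextBlob's noun_phrases extraction and the zero-padded word-index encoding. The sentence and the toy word_list below are illustrative assumptions rather than entries from the dataset, and TextBlob needs its NLTK corpora downloaded first (python -m textblob.download_corpora).

    from textblob import TextBlob
    from nltk.tokenize import word_tokenize

    max_phrase_length = 5
    # toy vocabulary; index 0 doubles as the padding value, as in the script above
    word_list = ['<pad>', 'the', 'white', 'car', 'red', 'light', 'a', 'man']

    sentence = 'the white car passes a red light'
    blob = TextBlob(sentence)
    print(blob.noun_phrases)    # e.g. ['white car', 'red light']

    for phrase in blob.noun_phrases:
        # map each word of the phrase to its position in word_list, skipping unknown words
        indices = [word_list.index(w) for w in word_tokenize(phrase) if w in word_list]
        # pad with 0 up to max_phrase_length, mirroring the index file written above
        indices += [0] * (max_phrase_length - len(indices))
        print(phrase, '->', indices[:max_phrase_length])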
  • Original article: https://www.cnblogs.com/wangxiaocvpr/p/10571212.html