zoukankan      html  css  js  c++  java
  • (Code) Python implementation of phrase extraction from a sentence

    import os
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    import numpy as np 
    import string
    import nltk
    from nltk.tokenize import word_tokenize
    from textblob import TextBlob
    
    import pdb
    
    # Number of word indices written per phrase row; rows for shorter phrases
    # are padded with 0 up to this length.
    max_phrase_length = 5

    # Dataset root. NOTE(review): machine-specific absolute path; presumably
    # each sub-directory is one video sample -- confirm against the dataset.
    basicPath = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/'

    path = basicPath
    files = os.listdir(path)
    print(path)

    # Vocabulary file stored directly under the dataset root.
    word_base_path = basicPath + 'word_list.txt'

    # Load the vocabulary, one entry per line. The position of an entry in
    # wordList is the integer index later written for that word. The line
    # terminator is stripped first, then any trailing '.' characters.
    wordList = []
    with open(word_base_path, 'r') as wordBase:
        for line in wordBase:
            wordList.append(line.rstrip('\n').rstrip('.'))
    
    
    # Map every vocabulary word to the position of its FIRST occurrence in
    # wordList (the same answer list.index() gives), so each lookup is O(1)
    # instead of a linear scan per token.
    word_to_index = {}
    for position, known_word in enumerate(wordList):
        word_to_index.setdefault(known_word, position)

    # For every video folder: read the first line of its language.txt, extract
    # the noun phrases with TextBlob, and write
    #   auto_extracted_Phrase.txt       -- one phrase per line
    #   autoExtracted_Phrase_Index.txt  -- one comma-terminated row of
    #                                      vocabulary indices per phrase,
    #                                      zero-padded to max_phrase_length
    for videoName in files:
        video_dir = path + videoName
        # os.listdir() also returns plain files such as word_list.txt, which
        # lives in the same directory; only folders can hold a language.txt.
        if not os.path.isdir(video_dir):
            continue
        print(videoName)

        # The original script also carried a commented-out alternative layout
        # for other datasets: path + videoName + '/' + videoName + '.txt'.
        with open(video_dir + '/language.txt', 'r') as f:
            language = f.readline()  # only the first line is used

        noun_phrases = TextBlob(language).noun_phrases
        print(noun_phrases)

        with open(video_dir + '/auto_extracted_Phrase.txt', 'w') as f_phrase:
            with open(video_dir + '/autoExtracted_Phrase_Index.txt', 'w') as f_phrase_Idx:
                for phrase in noun_phrases:
                    f_phrase.write(phrase)
                    f_phrase.write('\n')

                    # NOTE(review): phrases of at most one character get a
                    # line in the phrase file but no index row (behaviour
                    # kept from the original) -- confirm this is intended,
                    # since it leaves the two files misaligned.
                    if len(phrase) <= 1:
                        continue

                    indices = []
                    for token in word_tokenize(phrase):
                        if token not in word_to_index:
                            # Same exception type list.index() raised, but
                            # with enough context to locate the bad sample.
                            raise ValueError(
                                '%r (from phrase %r in %s) is not in the word list'
                                % (token, phrase, video_dir))
                        indices.append(str(word_to_index[token]))

                    # Pad short rows with 0. Rows longer than
                    # max_phrase_length are written in full, as before.
                    indices.extend(['0'] * (max_phrase_length - len(indices)))

                    # Every value is followed by ','. The row is always
                    # newline-terminated; previously the newline was only
                    # written when padding was needed, so long rows ran into
                    # the next one.
                    f_phrase_Idx.write(''.join(index + ',' for index in indices))
                    f_phrase_Idx.write('\n')
  • 相关阅读:
    1405ST软件测试课的要求补充说明
    软测实验课安排和考试
    Asp.Net 4.0 FormAuthentication 原理
    微信支付-“申请退款”接口遇到curl出错,错误码:58
    前端资源构建-Grunt环境搭建
    微信服务号开发-获取用户位置信息
    微信支付开发-当前页面的URL未注册
    Using Redis to store php session
    nginx performance monitor
    thinkphp nginx php-fpm url rewrite 导致 404 错误
  • 原文地址:https://www.cnblogs.com/wangxiaocvpr/p/10571212.html
Copyright © 2011-2022 走看看