zoukankan      html  css  js  c++  java
  • Python文本处理(结巴分词并去除符号)

    import re
    import jieba.analyse
    import codecs
    import pandas as pd
    
    def simplification_text(xianbingshi, output_path=None):
        """Extract the text enclosed by ``<b>`` ... ``<e>`` markers from a file.

        Every input line is stripped of surrounding whitespace, then each
        non-greedy span that sits between a ``<b>`` marker and the next
        ``<e>`` marker is collected. The collected fragments are written to
        the output file, one per line, in the order they were found.

        Args:
            xianbingshi: Path of the UTF-8 encoded input file to scan.
            output_path: Path of the UTF-8 file to write. When ``None`` the
                original hard-coded location is used.

        Returns:
            None. The result is the file written at ``output_path``.
        """
        if output_path is None:
            # NOTE(review): this default looks like a Windows path whose
            # backslashes were lost; it is kept verbatim -- confirm the
            # intended location or pass output_path explicitly.
            output_path = r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29yiwoqucodexianbingshi_write.txt'

        # Look-behind / look-ahead keep the markers themselves out of the match.
        marker_pattern = re.compile(r'(?<=<b>).*?(?=<e>)')

        fragments = []
        with codecs.open(xianbingshi, 'r', 'utf8') as f:
            for line in f:
                fragments.extend(marker_pattern.findall(line.strip()))

        with codecs.open(output_path, 'w', 'utf8') as f:
            f.writelines(fragment + '\n' for fragment in fragments)
    def jieba_text(input_path=None, output_path='word.txt'):
        """Segment a text file with jieba and write its distinct tokens.

        The whole input file is read as UTF-8 and cut with ``jieba.cut`` in
        precise mode (``cut_all=False``). Each token is stripped of
        surrounding whitespace, duplicates are dropped while keeping the
        first occurrence of every token in its original order, and the
        remaining tokens are written one per line.

        Args:
            input_path: Path of the UTF-8 text file to segment. When ``None``
                the original hard-coded location is used.
            output_path: Path of the UTF-8 file that receives the tokens.

        Returns:
            None. The result is the file written at ``output_path``.
        """
        if input_path is None:
            # NOTE(review): this default looks like a Windows path whose
            # backslashes were lost; it is kept verbatim -- confirm the
            # intended location or pass input_path explicitly.
            input_path = r"C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29xianbingshi_write.txt"

        # Read inside a context manager so the handle is always closed.
        with open(input_path, encoding='utf-8') as f:
            data = f.read()

        stripped_tokens = (token.strip() for token in jieba.cut(data, cut_all=False))
        # dict.fromkeys keeps only the first occurrence of each token and
        # preserves insertion order, so no DataFrame round-trip is needed.
        unique_tokens = list(dict.fromkeys(stripped_tokens))

        with codecs.open(output_path, 'w', 'utf8') as w:
            w.writelines(token + '\n' for token in unique_tokens)
    def word_messy(word, output_path='word.txt'):
        """Refine a word list by removing numeric and ASCII-alphanumeric noise.

        Each line of the input file is passed through one substitution that
        deletes, anchored at the start of the line:

        * a leading decimal number such as ``12.5``;
        * a line made up only of ASCII letters and digits;
        * a line made up only of digits (or an empty line);
        * a line that is a signed integer or decimal such as ``-3.14``;
        * a leading run of 4 to 40 ASCII letters and digits.

        The resulting lines are sorted and written one per line. A line whose
        whole content was removed is still written, as an empty line.

        Args:
            word: Path of the UTF-8 encoded word list to clean.
            output_path: Path of the UTF-8 file that receives the result.

        Returns:
            None. The result is the file written at ``output_path``.
        """
        # Raw string so that \d and \. reach the regex engine intact; without
        # the backslashes "d*" matches the letter d and "." matches anything.
        noise_pattern = re.compile(
            r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?"
        )

        # The input is fully consumed and closed before the output is opened,
        # so reading and writing the same path is safe.
        with codecs.open(word, 'r', 'utf8') as f:
            cleaned = sorted(noise_pattern.sub('', line) for line in f)

        with codecs.open(output_path, 'w', 'utf8') as w:
            # Normalise every entry to exactly one trailing newline.
            w.writelines(entry.strip("\n") + '\n' for entry in cleaned)
    
    if __name__ == '__main__':
        import sys

        # An optional first command-line argument overrides the built-in
        # input path, so the script is not tied to a single machine.
        # NOTE(review): the fallback looks like a Windows path whose
        # backslashes were lost; it is kept verbatim -- confirm the real
        # location.
        default_input = r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29yiwoquxianbingshi_sub_sen_all(1).txt'
        xianbingshi = sys.argv[1] if len(sys.argv) > 1 else default_input
        simplification_text(xianbingshi)
  • 相关阅读:
    IE8,IE10下载的临时文件到哪里去了???
    安全退出,清空Session或Cookie
    删掉SQL Server登录时登录名下拉列表框中的选项
    C#中==、Equals、ReferenceEquals的区别
    [转载]C#中as和is关键字的用法
    HTML5权威指南 5.绘制图形
    HTML5权威指南 3.HTML5的结构
    HTML5权威指南 2.HTML5与HTML4的区别
    HTML5权威指南 1.Web时代的变迁
    web前端黑客技术揭秘 9.Web蠕虫
  • 原文地址:https://www.cnblogs.com/yiwoqu/p/11542002.html
Copyright © 2011-2022 走看看