zoukankan      html  css  js  c++  java
  • 机器学习之新闻文本分类。

    新闻文本分类首先需要通过大量的训练之后获得一个存放关键字的表,

    之后再输入一个新闻内容,通过代码就可以自动判断出这个新闻的类别,

    我这里是在已经有了新闻文本的关键词表之后所做的处理,代码如下:

    # encoding=utf-8                                #遍历文件,用ProsessofWords处理文件
    from imp import reload
    import jieba
    import os
    import sys
    from imp import reload
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.neighbors import KNeighborsClassifier
    
    
    # Re-import the already loaded ``sys`` module.
    # NOTE(review): this looks like a leftover of the Python 2
    # ``reload(sys); sys.setdefaultencoding(...)`` idiom; nothing in the visible
    # code depends on it — confirm before removing.
    reload(sys)
    # None of the constants below is referenced by the functions visible in
    # this listing.
    # NOTE(review): the names suggest word-embedding / train-test-split
    # settings from another version of the script — TODO confirm.
    VECTOR_DIR = 'vectors.bin'
    MAX_SEQUENCE_LENGTH = 100
    EMBEDDING_DIM = 200
    TEST_SPLIT = 0.2
    
    
    def deposit_txt(title, content):
        """Write the news title followed by its content to ``news/news.txt``.

        The file is truncated on every call, so it only ever holds the most
        recently stored article. The ``news`` directory must already exist.

        Args:
            title: Headline text.
            content: Body text; written directly after ``title`` with no
                separator in between.
        """
        textpath = "news/news.txt"
        # The context manager closes the handle even if write() raises.
        with open(textpath, 'w', encoding='utf-8') as f:
            f.write(title + content)
    
    
    def EnumPathFiles(path, callback, stop_words_list):
        """Invoke ``callback`` exactly once for every file beneath ``path``.

        Args:
            path: Root directory to scan.
            callback: Callable invoked as
                ``callback(root, filename, stop_words_list)``, where ``root``
                is the directory that contains ``filename``.
            stop_words_list: Passed through to ``callback`` unchanged.

        Returns:
            None. When ``path`` is not an existing directory an error message
            is printed and no file is visited.
        """
        if not os.path.isdir(path):
            print('Error:"', path, '" is not a directory or does not exist.')
            return

        # os.walk already descends into every sub-directory. Recursing
        # manually on top of that would hand nested files to the callback
        # several times, so the walk alone drives the traversal.
        for root, dirs, files in os.walk(path):
            for d in dirs:
                print(d)
            for f in files:
                callback(root, f, stop_words_list)
    
    
    def ProsessofWords(textpath, stop_words_list):
        """Segment a text file with jieba and rewrite it in place.

        The file is read as UTF-8, tokenised with
        ``jieba.cut(text, cut_all=False)``, and every token that is neither a
        stop word nor a tab character is written back followed by one space.

        Args:
            textpath: Path of the file to read and then overwrite.
            stop_words_list: Iterable of tokens to drop from the output.
        """
        with open(textpath, 'r', encoding='utf-8') as f:
            text = f.read()

        # Build the set once so each membership test is O(1) instead of a
        # scan over the whole stop-word list per token.
        stop_words = set(stop_words_list)
        outstr = ''.join(
            word + ' '
            for word in jieba.cut(text, cut_all=False)
            if word not in stop_words and word != '\t'
        )

        with open(textpath, 'w', encoding='utf-8') as f:
            f.write(outstr)
    
    
    def callback1(path, filename, stop_words_list):
        """Segment one file found during the directory walk.

        Args:
            path: Directory that contains the file.
            filename: Name of the file inside ``path``.
            stop_words_list: Stop words forwarded to ``ProsessofWords``.
        """
        # os.path.join picks the separator of the running platform instead of
        # a hard-coded backslash.
        textpath = os.path.join(path, filename)
        print(textpath)
        ProsessofWords(textpath, stop_words_list)
    
    
    def fenci():
        """Load the stop-word list and segment every file under ``news``.

        Stop words are read from ``stopword/stopword.txt`` (UTF-8, one entry
        per line); surrounding whitespace is stripped and blank lines are
        skipped. The number of stop words is printed, then each file below
        the ``news`` directory is handed to ``callback1``.
        """
        stopwords_file = "stopword/stopword.txt"
        # The comprehension must run inside the ``with`` block because it
        # reads from the still-open file handle.
        with open(stopwords_file, "r", encoding='utf-8') as stop_f:
            stripped_lines = (line.strip() for line in stop_f)
            stop_words = [word for word in stripped_lines if word]
        print(len(stop_words))
        EnumPathFiles(r'news', callback1, stop_words)
    
    
    def _read_lines(path):
        """Return the UTF-8 text file at ``path`` split on newline characters."""
        with open(path, encoding='utf-8') as f:
            return f.read().split('\n')


    def CV_Tfidf():
        """Classify the article stored in ``news/news.txt`` with TF-IDF + KNN.

        Training documents and labels are read line by line from
        ``dataset_train/x_train.txt`` and ``dataset_train/y_train.txt``; the
        documents to classify are the lines of ``news/news.txt``. A shared
        vocabulary is built over all documents, term counts are converted to
        TF-IDF weights, and a 3-nearest-neighbour classifier fitted on the
        training data predicts a label for each test line.

        Returns:
            The message for the first predicted label that is one of
            ``'1'`` .. ``'9'``, or ``None`` when no prediction matches a known
            label.
        """
        category_messages = {
            '1': "此新闻为娱乐类新闻",
            '2': "此新闻为汽车类新闻",
            '3': "此新闻为游戏类新闻",
            '4': "此新闻为科技类新闻",
            '5': "此新闻为综合体育最新类新闻",
            '6': "此新闻为财经类新闻",
            '7': "此新闻为房产类新闻",
            '8': "此新闻为教育类新闻",
            '9': "此新闻为军事类新闻",
        }

        # Load data
        print('(1) load texts...')
        train_texts = _read_lines('dataset_train/x_train.txt')
        train_labels = _read_lines('dataset_train/y_train.txt')
        test_texts = _read_lines('news/news.txt')
        all_text = train_texts + test_texts

        # Feature extraction
        print('(2) doc to var...')

        # The vocabulary is learned over train + test so both count matrices
        # share identical columns.
        count_v0 = CountVectorizer()
        count_v0.fit(all_text)
        count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
        counts_train = count_v1.fit_transform(train_texts)
        print("the shape of train is " + repr(counts_train.shape))
        count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
        counts_test = count_v2.fit_transform(test_texts)
        print("the shape of test is " + repr(counts_test.shape))

        # Fit the IDF weights on the training counts only and reuse them for
        # the test counts; re-fitting on the test data would weight its terms
        # differently from the vectors the classifier was trained on.
        tfidftransformer = TfidfTransformer()
        x_train = tfidftransformer.fit_transform(counts_train)
        x_test = tfidftransformer.transform(counts_test)
        y_train = train_labels

        # KNN model
        print('(3) KNN...')
        knnclf = KNeighborsClassifier(n_neighbors=3)
        knnclf.fit(x_train, y_train)
        preds = knnclf.predict(x_test).tolist()

        for pred in preds:
            print(pred)
            message = category_messages.get(pred)
            if message is not None:
                return message
        return None
    def news(title, content):
        """Store an article, segment it, and return its predicted category.

        Args:
            title: Headline of the article.
            content: Body text of the article.

        Returns:
            Whatever ``CV_Tfidf()`` returns for the stored article.
        """
        deposit_txt(title, content)  # persist the raw article
        fenci()                      # tokenise and strip stop words
        return CV_Tfidf()            # classify the prepared text
  • 相关阅读:
    kettle在linux下执行任务
    activemq spring 集成与测试
    mysql创建存储过程,定时任务,定时删除log
    dubbo-admin 无法支持JDK1.8
    自定义事件驱动加异步执行
    AOP 实现自定义注解
    mybatis-plus的学习
    阿里巴巴架构师的成长之路
    Redis错误:jedis.exceptions.JedisDataException: ERR Client sent AUTH, but no password is set
    freemark基础知识
  • 原文地址:https://www.cnblogs.com/chaogehahaha/p/14908678.html
Copyright © 2011-2022 走看看