  • KNN algorithm source code

    import pandas as pd
    import os


    def creatcatesdir(data, target):
        # Get the de-duplicated list of categories
        cates = list(data['channelName'].unique())
        # Print the categories
        print(cates)
        # Create one folder per category
        for cate in cates:
            # Build the sub-directory path
            final_path = target + cate
            try:
                os.mkdir(final_path)  # create the directory
            except Exception as e:
                print(str(e))


    def excel2txt(data, target):
        # Create the category directories
        creatcatesdir(data, target)
        # Fetch the excel rows one by one
        for index, row in data.iterrows():
            # Article body
            content = row['content']
            # File name -> article id
            filename = row['id']
            # Article title
            title = row['title']
            # Sub-directory -> category
            cate = row['channelName']
            # Build the target path
            txt_path = target + cate + os.sep
            # Write title and body into one txt file
            with open(txt_path + str(filename) + ".txt", encoding='utf-8', mode='wt') as f:
                f.write(str(title) + str(content))


    def main():
        # Output directory for the exported articles
        targetfile = "../article/"
        # Sheet indices in the workbook
        sheets = [1, 2, 3, 4, 5, 6, 7, 8]
        # Walk every sheet and dump its rows to txt files
        for sheet in sheets:
            data = pd.read_excel('1.xlsx', sheet_name=sheet)
            excel2txt(data, targetfile)


    if __name__ == '__main__':
        main()
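
After the export, `../article/` holds one sub-directory per category with one txt file per article. A quick sanity check of that layout might look like this (a minimal sketch; only the `../article/` path comes from the script above, everything else is illustrative):

    import os

    ARTICLE_DIR = "../article/"

    # Count the txt files written for each category
    for cate in sorted(os.listdir(ARTICLE_DIR)):
        cate_dir = os.path.join(ARTICLE_DIR, cate)
        if os.path.isdir(cate_dir):
            n = len([f for f in os.listdir(cate_dir) if f.endswith(".txt")])
            print(cate, n)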


    ****************************************************************************************
    # encoding=utf-8
    # Walk the article directory and run ProsessofWords on every file
    import jieba
    import os


    def EnumPathFiles(path, callback, stop_words_list):
        if not os.path.isdir(path):
            print('Error:"', path, '" is not a directory or does not exist.')
            return
        # os.walk already descends into every sub-directory,
        # so no explicit recursion is needed here
        for root, dirs, files in os.walk(path):
            for d in dirs:
                print(d)
            for f in files:
                callback(root, f, stop_words_list)


    def ProsessofWords(textpath, stop_words_list):
        with open(textpath, 'r', encoding='utf-8') as f:
            text = f.read()
        outstr = ''
        # Segment the text with jieba (precise mode)
        seg_list = jieba.cut(text, cut_all=False)
        for word in seg_list:
            # Drop stopwords and bare spaces, join the rest with spaces
            if word not in stop_words_list and word != ' ':
                outstr += word
                outstr += " "
        # Overwrite the file with the segmented text
        with open(textpath, 'w+', encoding='utf-8') as f:
            f.write(outstr)


    def callback1(path, filename, stop_words_list):
        textpath = os.path.join(path, filename)
        print(textpath)
        ProsessofWords(textpath, stop_words_list)


    if __name__ == '__main__':
        # Load the stopword list, one word per line
        stopwords_file = "../stopword/stopword.txt"
        stop_words = list()
        with open(stopwords_file, "r", encoding='utf-8') as stop_f:
            for line in stop_f.readlines():
                line = line.strip()
                if not len(line):
                    continue
                stop_words.append(line)
        print(len(stop_words))

        EnumPathFiles(r'../article', callback1, stop_words)
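
A quick way to see what ProsessofWords does to a file (a minimal sketch; the sentence and the two stopwords are made up for illustration):

    import jieba

    # Illustrative stopword list and input text (not from the real corpus)
    stop_words = ['的', '了']
    text = '我买了一辆新汽车'

    # Precise-mode segmentation, then drop stopwords and bare spaces
    words = [w for w in jieba.cut(text, cut_all=False) if w not in stop_words and w != ' ']
    print(' '.join(words))  # e.g. 我 买 一辆 新 汽车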

    ****************************************************************************
    # encoding=utf-8
    import os


    def merge_file(path):
        files = os.listdir(path)
        print(files)
        # Map each category folder name to a numeric label
        label_map = {'娱乐': '1', '汽车': '2', '游戏': '3', '科技': '4', '综合体育最新': '5', '财经': '6'}
        outfile_train = '../dataset_train/x_train.txt'
        outfile_label = '../dataset_train/y_train.txt'
        result_train = open(outfile_train, 'a', encoding='utf-8')
        result_label = open(outfile_label, 'a', encoding='utf-8')
        for file in files:
            text_dir = os.path.join(path, file)
            # Skip anything that is not a category folder (e.g. earlier output files)
            if not os.path.isdir(text_dir):
                continue
            texts = os.listdir(text_dir)
            for text in texts:
                txt_file_dir = os.path.join(text_dir, text)
                print(txt_file_dir)
                with open(txt_file_dir, 'r', encoding='utf-8') as f:
                    content = f.read()
                if len(content) > 3000:
                    content = content[0:3000]  # truncate overly long articles
                # One article per line: newline separates documents, since the
                # segmented words inside an article are already space-separated
                result_train.write(content.replace('\n', ' ') + '\n')
                result_label.write(label_map[file] + '\n')
        result_label.close()
        result_train.close()


    if __name__ == "__main__":
        path = r"../dataset_train"
        merge_file(path)
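
The classifier below also expects `../dataset_test/x_test.txt` and `../dataset_test/y_test.txt`, which the post never builds. One way is to parameterize the output paths of the same merge logic (a hypothetical helper, assuming the test articles sit under `../dataset_test/` in the same per-category layout):

    import os

    label_map = {'娱乐': '1', '汽车': '2', '游戏': '3', '科技': '4', '综合体育最新': '5', '财经': '6'}

    # Hypothetical variant of merge_file with the output files as parameters
    def merge_to(path, texts_out, labels_out):
        with open(texts_out, 'a', encoding='utf-8') as t_out, \
             open(labels_out, 'a', encoding='utf-8') as l_out:
            for cate in os.listdir(path):
                cate_dir = os.path.join(path, cate)
                if not os.path.isdir(cate_dir):
                    continue  # skip previously written output files
                for name in os.listdir(cate_dir):
                    with open(os.path.join(cate_dir, name), 'r', encoding='utf-8') as f:
                        content = f.read().replace('\n', ' ')[:3000]
                    t_out.write(content + '\n')  # one article per line
                    l_out.write(label_map[cate] + '\n')

    merge_to(r'../dataset_test', '../dataset_test/x_test.txt', '../dataset_test/y_test.txt')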

    *********************************************************************************
    # coding:utf-8
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.neighbors import KNeighborsClassifier

    # Load the data: one article per line, labels aligned line by line
    print('(1) load texts...')
    train_texts = open('../dataset_train/x_train.txt', encoding='utf-8').read().strip().split('\n')
    train_labels = open('../dataset_train/y_train.txt', encoding='utf-8').read().strip().split('\n')
    test_texts = open('../dataset_test/x_test.txt', encoding='utf-8').read().strip().split('\n')
    test_labels = open('../dataset_test/y_test.txt', encoding='utf-8').read().strip().split('\n')
    all_text = train_texts + test_texts

    # Feature extraction
    print('(2) doc to var...')

    # Build the vocabulary on the whole corpus, then reuse it so the
    # train and test matrices live in the same feature space
    count_v0 = CountVectorizer()
    counts_all = count_v0.fit_transform(all_text)
    count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_train = count_v1.fit_transform(train_texts)
    print("the shape of train is " + repr(counts_train.shape))
    count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_test = count_v2.fit_transform(test_texts)
    print("the shape of test is " + repr(counts_test.shape))

    # Re-weight the raw counts with TF-IDF
    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit(counts_train).transform(counts_train)
    test_data = tfidftransformer.fit(counts_test).transform(counts_test)

    x_train = train_data
    y_train = train_labels
    x_test = test_data
    y_test = test_labels

    # Fit a KNN model for each K from 1 to 14 and report its accuracy
    print('(3) KNN...')
    for x in range(1, 15):
        knnclf = KNeighborsClassifier(n_neighbors=x)
        knnclf.fit(x_train, y_train)
        preds = knnclf.predict(x_test)
        num = 0
        preds = preds.tolist()
        for i, pred in enumerate(preds):
            if int(pred) == int(y_test[i]):
                num += 1
        # The fraction of correct predictions is the accuracy
        print('K= ' + str(x) + ', accuracy: ' + str(float(num) / len(preds)))
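
The counting loop above computes accuracy (the share of correct predictions). The same sweep can be scored with sklearn directly (a sketch reusing the x_train/y_train/x_test/y_test variables prepared above):

    from sklearn.metrics import accuracy_score
    from sklearn.neighbors import KNeighborsClassifier

    # Same K sweep, scored with sklearn instead of a manual loop
    for k in range(1, 15):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train)
        print('K=', k, 'accuracy:', accuracy_score(y_test, knn.predict(x_test)))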









  • Original article: https://www.cnblogs.com/huangmouren233/p/14707015.html