  • KNN algorithm source code

    import pandas as pd
    import os


    def creatcatesdir(data, target):
        # Get the de-duplicated list of categories
        cates = list(data['channelName'].unique())
        # Print the categories
        print(cates)
        # Create one folder per category
        for cate in cates:
            # Build the sub-directory path
            final_path = target + cate
            try:
                os.mkdir(final_path)  # create the directory
            except Exception as e:
                print(str(e))


    def excel2txt(data, target):
        # Create the category directories
        creatcatesdir(data, target)
        # Fetch the excel rows one by one
        for index, row in data.iterrows():
            # Article body
            content = row['content']
            # File name -> article id
            filename = row['id']
            # Article title
            title = row['title']
            # Sub-directory -> category
            cate = row['channelName']
            # Build the target path
            txt_path = target + cate + os.sep
            # Write title and body into one txt file
            with open(txt_path + str(filename) + ".txt", encoding='utf-8', mode='wt') as f:
                f.write(str(title) + str(content))


    def main():
        # Output directory for the exported articles
        targetfile = "../article/"
        # Sheet indices in the workbook
        sheets = [1, 2, 3, 4, 5, 6, 7, 8]
        # Walk every sheet and dump its rows to txt files
        for sheet in sheets:
            data = pd.read_excel('1.xlsx', sheet_name=sheet)
            excel2txt(data, targetfile)


    if __name__ == '__main__':
        main()
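
After the export, `../article/` holds one sub-directory per category with one txt file per article. A quick sanity check of that layout might look like this (a minimal sketch; only the `../article/` path comes from the script above, everything else is illustrative):

    import os

    ARTICLE_DIR = "../article/"

    # Count the txt files written for each category
    for cate in sorted(os.listdir(ARTICLE_DIR)):
        cate_dir = os.path.join(ARTICLE_DIR, cate)
        if os.path.isdir(cate_dir):
            n = len([f for f in os.listdir(cate_dir) if f.endswith(".txt")])
            print(cate, n)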


    ****************************************************************************************
    # encoding=utf-8
    # Walk the article directory and run ProsessofWords on every file
    import jieba
    import os


    def EnumPathFiles(path, callback, stop_words_list):
        if not os.path.isdir(path):
            print('Error:"', path, '" is not a directory or does not exist.')
            return
        # os.walk already descends into every sub-directory,
        # so no explicit recursion is needed here
        for root, dirs, files in os.walk(path):
            for d in dirs:
                print(d)
            for f in files:
                callback(root, f, stop_words_list)


    def ProsessofWords(textpath, stop_words_list):
        with open(textpath, 'r', encoding='utf-8') as f:
            text = f.read()
        outstr = ''
        # Segment the text with jieba (precise mode)
        seg_list = jieba.cut(text, cut_all=False)
        for word in seg_list:
            # Drop stopwords and bare spaces, join the rest with spaces
            if word not in stop_words_list and word != ' ':
                outstr += word
                outstr += " "
        # Overwrite the file with the segmented text
        with open(textpath, 'w+', encoding='utf-8') as f:
            f.write(outstr)


    def callback1(path, filename, stop_words_list):
        textpath = os.path.join(path, filename)
        print(textpath)
        ProsessofWords(textpath, stop_words_list)


    if __name__ == '__main__':
        # Load the stopword list, one word per line
        stopwords_file = "../stopword/stopword.txt"
        stop_words = list()
        with open(stopwords_file, "r", encoding='utf-8') as stop_f:
            for line in stop_f.readlines():
                line = line.strip()
                if not len(line):
                    continue
                stop_words.append(line)
        print(len(stop_words))

        EnumPathFiles(r'../article', callback1, stop_words)
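
A quick way to see what ProsessofWords does to a file (a minimal sketch; the sentence and the two stopwords are made up for illustration):

    import jieba

    # Illustrative stopword list and input text (not from the real corpus)
    stop_words = ['的', '了']
    text = '我买了一辆新汽车'

    # Precise-mode segmentation, then drop stopwords and bare spaces
    words = [w for w in jieba.cut(text, cut_all=False) if w not in stop_words and w != ' ']
    print(' '.join(words))  # e.g. 我 买 一辆 新 汽车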

    ****************************************************************************
    # encoding=utf-8
    import os


    def merge_file(path):
        files = os.listdir(path)
        print(files)
        # Map each category folder name to a numeric label
        label_map = {'娱乐': '1', '汽车': '2', '游戏': '3', '科技': '4', '综合体育最新': '5', '财经': '6'}
        outfile_train = '../dataset_train/x_train.txt'
        outfile_label = '../dataset_train/y_train.txt'
        result_train = open(outfile_train, 'a', encoding='utf-8')
        result_label = open(outfile_label, 'a', encoding='utf-8')
        for file in files:
            text_dir = os.path.join(path, file)
            # Skip anything that is not a category folder (e.g. earlier output files)
            if not os.path.isdir(text_dir):
                continue
            texts = os.listdir(text_dir)
            for text in texts:
                txt_file_dir = os.path.join(text_dir, text)
                print(txt_file_dir)
                with open(txt_file_dir, 'r', encoding='utf-8') as f:
                    content = f.read()
                if len(content) > 3000:
                    content = content[0:3000]  # truncate overly long articles
                # One article per line: newline separates documents, since the
                # segmented words inside an article are already space-separated
                result_train.write(content.replace('\n', ' ') + '\n')
                result_label.write(label_map[file] + '\n')
        result_label.close()
        result_train.close()


    if __name__ == "__main__":
        path = r"../dataset_train"
        merge_file(path)
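
The classifier below also expects `../dataset_test/x_test.txt` and `../dataset_test/y_test.txt`, which the post never builds. One way is to parameterize the output paths of the same merge logic (a hypothetical helper, assuming the test articles sit under `../dataset_test/` in the same per-category layout):

    import os

    label_map = {'娱乐': '1', '汽车': '2', '游戏': '3', '科技': '4', '综合体育最新': '5', '财经': '6'}

    # Hypothetical variant of merge_file with the output files as parameters
    def merge_to(path, texts_out, labels_out):
        with open(texts_out, 'a', encoding='utf-8') as t_out, \
             open(labels_out, 'a', encoding='utf-8') as l_out:
            for cate in os.listdir(path):
                cate_dir = os.path.join(path, cate)
                if not os.path.isdir(cate_dir):
                    continue  # skip previously written output files
                for name in os.listdir(cate_dir):
                    with open(os.path.join(cate_dir, name), 'r', encoding='utf-8') as f:
                        content = f.read().replace('\n', ' ')[:3000]
                    t_out.write(content + '\n')  # one article per line
                    l_out.write(label_map[cate] + '\n')

    merge_to(r'../dataset_test', '../dataset_test/x_test.txt', '../dataset_test/y_test.txt')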

    *********************************************************************************
    # coding:utf-8
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.neighbors import KNeighborsClassifier

    # Load the data: one article per line, labels aligned line by line
    print('(1) load texts...')
    train_texts = open('../dataset_train/x_train.txt', encoding='utf-8').read().strip().split('\n')
    train_labels = open('../dataset_train/y_train.txt', encoding='utf-8').read().strip().split('\n')
    test_texts = open('../dataset_test/x_test.txt', encoding='utf-8').read().strip().split('\n')
    test_labels = open('../dataset_test/y_test.txt', encoding='utf-8').read().strip().split('\n')
    all_text = train_texts + test_texts

    # Feature extraction
    print('(2) doc to var...')

    # Build the vocabulary on the whole corpus, then reuse it so the
    # train and test matrices live in the same feature space
    count_v0 = CountVectorizer()
    counts_all = count_v0.fit_transform(all_text)
    count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_train = count_v1.fit_transform(train_texts)
    print("the shape of train is " + repr(counts_train.shape))
    count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_test = count_v2.fit_transform(test_texts)
    print("the shape of test is " + repr(counts_test.shape))

    # Re-weight the raw counts with TF-IDF
    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit(counts_train).transform(counts_train)
    test_data = tfidftransformer.fit(counts_test).transform(counts_test)

    x_train = train_data
    y_train = train_labels
    x_test = test_data
    y_test = test_labels

    # Fit a KNN model for each K from 1 to 14 and report its accuracy
    print('(3) KNN...')
    for x in range(1, 15):
        knnclf = KNeighborsClassifier(n_neighbors=x)
        knnclf.fit(x_train, y_train)
        preds = knnclf.predict(x_test)
        num = 0
        preds = preds.tolist()
        for i, pred in enumerate(preds):
            if int(pred) == int(y_test[i]):
                num += 1
        # The fraction of correct predictions is the accuracy
        print('K= ' + str(x) + ', accuracy: ' + str(float(num) / len(preds)))
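
The counting loop above computes accuracy (the share of correct predictions). The same sweep can be scored with sklearn directly (a sketch reusing the x_train/y_train/x_test/y_test variables prepared above):

    from sklearn.metrics import accuracy_score
    from sklearn.neighbors import KNeighborsClassifier

    # Same K sweep, scored with sklearn instead of a manual loop
    for k in range(1, 15):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train)
        print('K=', k, 'accuracy:', accuracy_score(y_test, knn.predict(x_test)))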









  • Original article: https://www.cnblogs.com/huangmouren233/p/14707015.html