zoukankan      html  css  js  c++  java
  • liblinear使用总结

    liblinear是libsvm的线性核的改进版本,专门适用于百万数据量的分类。正好适用于我这次数据挖掘的实验。

    liblinear的用法和libsvm很相似。我使用的是编译好的.exe文件,利用python的subprocess向控制台发送命令即可完成本次实验。

    其中核心两句即

    train train.txt

    predict test.txt train.txt.model output.txt

    由于是线性核,不存在核参数g;惩罚参数c也没有单独设置,使用的是默认值

    对于50W篇文章模型训练仅需340秒,50W篇文章的预测仅需6秒

      1 from subprocess import *
      2 import time
      3 
      # Rebind ``time`` from the module to its ``time()`` function. Every timing
      # call further down the script is written as ``time()``, so the alias stays.
      time = time.time


      def _run_command(command, capture_stdout=False):
          """Run *command* through the shell and raise if it exits non-zero.

          Args:
              command: Command line handed to the shell as a single string.
              capture_stdout: When true, the child's stdout is piped and
                  discarded instead of being shown on the console.

          Raises:
              CalledProcessError: The command finished with a non-zero status.
          """
          process = Popen(command, shell=True, stdout=PIPE if capture_stdout else None)
          process.communicate()
          if process.returncode != 0:
              # Stop here: carrying on would evaluate a missing or stale
              # model / prediction file and report meaningless numbers.
              raise CalledProcessError(process.returncode, command)


      # Training step; the trainer's console output is captured and dropped.
      start_time = time()
      print("训练")
      cmd = "train train.txt"
      _run_command(cmd, capture_stdout=True)
      print("训练结束",str(time() - start_time))

      # Prediction step: the command line names test.txt as input,
      # train.txt.model as the model and output.txt as the result file.
      start_time = time()
      print("预测")
      cmd = "predict test.txt train.txt.model output.txt"
      _run_command(cmd)
      print("预测结束",str(time() - start_time))
     18 
     19 
     # Evaluation, part 1: read the true labels of the test set.
     start_time = time()
     print("统计")
     test_filename = "test.txt"
     real_class = []
     with open(test_filename, "r", encoding="utf-8") as f:
         for line in f:
             fields = line.split(None, 1)
             if fields:
                 # Keep the whole first token as the label (not only its first
                 # character) and skip blank lines.
                 real_class.append(fields[0])

     # Total number of test samples.
     total_sample = len(real_class)

     # Read the predicted labels: whitespace-separated tokens of output.txt.
     predict_filename = "output.txt"
     with open(predict_filename, "r", encoding="utf-8") as f_predict:
         predict_class = f_predict.read().split()

     # Count the samples whose predicted label equals the true label.
     T = sum(
         1
         for real, predict in zip(real_class, predict_class)
         if int(real) == int(predict)
     )
     # An empty test file yields 0% instead of a ZeroDivisionError.
     accuracy = T / total_sample * 100 if total_sample else 0.0
     print("正确率 为", str(accuracy) + "%")
     46 
     47 
     # Evaluation, part 2: confusion matrix and per-class precision / recall / F.

     # Numeric label -> category name.
     num_to_cate = {0:"it",1:"体育",2:"军事",3:"金融",4:"健康",5:"汽车",6:"房产",7:"文化",8:"教育",9:"娱乐"}

     class_label = ["it","体育","军事","金融","健康","汽车","房产","文化","教育","娱乐"]

     # Confusion matrix: mat[true category][predicted category] -> sample count.
     # Every row gets its own dict so that rows never share state.
     mat = {actual: dict.fromkeys(class_label, 0) for actual in class_label}

     # Raw per-class counts. They start at zero so the ratios below are exact;
     # empty denominators are handled explicitly instead of seeding with 1.0.
     predicted_total = dict.fromkeys(class_label, 0)  # samples predicted as the class
     actual_total = dict.fromkeys(class_label, 0)     # samples truly in the class
     predict_true = dict.fromkeys(class_label, 0)     # correctly predicted samples

     for real, predict in zip(real_class, predict_class):
         actual_name = num_to_cate[int(real)]
         predicted_name = num_to_cate[int(predict)]
         mat[actual_name][predicted_name] += 1
         predicted_total[predicted_name] += 1
         actual_total[actual_name] += 1
         if actual_name == predicted_name:
             predict_true[predicted_name] += 1

     # Print the confusion matrix, one true category per line.
     for k, v in mat.items():
         print(k + ":" + str(v))

     # Precision, recall and F-measure per category. A category with no
     # predictions (or no samples) scores 0.0 rather than dividing by zero.
     predict_precision = {}
     predict_recall = {}
     predict_F = {}
     for name in class_label:
         hits = predict_true[name]
         precision = hits / predicted_total[name] if predicted_total[name] else 0.0
         recall = hits / actual_total[name] if actual_total[name] else 0.0
         predict_precision[name] = precision
         predict_recall[name] = recall
         if precision + recall:
             predict_F[name] = 2 * recall * precision / (precision + recall)
         else:
             predict_F[name] = 0.0

     print("统计结束",str(time() - start_time))
     print("精确率为",str(predict_precision))
     print("召回率为",str(predict_recall))
     print("F测度为",str(predict_F))
    107 
    # Write the confusion matrix and the summary metrics to a text report.
    print("保存结果")
    final_result_filename = "./finalresult.txt"
    # The context manager closes the file, so the report is flushed to disk
    # even if one of the writes raises.
    with open(final_result_filename, "w", encoding="utf-8") as f:
        # One line per true category, followed by a blank separator line.
        for k, v in mat.items():
            f.write(k + ":" + str(v) + "\n")

        f.write("\n")
        f.write("正确率为" + str(accuracy) + "%" + "\n\n")
        f.write("精确率为" + str(predict_precision) + "\n\n")
        f.write("召回率为" + str(predict_recall) + "\n\n")
        f.write("F测度为" + str(predict_F) + "\n\n")
    print("保存结果结束")


    # Alternative label mappings left commented out by the author:
    # cate_to_num = {"it":0,"体育":1,"军事":2,"华人":3,"国内":4,"国际":5,"房产":6,"文娱":7,"社会":8,"财经":9}
    # num_to_cate = {0:"it",1:"体育",2:"军事",3:"华人",4:"国内",5:"国际",6:"房产",7:"文娱",8:"社会",9:"财经"}
  • 相关阅读:
    [Android教程]通过Intent分享数据内容给其他应用程序
    【Android您问我讲】Android 2.x中使用actionbar Actionbarsherlock的使用
    PHP按比例生成縮略圖片
    PHP實現任務計畫
    javascript下漢字和Unicode編碼互轉代碼
    js存/讀取cookie函數
    php Captcha 練習
    PHP概率抽獎
    讓iframe自適應高度
    簡單的 PHP 將sql文件導入數據庫程序
  • 原文地址:https://www.cnblogs.com/anqiang1995/p/7955672.html
Copyright © 2011-2022 走看看