zoukankan      html  css  js  c++  java
  • ###《Machine Learning in Action》

    初学Python;理解机器学习。
    算法是需要实现的,纸上得来终觉浅。

    // @author:       gr
    // @date:         2015-01-16
    // @email:        forgerui@gmail.com
    

    一、简单的KNN

    from numpy import *
    import operator
    
    def createDataSet():
        """Build the four-point toy data set used to try out the classifier.

        Returns:
            A ``(group, labels)`` pair: ``group`` is a 4x2 array with one
            sample per row, and ``labels`` is the list of class names for
            those rows, in the same order.
        """
        samples = [
            [1.0, 1.1],
            [1.0, 1.0],
            [0, 0],
            [0, 0.1],
        ]
        # Two samples near (1, 1) are class 'A', two near (0, 0) are class 'B'.
        classNames = ['A'] * 2 + ['B'] * 2
        return array(samples), classNames
    
    def classify0(inX, dataSet, labels, k):
        """Classify ``inX`` by majority vote among its ``k`` nearest samples.

        The distance between ``inX`` and every row of ``dataSet`` is the
        Euclidean distance.

        Args:
            inX: Feature vector to classify; it must broadcast against the
                rows of ``dataSet``.
            dataSet: 2-D array-like holding one training sample per row.
            labels: Sequence where ``labels[i]`` is the class of row ``i``.
            k: Number of nearest samples that take part in the vote.

        Returns:
            The label that received the most votes.

        Raises:
            IndexError: If ``k`` is smaller than 1 (no votes are cast) or
                larger than the number of samples.
        """
        # Distance from the input vector to every training sample.
        # Broadcasting subtracts each row from inX, so no tiled copy of inX
        # has to be built.
        diffMat = asarray(inX) - asarray(dataSet)
        sqDistances = (diffMat ** 2).sum(axis=1)
        distances = sqDistances ** 0.5

        # Row indices ordered by increasing distance.
        sortedDistIndicies = distances.argsort()

        # Count the labels of the k closest samples.
        classCount = {}
        for i in range(k):
            voteIlabel = labels[sortedDistIndicies[i]]
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

        # Order the labels by decreasing vote count. items() is used instead
        # of iteritems() so the function runs on both Python 2 and Python 3.
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1), reverse=True)

        # The most frequent label is the prediction for the input vector.
        return sortedClassCount[0][0]
    

    二、KNN用于约会网站配对效果

    def file2matrix(filename):
        """Load a tab-separated sample file into a feature matrix and labels.

        Each non-blank line must hold at least three numeric feature columns
        followed by an integer class label in the last column.

        Args:
            filename: Path of the text file to read.

        Returns:
            A ``(returnMat, classLabelVector)`` pair: ``returnMat`` is an
            ``(n, 3)`` float array with the first three columns of every
            line, and ``classLabelVector`` is the list of ``n`` integer
            labels taken from the last column.

        Raises:
            ValueError: If a feature or label cannot be parsed as a number.
        """
        # The with-block closes the file even if parsing fails later on.
        with open(filename) as fr:
            # Strip the line break and split on tabs; blank lines (such as a
            # trailing empty line) carry no sample and are skipped.
            rows = [line.strip().split('\t') for line in fr if line.strip()]

        returnMat = zeros((len(rows), 3))
        classLabelVector = []
        for index, listFromLine in enumerate(rows):
            returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
            # The last column stores the class label.
            classLabelVector.append(int(listFromLine[-1]))
        return returnMat, classLabelVector
        
    def autoNorm(dataSet):
        """Rescale every column of ``dataSet`` to the range [0, 1].

        Each value becomes ``(value - column_min) / (column_max - column_min)``.
        A column whose values are all equal has a zero range; its normalized
        values are set to 0 instead of dividing by zero.

        Args:
            dataSet: 2-D array-like with one sample per row.

        Returns:
            A ``(normDataSet, ranges, minVals)`` triple: the normalized float
            array, the per-column ``max - min`` (zero for constant columns)
            and the per-column minimum.
        """
        # Work in floating point so integer input is not floor-divided.
        data = asarray(dataSet, dtype=float)
        minVals = data.min(0)
        maxVals = data.max(0)
        ranges = maxVals - minVals
        # Divide constant columns by 1 instead of 0; their numerator is
        # already 0, so the result is 0 rather than NaN.
        safeRanges = where(ranges == 0, 1.0, ranges)
        # Broadcasting applies the per-column values to every row.
        normDataSet = (data - minVals) / safeRanges
        return normDataSet, ranges, minVals
    
    def datingClassTest():
        """Measure the classifier's error rate on 'datingTestSet2.txt'.

        The file is loaded and normalized; the first 10% of the rows are
        classified with ``classify0`` (k = 7) using the remaining rows as
        training samples. Every prediction is printed, followed by the
        number of test vectors and the overall error rate.

        Raises:
            ZeroDivisionError: If the file has too few rows for the 10%
                hold-out to contain at least one sample.
        """
        hoRatio = 0.10
        datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
        # Only the normalized matrix is needed here.
        normMat = autoNorm(datingDataMat)[0]
        m = normMat.shape[0]
        # Number of rows held out for testing.
        numTestVecs = int(m * hoRatio)
        errorCount = 0.0
        for i in range(numTestVecs):
            classifierResult = classify0(normMat[i, :],
                                         normMat[numTestVecs:m, :],
                                         datingLabels[numTestVecs:m], 7)
            # print(<single expression>) behaves the same on Python 2 and 3.
            print("the classifier came back with: %d, the real answer is: %d"
                  % (classifierResult, datingLabels[i]))
            # Count the misclassified samples.
            if classifierResult != datingLabels[i]:
                errorCount += 1.0
        print("numTestVecs: %f" % float(numTestVecs))
        print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    
    def classifyPerson():
        """Ask for one person's three features and print the predicted class.

        The answers are read from standard input, normalized with the minimum
        and range of the samples in 'datingTestSet2.txt', and classified with
        ``classify0`` (k = 3). The predicted label selects the printed
        description via ``resultList[label - 1]``.

        Raises:
            ValueError: If an answer cannot be parsed as a float.
        """
        # NOTE(review): assumes the labels in the data file are 1, 2 or 3,
        # matching the three descriptions below - confirm against the data.
        resultList = ['not at all', 'in small doses', 'in large doses']

        # raw_input only exists on Python 2; on Python 3 input() does the
        # same job.
        try:
            readLine = raw_input
        except NameError:
            readLine = input

        percentTats = float(readLine(
                "percentage of time spent playing video games?"))
        ffMiles = float(readLine("frequent flier miles earned per year?"))
        iceCream = float(readLine("liters of ice cream consumed per year?"))
        datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
        normMat, ranges, minVals = autoNorm(datingDataMat)
        inArr = array([ffMiles, percentTats, iceCream])
        # Normalize the new sample exactly like the training samples.
        classifierResult = classify0((inArr - minVals) / ranges,
                                     normMat, datingLabels, 3)
        # A single-expression print works on both Python 2 and Python 3.
        print("You will probably like this person: "
              + resultList[classifierResult - 1])
    

    三、手写识别系统

    def img2vector(filename):
        """Flatten a 32x32 text image of digits into a 1x1024 row vector.

        The first 32 characters of each of the first 32 lines are read; the
        character at row ``i``, column ``j`` is stored at index ``32*i + j``.

        Args:
            filename: Path of the text file holding the image.

        Returns:
            A ``(1, 1024)`` float array with the digit values.

        Raises:
            IndexError: If one of the first 32 lines has fewer than 32
                characters (or the file has fewer than 32 lines).
            ValueError: If a character is not a decimal digit.
        """
        returnVect = zeros((1, 1024))
        # The with-block guarantees the file is closed once it has been read.
        with open(filename) as fr:
            for i in range(32):
                lineStr = fr.readline()
                returnVect[0, 32 * i:32 * (i + 1)] = [int(lineStr[j])
                                                      for j in range(32)]
        return returnVect
    
    def handwritingClassTest():
        """Measure the digit classifier's error rate on the test images.

        Every file in the 'trainingDigits' directory is converted with
        ``img2vector`` into one row of the training matrix; its label is the
        integer before the first underscore of the file name. Every file in
        'testDigits' is then classified with ``classify0`` (k = 3) and
        compared with the label taken from its name. Each prediction, the
        number of errors and the error rate are printed.

        Raises:
            ZeroDivisionError: If the 'testDigits' directory is empty.
        """
        # Imported locally: the file's import section does not provide it.
        from os import listdir

        hwLabels = []
        trainingFileList = listdir('trainingDigits')
        m = len(trainingFileList)
        trainingMat = zeros((m, 1024))
        # Turn the training image files into an m x 1024 matrix.
        for i, fileNameStr in enumerate(trainingFileList):
            fileStr = fileNameStr.split('.')[0]
            classNumStr = int(fileStr.split('_')[0])
            hwLabels.append(classNumStr)
            trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

        testFileList = listdir('testDigits')
        errorCount = 0.0
        # Classify every image of the test set.
        mTest = len(testFileList)
        for fileNameStr in testFileList:
            fileStr = fileNameStr.split('.')[0]
            classNumStr = int(fileStr.split('_')[0])
            vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
            classifierResult = classify0(vectorUnderTest,
                                         trainingMat, hwLabels, 3)
            # print(<single expression>) behaves the same on Python 2 and 3.
            print("the classifier came back with: %d, the real answer is: %d"
                  % (classifierResult, classNumStr))
            if classifierResult != classNumStr:
                errorCount += 1.0
        print("\n the total number of errors is: %d" % errorCount)
        print("\n the total error rate is: %f" % (errorCount / float(mTest)))
  • 相关阅读:
    新一篇: 正则表达式使用详解
    C#預處理指令
    [转]SQL Server 2005 Beta 2 TransactSQL 增强功能
    SQL Server 2005之PIVOT/UNPIVOT行列转换
    爬虫入门到放弃系列01:什么是爬虫
    我的程序员之路01:自学Java篇
    Java入门者:如何写出美观的Java代码?
    JedisCluster使用pipeline操作Redis Cluster最详细从0到1实现过程
    IDEA超神之路:安装、运行HelloWorld以及激活到2099年的第一场雪
    软考系统架构师、信息系统项目管理师、系统分析师、系统规划与管理师和网络规划师资料大汇总
  • 原文地址:https://www.cnblogs.com/gr-nick/p/4238655.html
Copyright © 2011-2022 走看看