zoukankan      html  css  js  c++  java
  • 逻辑斯蒂回归

    import math
    # Shared module-level state, filled in and read by the functions below.
    feature = []       # one list of numeric attribute values per record
    result = []        # class label per record: 1 for 'won', 0 otherwise
    theta = []         # model parameters; theta[0] is used as the intercept
    tempfeature = []   # NOTE(review): not read anywhere in the visible code
    test_result = []   # predicted labels appended by getTestResult()
    def getDataFromFile():
        """Load the records stored in 'kr-vs-kp.data.txt' into the global lists.

        Every non-blank line is a comma-separated record whose last field is the
        class label.  Each attribute (a single character) is converted to its
        code point with ord() and the resulting list is appended to the global
        ``feature``; the label is appended to the global ``result`` as 1 for
        'won' and 0 for anything else.

        Blank lines are skipped instead of being loaded and popped afterwards,
        so ``feature`` and ``result`` always have the same length and no real
        record is dropped.  The label is compared after stripping surrounding
        whitespace, so a last line without a trailing newline is labelled
        correctly.
        """
        with open('kr-vs-kp.data.txt') as fileData:
            for each in fileData:
                line = each.strip()
                if not line:
                    continue  # ignore empty lines, e.g. at the end of the file
                cur = line.split(',')
                # transform chars to ASCII
                feature.append([ord(eachItem) for eachItem in cur[:-1]])
                result.append(1 if cur[-1] == 'won' else 0)
    #print(feature[0])
    #print(result) #read data set from file
    def feature_scale(feature_list):
        """Mean-normalise every record of ``feature_list`` in place.

        Each inner list is replaced by a new list in which the record's own mean
        is subtracted from every value and the difference is divided by
        ``max - min + 1`` of that record (the ``+ 1`` keeps the divisor non-zero
        when all values are equal).  The statistics are taken over the values of
        one record, not over one attribute across records.

        Returns None; an empty inner list raises ZeroDivisionError.
        """
        for position, record in enumerate(feature_list):
            mean = sum(record) / len(record)
            spread = max(record) - min(record) + 1
            feature_list[position] = [(value - mean) / spread for value in record]
            
    def inittheta():
        """Append 37 parameters, each set to 1.0, to the global ``theta`` list."""
        theta.extend([1.] * 37)
                                 
    def hypothesisOfLogisticRegression(theta_list,feature_record):
        """Return the logistic (sigmoid) hypothesis for a single record.

        theta_list[0] is the intercept and theta_list[k + 1] is the weight
        applied to feature_record[k].  Weights and values are paired until the
        shorter sequence runs out, so the record does not have to contain
        exactly 36 attributes.

        Returns a float between 0.0 and 1.0.  If the linear score is so negative
        that math.exp() would overflow, 0.0 is returned instead of letting
        OverflowError escape.
        """
        score = .0
        for weight, value in zip(theta_list[1:], feature_record):
            score += weight * value
        score += theta_list[0]  # intercept is added after the weighted sum
        try:
            return 1/(1+math.exp(-score))
        except OverflowError:
            # exp(-score) exceeds the float range, so the sigmoid is effectively 0.
            return 0.0
    #print(hypothesisOfLogisticRegression(theta, feature[0]))
    #print(sum(feature[0]))
    def calculteCostFunction(thetaList,featureList,resultList):
        """Return the average logistic-regression cost over all records.

        For each (record, label) pair the term
        ``label*log10(h) + (1-label)*log10(1-h)`` is accumulated, where ``h`` is
        hypothesisOfLogisticRegression(thetaList, record); the negated sum is
        divided by the number of pairs.  The base-10 logarithm of the original
        code is kept, so values are a constant factor smaller than the usual
        natural-log cost.

        Records and labels are paired with zip(), so the shorter list decides
        how many pairs are used.  Returns 0.0 when there are no pairs.
        """
        # Floor for the log arguments: a saturated hypothesis (exactly 0.0 or 1.0)
        # would otherwise make math.log10 raise ValueError.
        tiny = 1e-300
        Costvalue = .0  # initialised once, so every record contributes to the sum
        count = 0
        for record, label in zip(featureList, resultList):
            h = hypothesisOfLogisticRegression(thetaList, record)
            Costvalue += label*math.log10(max(h, tiny))+(1-label)*math.log10(max(1-h, tiny))
            count += 1
        if count == 0:
            return .0
        return -Costvalue/count


    def SGD():
        """Fit the global ``theta`` by stochastic gradient descent.

        Records of the global ``feature``/``result`` lists are visited in order.
        For each one the prediction error is evaluated once with the current
        parameters and then every parameter is moved by 0.01 times its gradient
        for that single record, so all parameters are updated from the same
        error value.  The cost over the whole data set is printed before and
        after each step.

        Training stops, and "Find the optimal theta" is printed once, as soon as
        a step fails to lower the cost by more than 1e-10 (this includes the
        cost going up).  Returns None; does nothing when there are no records.
        """
        record_count = min(len(feature), len(result))
        if record_count == 0:
            return  # nothing to train on
        weight_count = len(theta) - 1
        Cost = calculteCostFunction(theta, feature, result)
        while True:
            for eachRecord in range(record_count):
                print('the value of cost function is:',Cost)
                record = feature[eachRecord]
                # Computed before any parameter changes so the update is simultaneous.
                error = hypothesisOfLogisticRegression(theta, record)-result[eachRecord]
                for j, value in enumerate(record[:weight_count], 1):
                    theta[j] = theta[j]-0.01*error*value
                theta[0] = theta[0]-0.01*error
                tempCost = calculteCostFunction(theta, feature, result)
                print('new cost is ',tempCost)
                if not (Cost-tempCost > 0.0000000001):
                    print("Find the optimal theta")
                    return
                # The cost after this step is the cost before the next one.
                Cost = tempCost
                        
    def GD():
        """Fit the global ``theta`` by batch gradient descent.

        Each iteration evaluates the prediction error of every record in the
        global ``feature``/``result`` lists with the current parameters, sums
        the per-parameter gradients from scratch, and only then moves every
        parameter by 0.01 times its summed gradient.  The step uses the sum over
        records, not the average.

        The cost is printed before and after every iteration.  The loop ends
        when an iteration fails to lower the cost by more than 1e-5; the final
        parameters and cost are then printed.  Returns None.
        """
        weight_count = len(theta) - 1
        while True:
            Cost2 = calculteCostFunction(theta, feature, result)
            print('Cost is',Cost2)
            # Fresh accumulators every iteration; nothing carries over from
            # earlier iterations.
            gradient = [.0] * weight_count
            bias_gradient = .0
            for record, label in zip(feature, result):
                error = hypothesisOfLogisticRegression(theta, record)-label
                bias_gradient += error
                for j, value in enumerate(record[:weight_count]):
                    gradient[j] += error*value
            # All gradients are known before any parameter is changed.
            for j, slope in enumerate(gradient, 1):
                theta[j] -= 0.01*slope
            theta[0] -= 0.01*bias_gradient
            temp2Cost = calculteCostFunction(theta, feature, result)
            print('new cost is ',temp2Cost)
            if not (Cost2-temp2Cost > 0.00001):
                break
        print('Find optimal theta ',theta)
        print('cost is',temp2Cost)
        
    def getTestResult():
        """Classify every record of the global ``feature`` list.

        For each record the hypothesis is evaluated with the global ``theta``;
        1 is appended to the global ``test_result`` when it is at least 0.5,
        otherwise 0 is appended.
        """
        for record in feature:
            probability = hypothesisOfLogisticRegression(theta, record)
            test_result.append(1 if probability >= 0.5 else 0)

                      

    def TestLRClassifier():
        """Print the proportions of correct and wrong predictions.

        The global ``test_result`` is compared with the global ``result``
        position by position.  Only as many positions as all of ``feature``,
        ``test_result`` and ``result`` provide are compared, and that count is
        the divisor of both proportions.  When there is nothing to compare a
        short notice is printed instead of dividing by zero.
        """
        total = min(len(feature), len(test_result), len(result))
        if total == 0:
            print('no predictions to evaluate')
            return
        correct_count = sum(
            1
            for predicted, actual in zip(test_result[:total], result[:total])
            if predicted == actual
        )
        wrong_count = total - correct_count
        print('the correct proportion is ',correct_count/total)
        print('the wrong proportion is ',wrong_count/total)
        
    # Script entry: load the data, initialise the parameters, train and evaluate.
    # The order matters because every step works on the shared global lists.
    getDataFromFile()
    inittheta()
    #print(theta)
    # Normalise the loaded records in place before training.
    feature_scale(feature)
    # The return values of the next two calls are discarded.
    # NOTE(review): they look like leftover smoke tests - confirm before removing.
    hypothesisOfLogisticRegression(theta, feature[0])
    calculteCostFunction(theta, feature, result)
    # Train with batch gradient descent (SGD() is defined but not called here).
    GD()
    #print('finally',theta)
    # Predict on the same records that were used for training, then report
    # the proportions of correct and wrong predictions.
    getTestResult()
    TestLRClassifier()
            

  • 相关阅读:
    Nginx有哪些作用?
    MYSQL如何优化?
    jdk1.8新特性
    [javase基础] JDK JRE JVM的区别?
    JDBC中如何进行事务处理?
    JDBC、ibatis(mybatis)、Hibernate有什么不同?
    java面试题最容易犯错
    Spring高频率面试题
    python pip whl安装和使用
    深入理解 Linux的进程,线程,PID,LWP,TID,TGID
  • 原文地址:https://www.cnblogs.com/lz3018/p/4579773.html
Copyright © 2011-2022 走看看