zoukankan      html  css  js  c++  java
  • 机器学习预测药品溶解度

     python机器学习-乳腺癌细胞挖掘(博主亲自录制视频)

    https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

    随机森林

    # -*- coding: utf-8 -*-
    """
    Created on Tue Sep  4 09:39:29 2018
    
    @author: zhi.li04
    随机森林100棵树
    RF RMS 0.6057144333891424
    ('RF r^2 score', 0.9114913707148344)
    
    随机森林1000棵树
    ('RF RMS', 0.5891965582822096)
    ('RF r^2 score', 0.9116131032510899)
    """
    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem
    from rdkit.ML.Descriptors import MoleculeDescriptors
    from rdkit.Chem import Descriptors
    from rdkit.Chem.EState import Fingerprinter
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import StandardScaler
    from sklearn import cross_validation
    from sklearn.metrics import r2_score
    from sklearn.ensemble import RandomForestRegressor
    from sklearn import gaussian_process
    from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel, RBF
    
    
    # Descriptor calculator, built once at import time: the descriptor list is
    # fixed, so re-creating the calculator for every molecule is wasted work.
    _DESCRIPTOR_CALC = MoleculeDescriptors.MolecularDescriptorCalculator(
        [entry[0] for entry in Descriptors._descList])


    def get_fps(mol):
        """Return the feature vector used by the models for one molecule.

        The vector is the first element of the result of
        ``Fingerprinter.FingerprintMol(mol)`` followed by the value of every
        descriptor named in ``Descriptors._descList``, joined into a single
        1-D numpy array with ``np.append``.

        Parameters
        ----------
        mol : rdkit.Chem.Mol
            A parsed molecule, e.g. the result of ``Chem.MolFromSmiles``.

        Returns
        -------
        numpy.ndarray
            Fingerprint part first, descriptor values after it.

        Raises
        ------
        ValueError
            If ``mol`` is ``None``, which is what ``Chem.MolFromSmiles``
            returns for a SMILES string it cannot parse.
        """
        if mol is None:
            raise ValueError("get_fps() received None instead of an RDKit molecule "
                             "(was the SMILES string unparseable?)")
        ds = np.asarray(_DESCRIPTOR_CALC.CalcDescriptors(mol))
        arr = Fingerprinter.FingerprintMol(mol)[0]
        return np.append(arr, ds)
    
    # Random-forest solubility model.
    #
    # Reads 'smi_sol.dat', featurises every molecule with get_fps(), writes the
    # raw features/labels to Excel, trains a random forest on 70 % of the rows
    # and reports RMS error and r^2 on the held-out 30 %.

    # Load the data: a space-separated table that provides (at least) the
    # columns 'smiles' and 'solubility'. Export it unchanged for reference.
    data = pd.read_table('smi_sol.dat', sep=' ')
    data.to_excel("all_data.xlsx")

    # Add the parsed structure for every row.
    data['Mol'] = data['smiles'].apply(Chem.MolFromSmiles)

    # Chem.MolFromSmiles returns None for a string it cannot parse; such rows
    # cannot be featurised, so report and drop them instead of crashing later.
    unparsed = data['Mol'].isnull()
    if unparsed.any():
        print("Skipping unparseable SMILES:", list(data.loc[unparsed, 'smiles']))
        data = data[~unparsed].reset_index(drop=True)

    # Add the descriptor vector for every row.
    data['Descriptors'] = data['Mol'].apply(get_fps)
    # Show the first five rows (a bare data.head(5) displays nothing in a script).
    print(data.head(5))

    # Convert to numpy arrays: one feature row per molecule, one target each.
    X = np.array(list(data['Descriptors']))
    y = data['solubility'].values

    # Persist the raw (unscaled) features and labels; the Gaussian-process
    # script reads these two files back.
    df_x = pd.DataFrame(X)
    df_x.to_excel("data.xlsx")
    df_y = pd.DataFrame(y)
    df_y.to_excel("label.xlsx")

    # Split into training and test sets.
    # sklearn.cross_validation was removed in scikit-learn 0.20; the same
    # function lives in sklearn.model_selection (available since 0.18).
    # NOTE: the file-level `from sklearn import cross_validation` import must
    # also be removed to run on scikit-learn >= 0.20.
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)

    # Fit the scaler on the training rows only, so that no statistics of the
    # test rows leak into the model, then apply the same transform to both.
    st = StandardScaler()
    X_train = st.fit_transform(X_train)
    X_test = st.transform(X_test)

    # A Gaussian-process alternative is implemented in the separate GP script.

    # Random-forest model; the seed makes repeated runs reproducible.
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Evaluate on the held-out test set only: scoring rows the forest was
    # trained on would overstate its accuracy.
    y_pred_train = rf.predict(X_train)
    y_pred = rf.predict(X_test)
    rms = (np.mean((y_test - y_pred)**2))**0.5
    print ("RF RMS", rms)
    print ("RF r^2 score",r2_score(y_test,y_pred))

    # Measured vs. predicted solubility for both subsets.
    plt.scatter(y_train, y_pred_train, label = 'Train', c='blue')
    plt.title('RF Predictor')
    plt.xlabel('Measured Solubility')
    plt.ylabel('Predicted Solubility')
    plt.scatter(y_test, y_pred, c='lightgreen', label='Test', alpha = 0.8)
    plt.legend(loc=4)
    plt.savefig('RF Predictor.png', dpi=600)
    plt.show()

    # Save measured and predicted values of the test rows side by side.
    df_validation=pd.DataFrame({"test":y_test,"predict":y_pred})
    df_validation.to_excel("validation.xlsx")
    

      

    高斯模型

    # -*- coding: utf-8 -*-
    """
    Created on Tue Sep  4 15:53:57 2018
    
    @author: zhi.li04
    
    默认参数
    GP RMS label    2.98575
    GP r^2 score -1.26973055888
    
    核参数改为kernel=1.0 * RBF(length_scale=1) + WhiteKernel(noise_level=1)
    RMS    0.651688
    dtype: float64
    GP r^2 score 0.891869923330719
    
    核参数修改,且对目标值 y 标准化(normalize_y=True)后
    GP RMS label    0.597042
    GP r^2 score 0.9092436176966117
    """
    
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import StandardScaler
    from sklearn import cross_validation
    from sklearn.metrics import r2_score
    from sklearn.ensemble import RandomForestRegressor
    from sklearn import gaussian_process
    from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel, RBF
    
    # Gaussian-process solubility model.
    #
    # Reads the feature table 'data.xlsx' and the labels 'label.xlsx', trains
    # a Gaussian-process regressor on 70 % of the rows and reports RMS error
    # and r^2 on the held-out 30 %.

    # Load the data.
    data = pd.read_excel('data.xlsx')
    labels = pd.read_excel('label.xlsx')

    # DataFrame.to_excel() writes the row index as a header-less first column
    # by default, which read_excel() brings back as a column named
    # 'Unnamed: 0'. Drop such columns so the row number is not used as a
    # feature, and take the last label column as a 1-D target vector so the
    # metrics below are plain floats rather than pandas Series.
    data = data.loc[:, ~data.columns.astype(str).str.startswith('Unnamed')]
    y = labels.iloc[:, -1].values

    # Split into training and test sets.
    # sklearn.cross_validation was removed in scikit-learn 0.20; the same
    # function lives in sklearn.model_selection (available since 0.18).
    # NOTE: the file-level `from sklearn import cross_validation` import must
    # also be removed to run on scikit-learn >= 0.20.
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        data.values, y, test_size=0.3, random_state=42)

    # Fit the scaler on the training rows only, so that no statistics of the
    # test rows leak into the model, then apply the same transform to both.
    st = StandardScaler()
    X_train = st.fit_transform(X_train)
    X_test = st.transform(X_test)

    # NOTE (original author): running this froze the work computer.
    # Alternatives tried (results are listed in the module docstring): the
    # default GaussianProcessRegressor(), and kernel=kernel without
    # normalize_y.
    kernel=1.0 * RBF(length_scale=1) + WhiteKernel(noise_level=1)
    gp = gaussian_process.GaussianProcessRegressor(kernel=kernel,n_restarts_optimizer=0,normalize_y=True)
    gp.fit(X_train, y_train)

    # Predict each subset once and reuse the result for metrics and plot;
    # the predictive standard deviation was never used, so it is not requested.
    y_pred_train = gp.predict(X_train)
    y_pred = gp.predict(X_test)
    rms = (np.mean((y_test - y_pred)**2))**0.5
    print ("GP RMS", rms)
    print ("GP r^2 score",r2_score(y_test,y_pred))

    # Measured vs. predicted solubility for both subsets.
    plt.scatter(y_train, y_pred_train, label = 'Train', c='blue')
    plt.title('GP Predictor')
    plt.xlabel('Measured Solubility')
    plt.ylabel('Predicted Solubility')
    plt.scatter(y_test, y_pred, c='lightgreen', label='Test', alpha = 0.8)
    plt.legend(loc=4)
    plt.savefig('GP Predictor.png', dpi=300)
    plt.show()
    

     https://study.163.com/provider/400000000398149/index.htm?share=2&shareId=400000000398149( 欢迎关注博主主页,学习python视频资源,还有大量免费python经典文章)


    项目合作QQ:231469242

      

  • 相关阅读:
    .NET定时任务执行管理器开源组件–FluentScheduler
    ASP.NET MVC 使用 FluentScheduler 定时器计划任务
    Windows 2008 IIS7.5中创建独立账号的FTP站点图文教程
    除非Microsoft FTP 服务(FTPSVC)正在运行,否则无法启动FTP站点。服务目前已停止
    Web API--自定义异常结果的处理
    C#怎么遍历一个对象里面的全部属性?
    WebApi 接口参数不再困惑:传参详解
    C#进阶系列——WebApi 接口测试工具:WebApiTestClient
    WebApi安全性 使用TOKEN+签名验证
    ASP.NET(C#) Web Api通过文件流下载文件到本地实例
  • 原文地址:https://www.cnblogs.com/webRobot/p/9590494.html
Copyright © 2011-2022 走看看