zoukankan      html  css  js  c++  java
  • 神经网络2-预测药物靶点

     python机器学习-乳腺癌细胞挖掘(博主亲自录制视频)

    https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

    结论:神经网络算法有过度拟合,尝试其他算法,x_test和y_test报错

    # -*- coding: utf-8 -*-
    """
    Created on Wed Sep  5 11:23:58 2018
    
    @author: zhi.li04
    数据源与说明文档
    https://www.wildcardconsulting.dk/useful-information/a-deep-tox21-neural-network-with-rdkit-and-keras/
    """
    import pandas as pd
    import numpy as np
     
    #RDkit for fingerprinting and cheminformatics
    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem, rdMolDescriptors
     
    #MolVS for standardization and normalization of molecules
    import molvs as mv
    
    
    
    #Function to get parent of a smiles
    def parent(smiles):
        """Return the SMILES of the MolVS charge parent of *smiles*.

        The input is parsed with ``Chem.MolFromSmiles``, passed through
        ``Standardizer.charge_parent`` and written back out with
        ``Chem.MolToSmiles``.

        Parameters:
            smiles: SMILES string of the molecule to standardize.

        Returns:
            The SMILES string of the charge parent, or the sentinel string
            ``"NaN"`` when any step of the conversion raises.
        """
        st = mv.Standardizer()  # MolVS standardizer
        try:
            mols = st.charge_parent(Chem.MolFromSmiles(smiles))
            return Chem.MolToSmiles(mols)
        except Exception:
            # ``except Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.
            # The parenthesised single-argument form prints the same text
            # under both Python 2 and Python 3.
            print("%s failed conversion" % smiles)
            return "NaN"
     
    #Clean and standardize the data
    def clean_data(data):
        """Return a cleaned copy of *data* with standardized parent SMILES.

        Steps, in order:
          1. drop rows whose ``smiles`` value is null;
          2. add ``smiles_parent`` by applying ``parent`` to ``smiles`` and
             drop rows where that returned the ``"NaN"`` sentinel;
          3. add ``NumAtoms`` (atom count of the parent as parsed by RDKit)
             and keep only rows with more than 3 atoms.

        Parameters:
            data: DataFrame with a ``smiles`` column.

        Returns:
            A new DataFrame; the frame passed in is not modified.
        """
        # Remove missing smiles. ``.copy()`` makes the filtered result an
        # independent frame, so the column assignments below do not write
        # into a slice of another frame (pandas SettingWithCopyWarning).
        data = data[data['smiles'].notnull()].copy()

        # Standardize and get parent with molvs
        data["smiles_parent"] = data['smiles'].apply(parent)
        data = data[data['smiles_parent'] != "NaN"].copy()

        # Filter small fragments away
        def NumAtoms(smile):
            return Chem.MolFromSmiles(smile).GetNumAtoms()

        data["NumAtoms"] = data["smiles_parent"].apply(NumAtoms)
        # Return a copy as well: callers add further columns to the result.
        return data[data["NumAtoms"] > 3].copy()
     
    
    
    #Read the data
    # ``DataFrame.from_csv`` was deprecated in pandas 0.21 and removed in 1.0.
    # ``read_csv`` with ``index_col=0, parse_dates=True`` reproduces its defaults
    # (first column as index, dates in the index parsed).
    data = pd.read_csv('tox21_10k_data_all_pandas.csv', index_col=0, parse_dates=True)
    valdata = pd.read_csv('tox21_10k_challenge_test_pandas.csv', index_col=0, parse_dates=True)
    testdata = pd.read_csv('tox21_10k_challenge_score_pandas.csv', index_col=0, parse_dates=True)

    # Standardize the molecules and drop unusable rows in all three sets
    data = clean_data(data)
    valdata = clean_data(valdata)
    testdata = clean_data(testdata)
    
    
    #Calculate Fingerprints
    def morgan_fp(smiles, radius=3, n_bits=8192):
        """Return the Morgan fingerprint of *smiles* as an ``int8`` NumPy array.

        Parameters:
            smiles: SMILES string of the molecule.
            radius: radius passed to ``GetMorganFingerprintAsBitVect``
                (default 3, the value this script originally hard-coded).
            n_bits: ``nBits`` passed to ``GetMorganFingerprintAsBitVect``
                (default 8192, the value this script originally hard-coded).

        Returns:
            1-D ``int8`` array with one element per character of the
            fingerprint's bit string.

        Raises:
            ValueError: if RDKit cannot parse *smiles*.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Fail with a readable message instead of an opaque RDKit
            # argument error from the fingerprint call below.
            raise ValueError("could not parse SMILES: %r" % (smiles,))
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        # The bit string is split into single characters, then cast to int8.
        npfp = np.array(list(fp.ToBitString())).astype('int8')
        return npfp
     
    # Name of the column that holds each molecule's fingerprint array
    fp = "morgan"
    # Add the fingerprint column to the training, validation and test frames
    for frame in (data, valdata, testdata):
        frame[fp] = frame["smiles_parent"].apply(morgan_fp)
    
    
    #Choose property to model
    prop = 'SR-MMP'

    # Keep only the rows that have a non-null value for the chosen property
    train_rows = data[data[prop].notnull()]
    val_rows = valdata[valdata[prop].notnull()]
    test_rows = testdata[testdata[prop].notnull()]

    # Feature arrays: the per-row fingerprint arrays gathered into one array
    X_train = np.array(train_rows[fp].tolist())
    X_val = np.array(val_rows[fp].tolist())
    X_test = np.array(test_rows[fp].tolist())

    # Target arrays: property values reshaped to a single column
    y_train = train_rows[prop].values.reshape(-1, 1)
    y_val = val_rows[prop].values.reshape(-1, 1)
    y_test = test_rows[prop].values.reshape(-1, 1)
    
    #Set network hyper parameters
    # The Keras names used below were never imported in this script, so it
    # failed with NameError at ``Sequential()``.
    # NOTE(review): ``WeightRegularizer``, ``output_dim``, ``W_regularizer`` and
    # ``nb_epoch`` are Keras 1.x spellings; this section presumably needs a
    # Keras 1.x install (or a port to the Keras 2 names) -- confirm.
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.regularizers import WeightRegularizer
    from keras.optimizers import SGD
    from keras.callbacks import ReduceLROnPlateau

    # Network hyper parameters
    l1 = 0.000
    l2 = 0.016
    dropout = 0.5
    hidden_dim = 80

    #Build neural network: input dropout, three regularized hidden layers each
    #followed by dropout, then a sigmoid output layer
    model = Sequential()
    model.add(Dropout(0.2, input_shape=(X_train.shape[1],)))
    for _ in range(3):
        # A new regularizer object is created for every layer
        wr = WeightRegularizer(l2=l2, l1=l1)
        model.add(Dense(output_dim=hidden_dim, activation="relu", W_regularizer=wr))
        model.add(Dropout(dropout))
    wr = WeightRegularizer(l2=l2, l1=l1)
    model.add(Dense(y_train.shape[1], activation='sigmoid', W_regularizer=wr))

    ##Compile model and make it ready for optimization
    model.compile(loss='binary_crossentropy', optimizer=SGD(lr=0.005, momentum=0.9, nesterov=True), metrics=['binary_crossentropy'])
    #Reduce lr callback: halve the learning rate when the monitored training
    #loss has not improved for 50 epochs, down to a floor of 1e-5
    reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50, min_lr=0.00001, verbose=1)

    #Training
    history = model.fit(X_train, y_train, nb_epoch=1000, batch_size=1000, validation_data=(X_val, y_val), callbacks=[reduce_lr])
    
    
    
    #Plot Train History
    def plot_history(history):
        """Plot training/validation loss and the learning rate per epoch.

        Parameters:
            history: object with an ``epoch`` sequence and a ``history`` dict
                containing the keys ``'binary_crossentropy'``, ``'val_loss'``
                and ``'lr'`` (as returned by ``model.fit`` in this script).

        The losses are drawn on the left axis and the learning rate on a
        twinned right axis; the figure is shown with ``plt.show()``.
        """
        # Imported locally: this function is called before the script's
        # module-level ``import matplotlib.pyplot as plt`` is reached.
        import matplotlib.pyplot as plt

        lw = 2
        _, ax1 = plt.subplots()
        train_line, = ax1.plot(history.epoch, history.history['binary_crossentropy'], c='b', label="Train", lw=lw)
        val_line, = ax1.plot(history.epoch, history.history['val_loss'], c='g', label="Val", lw=lw)
        ax1.set_ylim([0.0, max(history.history['binary_crossentropy'])])
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Loss')
        ax2 = ax1.twinx()
        lr_line, = ax2.plot(history.epoch, history.history['lr'], c='r', label="Learning Rate", lw=lw)
        ax2.set_ylabel('Learning rate')
        # A bare ``plt.legend()`` only collects artists from the current axes
        # (the twin), so the Train/Val entries were missing. Build one legend
        # from the lines of both axes instead.
        lines = [train_line, val_line, lr_line]
        ax2.legend(lines, [line.get_label() for line in lines])
        plt.show()

    plot_history(history)
    
    
    def show_auc(model):
        """Print ROC AUC values and plot ROC curves for the three data sets.

        Reads the module-level arrays ``X_train``/``y_train``,
        ``X_val``/``y_val`` and ``X_test``/``y_test`` and the property name
        ``prop`` (used in the plot title).

        Parameters:
            model: object whose ``predict(X)`` returns scores accepted by
                ``roc_auc_score`` / ``roc_curve``.
        """
        # Imported locally: neither name is in scope at module level at the
        # point where this function is first called.
        import matplotlib.pyplot as plt
        from sklearn.metrics import roc_auc_score, roc_curve

        pred_train = model.predict(X_train)
        pred_val = model.predict(X_val)
        pred_test = model.predict(X_test)

        auc_train = roc_auc_score(y_train, pred_train)
        auc_val = roc_auc_score(y_val, pred_val)
        auc_test = roc_auc_score(y_test, pred_test)
        # Single parenthesised argument: prints the same text under
        # Python 2 and Python 3.
        print("AUC, Train:%0.3F Test:%0.3F Val:%0.3F" % (auc_train, auc_test, auc_val))

        fpr_train, tpr_train, _ = roc_curve(y_train, pred_train)
        fpr_val, tpr_val, _ = roc_curve(y_val, pred_val)
        fpr_test, tpr_test, _ = roc_curve(y_test, pred_test)

        plt.figure()
        lw = 2
        plt.plot(fpr_train, tpr_train, color='b', lw=lw, label='Train ROC (area = %0.2f)' % auc_train)
        plt.plot(fpr_val, tpr_val, color='g', lw=lw, label='Val ROC (area = %0.2f)' % auc_val)
        plt.plot(fpr_test, tpr_test, color='r', lw=lw, label='Test ROC (area = %0.2f)' % auc_test)

        # Diagonal reference line from (0, 0) to (1, 1)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic of %s' % prop)
        plt.legend(loc="lower right")
        plt.interactive(True)
        plt.show()

    show_auc(model)
    
    
    #Compare with a Linear model
    from sklearn import linear_model
    from sklearn.metrics import roc_auc_score
    import matplotlib.pyplot as plt

    # The targets were reshaped to (n, 1) columns earlier in the script;
    # scikit-learn estimators expect 1-D label vectors, so flatten them here.
    y_train_flat = np.ravel(y_train)
    y_val_flat = np.ravel(y_val)
    y_test_flat = np.ravel(y_test)

    #prepare scoring lists
    fitscores = []
    predictscores = []
    ##prepare a log spaced list of alpha values to test
    alphas = np.logspace(-2, 4, num=10)
    ##Iterate through alphas and fit a logistic regression with C = 1/alpha
    for alpha in alphas:
        estimator = linear_model.LogisticRegression(C=1.0 / alpha)
        estimator.fit(X_train, y_train_flat)
        fitscores.append(estimator.score(X_train, y_train_flat))
        predictscores.append(estimator.score(X_val, y_val_flat))

    #show a plot of train (green) and validation (blue) score against alpha
    ax = plt.gca()
    ax.set_xscale('log')
    ax.plot(alphas, fitscores, 'g', label='Train')
    ax.plot(alphas, predictscores, 'b', label='Val')
    plt.xlabel('alpha')
    # ``score()`` of a classifier is mean accuracy, not a correlation
    # coefficient, so label the axis accordingly.
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    estimator = linear_model.LogisticRegression(C=1)
    estimator.fit(X_train, y_train_flat)
    #Predict the test set
    y_pred = estimator.predict(X_test)
    # ROC AUC needs a ranking score, not hard class labels: use the predicted
    # probability of the second class in ``estimator.classes_``.
    # NOTE(review): assumes binary labels with the positive class sorted
    # last (e.g. 0/1) -- confirm against the data.
    y_score = estimator.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test_flat, y_score))
    

      

     https://study.163.com/provider/400000000398149/index.htm?share=2&shareId=400000000398149( 欢迎关注博主主页,学习python视频资源,还有大量免费python经典文章)


     

    机器学习项目合作QQ:231469242

  • 相关阅读:
    OCP-1Z0-053-V13.02-638题
    OCP-1Z0-053-200题-60题-637
    OCP-1Z0-053-V13.02-637题
    OCP-1Z0-053-200题-47题-625
    OCP-1Z0-053-V13.02-625题
    OCP-1Z0-053-200题-42题-621
    OCP-1Z0-053-V13.02-621题
    OCP-1Z0-053-200题-41题-620
    OCP-1Z0-053-V13.02-620题
    OCP-1Z0-053-200题-37题-616
  • 原文地址:https://www.cnblogs.com/webRobot/p/9592068.html
Copyright © 2011-2022 走看看