zoukankan      html  css  js  c++  java
  • 读书笔记 Bioinformatics Algorithms Chapter5

    Chapter5  HOW DO WE COMPARE DNA SEQUENCES 

    Bioinformatics Algorithms-An_Active Learning Approach

    http://bioinformaticsalgorithms.com/

     
    一、
    1983年,Russell Doolitte 将血小板源生长因子[platelet derived growth factor(PDGF),一种刺激细胞增殖的物质]和其它已知基因比对,发现它的序列和原癌基因(oncogene)的序列有很高的相似度。这让科学家猜测某些癌症是因为基因在不合适的时机作用所致(scientists hypothesized that some forms of cancer might be caused by a good gene doing the right thing at the wrong time.)。
    二、提出问题 序列比对:动态规划法
     
    1.全局比对:
    状态转移方程
     
      1 '''
      2 Code Challenge: Solve the Global Alignment Problem.
      3      Input: Two protein strings written in the single-letter amino acid alphabet.
      4      Output: The maximum alignment score of these strings followed by an alignment achieving this maximum score. Use the
      5     BLOSUM62 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
      6 ----------
      7 Sample Input:
      8 PLEASANTLY
      9 MEANLY
     10 ----------
     11 Sample Output:
     12 8
     13 PLEASANTLY
     14 -MEA--N-LY
     15 ----------
     16 @ Lo Kowngho  2018.9
     17 '''
     18 import numpy
     19 from os.path import dirname
     20 
     21 def Grade(Symb1,Symb2):
     22     Indx1 = symbolList[Symb1]
     23     Indx2 = symbolList[Symb2]
     24     return matrix[Indx1][Indx2]
     25     
     26 def Init_Graph_Global(l1,l2):
     27     Graph = numpy.zeros([l2,l1])
     28     for x in range(1,l2):
     29         Graph[x][0] = Graph[x-1][0]-5
     30     for y in range(1,l1):
     31         Graph[0][y] = Graph[0][y-1]-5
     32     return Graph
     33         
     34 def Init_Path(l1,l2):
     35     Path = numpy.zeros([l2,l1])
     36     for x in range(1,l2):
     37         Path[x][0] = 1
     38     for y in range(1,l1):
     39         Path[0][y] = 2
     40     return Path
     41 
     42 def buildGlobalAlignmentGraph(text1,text2):
     43     
     44     l1 = len(text1)
     45     l2 = len(text2)
     46     Graph = Init_Graph_Global(l1, l2)
     47     Path = Init_Path(l1, l2)
     48         
     49     for x in range(1,l2):
     50         for y in range(1,l1):
     51             Graph[x][y] = max(Graph[x-1][y]-5, Graph[x][y-1]-5, Graph[x-1][y-1]+Grade(text1[y],text2[x]))
     52             if Graph[x-1][y]-5==Graph[x][y]:
     53                 Path[x][y]=1
     54             elif Graph[x][y-1]-5==Graph[x][y]:
     55                 Path[x][y]=2
     56             else:
     57                 Path[x][y]=3
     58     return [Graph,Path]
     59     
     60     
     61 def OutputGlobalAligement(Path,Graph,text1,text2):
     62     outT1 = ''
     63     outT2 = ''
     64     l1 = len(text1)
     65     l2 = len(text2)
     66     x = l2-1
     67     y = l1-1
     68     while(x!=0 or y!=0):
     69         if Path[x][y]==1:
     70             outT1 += '-'
     71             outT2 += text2[x]
     72             x -= 1            
     73         elif Path[x][y]==2:
     74             outT1 += text1[y]
     75             outT2 += '-'
     76             y -= 1
     77         else:
     78             outT1 += text1[y]
     79             outT2 += text2[x]
     80             x -= 1
     81             y -= 1
     82     return [outT1[::-1],outT2[::-1]]    
     83     
     84 def ImportScoreMatrix():
     85     dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('
    ')
     86     symbolList = dataset[0].split()
     87     for i in range(len(symbolList)):
     88         symbolList[i]=[symbolList[i],i]
     89     symbolList = dict(symbolList)
     90     print(symbolList)
     91     matrix = []
     92     for i in range(1,len(dataset)):
     93         matrix.append(dataset[i].split()[1:])
     94     for l in range(len(matrix)):
     95         for i in range(len(matrix[l])):
     96             matrix[l][i]=int(matrix[l][i])
     97     return [matrix,symbolList]
     98 
     99 if __name__ == '__main__':
    100     
    101     [matrix,symbolList] = ImportScoreMatrix()
    102 
    103     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
    104     text1 = '0'+dataset[0]
    105     text2 = '0'+dataset[1]
    106     
    107     [Graph,Path] = buildGlobalAlignmentGraph(text1, text2)
    108     
    109     [outT1,outT2] = OutputGlobalAligement(Path,Graph,text1,text2)
    110     
    111     print(int(Graph[-1][-1]))
    112     print(outT1)
    113     print(outT2)
    全局比对 python
     
    2. 局部比对
    可以把局部比对想象成下面的Free Taxi场景,在开始和结尾都不受罚分约束,只在中间的某一过程受罚分约束。
                  
    在全局比对的基础上,状态转移方程在加上一个0,表示每一个点,既可以由→、↓、↘经过罚分得到,也可以直接由起点,不经罚分得到(Free Taxi)。
      1 '''
      2 Code Challenge: Solve the Local Alignment Problem.
      3      Input: Two protein strings written in the single-letter amino acid alphabet.
      4      Output: The maximum score of a local alignment of the strings, followed by a local alignment of these strings achieving the maximum
      5      score. Use the PAM250 scoring matrix for matches and mismatches as well as the indel penalty σ = 5.
      6 ---------------
      7 Sample Input:
      8 MEANLY
      9 PENALTY
     10 ---------------
     11 Sample Output:
     12 15
     13 EANL-Y
     14 ENALTY
     15 ---------------
     16 Lo Kwongho 2018.9
     17 '''
     18 import numpy
     19 from os.path import dirname
     20 
     21 def Grade(Symb1,Symb2):
     22     Indx1 = symbolList[Symb1]
     23     Indx2 = symbolList[Symb2]
     24     return matrix[Indx1][Indx2]
     25 
     26 def Init_Graph_Local(l1,l2):
     27     Graph = numpy.zeros([l1,l2])
     28     return Graph
     29         
     30 def Init_Path(l1,l2):
     31     Path = numpy.ones([l1,l2])*-1
     32     for x in range(1,l1):
     33         Path[x][0] = 1
     34     for y in range(1,l2):
     35         Path[0][y] = 2
     36     return Path
     37 
     38 def buildLocalAlignmentGraph(text1,text2):
     39     l1 = len(text1)
     40     l2 = len(text2)
     41     Graph = Init_Graph_Local(l1, l2)
     42     Path = Init_Path(l1, l2)
     43     
     44     for x in range(1,l1):
     45         for y in range(1,l2):
     46             Graph[x][y] = max(Graph[x-1][y]-5, Graph[x][y-1]-5, Graph[x-1][y-1]+Grade(text1[x],text2[y]),0)
     47             if Graph[x-1][y]-5 == Graph[x][y]:
     48                 Path[x][y] = 1
     49             elif Graph[x][y-1]-5==Graph[x][y]:
     50                 Path[x][y] = 2
     51             elif Graph[x][y] == 0:
     52                 Path[x][y] = 0
     53             else:
     54                 Path[x][y] = 3
     55     maxVal = 0
     56     maxIndx = [-1,-1]
     57     for x in range(1,l1):
     58         for y in range(1,l2):
     59             if Graph[x][y]>maxVal:
     60                 maxVal=Graph[x][y]
     61                 maxIndx=[x,y]
     62                 
     63     return [Graph,Path,maxIndx]
     64 
     65 def OutputLocalAligement(Path,Graph,text1,text2,maxIndx):
     66     outT1 = ''
     67     outT2 = ''
     68     l1 = len(text1)
     69     l2 = len(text2)
     70     x = maxIndx[0]
     71     y = maxIndx[1]
     72     while(x!=0 or y!=0):
     73         if Path[x][y]==1:
     74             outT1 += text1[x]
     75             outT2 += '-'
     76             x -= 1            
     77         elif Path[x][y]==2:
     78             outT1 += '-'
     79             outT2 += text2[y]
     80             y -= 1
     81         elif Path[x][y]==3:
     82             outT1 += text1[x]
     83             outT2 += text2[y]
     84             x -= 1
     85             y -= 1
     86         else:
     87             x=0
     88             y=0
     89     return [outT1[::-1],outT2[::-1]]    
     90 
     91 
     92 def ImportScoreMatrix():
     93     dataset = open(dirname(__file__)+'PAM250.txt').read().strip().split('
    ')
     94     symbolList = dataset[0].split()
     95     for i in range(len(symbolList)):
     96         symbolList[i]=[symbolList[i],i]
     97     symbolList = dict(symbolList)
     98     matrix = []
     99     for i in range(1,len(dataset)):
    100         matrix.append(dataset[i].split()[1:])
    101     for l in range(len(matrix)):
    102         for i in range(len(matrix[l])):
    103             matrix[l][i]=int(matrix[l][i])
    104     return [matrix,symbolList]
    105     
    106 if __name__ == '__main__':
    107     [matrix,symbolList] = ImportScoreMatrix()
    108     
    109     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
    110     text1 = '0'+dataset[0]
    111     text2 = '0'+dataset[1]
    112     
    113     [Graph,Path,maxIndx] = buildLocalAlignmentGraph(text1,text2)
    114     
    115     [outT1,outT2]=OutputLocalAligement(Path,Graph,text1,text2,maxIndx)
    116     print(int(Graph[maxIndx[0]][maxIndx[1]]))
    117     print(outT1)
    118     print(outT2)
    局部比对 Python
    3. Overlarp Alignment
      1 '''
      2 Code Challenge: Solve the Overlap Alignment Problem.
      3    >>Input: Two strings v and w, each of length at most 1000.
      4    >>Output: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v' of v and a prefix w' of w.
      5     achieving this maximum score. Use an alignment score in which matches count +1 and both the mismatch and indel penalties are 2.
      6 -------------------
      7 Sample Input:
      8 PAWHEAE
      9 HEAGAWGHEE
     10 -------------------
     11 Sample Output:
     12 1
     13 HEAE
     14 HEAG
     15 -------------------
     16 coder: Lo Kwongho
     17 '''
     18 
     19 import numpy
     20 from os.path import dirname
     21 
     22 def Init_Graph_Overlap(l1,l2):
     23     Graph = numpy.zeros([l1,l2])
     24     for y in range(1,l2):
     25         Graph[0][y] = Graph[0][y-1]-1
     26     return Graph
     27         
     28 def Init_Path(l1,l2):
     29     Path = numpy.ones([l1,l2])*-1
     30     for x in range(1,l1):
     31         Path[x][0] = 1
     32     for y in range(1,l2):
     33         Path[0][y] = 2
     34     return Path
     35 
     36 def buildOverlapAlignmentGraph(text1,text2):
     37     l1 = len(text1)
     38     l2 = len(text2)
     39     Graph = Init_Graph_Overlap(l1, l2)
     40     Path = Init_Path(l1,l2)
     41     for x in range(1,l1):
     42         for y in range(1,l2):
     43             if text1[x]==text2[y]:
     44                 Graph[x][y]=max(Graph[x-1][y-1]+1,Graph[x-1][y]-2,Graph[x][y-1]-2)
     45             else:
     46                 Graph[x][y]=max(Graph[x-1][y-1]-2,Graph[x-1][y]-2,Graph[x][y-1]-2)
     47             if Graph[x][y]==Graph[x-1][y]-2:
     48                 Path[x][y]=1
     49             elif Graph[x][y]==Graph[x][y-1]-2:
     50                 Path[x][y]=2
     51             else:
     52                 Path[x][y]=3
     53         
     54     maxVal = 0
     55     maxIndx = -1
     56     for i in range(l2):
     57         if Graph[l1-1][i]>maxVal:
     58             maxVal=Graph[l1-1][i]
     59             maxIndx=i
     60                     
     61     return [Graph,Path,maxIndx,maxVal]    
     62     
     63 def OutputOverlapAligement(Path,Graph,text1,text2,maxIndx):
     64     outT1 = ''
     65     outT2 = ''
     66     l1 = len(text1)
     67     l2 = len(text2)
     68     x = l1-1
     69     y = maxIndx
     70     while(y!=0):
     71         if Path[x][y]==1:
     72             outT1 += text1[x]
     73             outT2 += '-'
     74             x -= 1            
     75         elif Path[x][y]==2:
     76             outT1 += '-'
     77             outT2 += text2[y]
     78             y -= 1
     79         elif Path[x][y]==3:
     80             outT1 += text1[x]
     81             outT2 += text2[y]
     82             x -= 1
     83             y -= 1
     84         else:
     85             x=0
     86             y=0
     87     return [outT1[::-1],outT2[::-1]]    
     88         
     89 
     90 if __name__ == '__main__':
     91     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
     92     text1 = '0'+dataset[0]
     93     text2 = '0'+dataset[1]
     94     l1 = len(text1)
     95     l2 = len(text2)
     96     [Graph,Path,maxIndx,maxVal] = buildOverlapAlignmentGraph(text1,text2)
     97     #print(Graph)
     98         
     99     [outText1,outText2]=OutputOverlapAligement(Path, Graph, text1, text2, maxIndx)
    100 
    101     print(int(maxVal))
    102     print(outText1)
    103     print(outText2)
    Overlarp in python
    4.Fitting Alignment 
      1 '''
      2 Fitting Alignment Problem: Construct a highest-scoring fitting alignment between two strings.
      3    >>Input: Strings v and w as well as a matrix Score.
      4    >>Output: A highest-scoring fitting alignment of v and w as defined by the scoring matrix Score.
      5 -------------------
      6 Sample Input:
      7 GTAGGCTTAAGGTTA
      8 TAGATA
      9 -------------------
     10 Sample Output:
     11 2
     12 TAGGCTTA
     13 TAGA--TA
     14 -------------------
     15 coder: Lo Kwongho
     16 '''
     17 
     18 import numpy
     19 from os.path import dirname
     20 
     21 def Init_Graph_Fiting(l1,l2):
     22     Graph = numpy.zeros([l1,l2])
     23     for y in range(1,l2):
     24         Graph[0][y] = Graph[0][y-1]-1
     25     return Graph
     26         
     27 def Init_Path(l1,l2):
     28     Path = numpy.ones([l1,l2])*-1
     29     for x in range(1,l1):
     30         Path[x][0] = 1
     31     for y in range(1,l2):
     32         Path[0][y] = 2
     33     return Path
     34 
     35 def buildFittingAlignmentGraph(text1,text2):
     36     l1 = len(text1)
     37     l2 = len(text2)
     38     Graph = Init_Graph_Fiting(l1, l2)
     39     Path = Init_Path(l1,l2)
     40     for x in range(1,l1):
     41         for y in range(1,l2):
     42             if text1[x]==text2[y]:
     43                 Graph[x][y]=max(Graph[x-1][y-1]+1,Graph[x-1][y]-1,Graph[x][y-1]-1)
     44             else:
     45                 Graph[x][y]=max(Graph[x-1][y-1]-1,Graph[x-1][y]-1,Graph[x][y-1]-1)
     46             if Graph[x][y]==Graph[x-1][y]-1:
     47                 Path[x][y]=1
     48             elif Graph[x][y]==Graph[x][y-1]-1:
     49                 Path[x][y]=2
     50             else:
     51                 Path[x][y]=3
     52     
     53     maxVal = 0
     54     maxIndx = -1
     55     for i in range(l1):
     56         if Graph[i][l2-1]>maxVal:
     57             maxVal=Graph[i][l2-1]
     58             maxIndx=i
     59                         
     60     return [Graph,Path,maxIndx,maxVal]    
     61     
     62 def OutputFittingAligement(Path,Graph,text1,text2,maxIndx):
     63     outT1 = ''
     64     outT2 = ''
     65     l1 = len(text1)
     66     l2 = len(text2)
     67     x = maxIndx
     68     y = l2-1
     69     while(y!=0):
     70         if Path[x][y]==1:
     71             outT1 += text1[x]
     72             outT2 += '-'
     73             x -= 1            
     74         elif Path[x][y]==2:
     75             outT1 += '-'
     76             outT2 += text2[y]
     77             y -= 1
     78         elif Path[x][y]==3:
     79             outT1 += text1[x]
     80             outT2 += text2[y]
     81             x -= 1
     82             y -= 1
     83         else:
     84             x=0
     85             y=0
     86     return [outT1[::-1],outT2[::-1]]    
     87         
     88 
     89 if __name__ == '__main__':
     90     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
     91     text1 = '0'+dataset[0]
     92     text2 = '0'+dataset[1]
     93     l1 = len(text1)
     94     l2 = len(text2)
     95     [Graph,Path,maxIndx,maxVal] = buildFittingAlignmentGraph(text1,text2)
     96     
     97     [outText1,outText2]=OutputFittingAligement(Path, Graph, text1, text2, maxIndx)
     98     #print(Graph)
     99     print(int(maxVal))
    100     print(outText1)
    101     print(outText2)
    Fitting Alignment in python
    这四种比对的关系如图:
     
    全局比对                    局部比对
    Overlarp Alignment                 Fitting Alignment
    5、基因的插入和删除,通常都是连续的一段,故在比对出现的连续空位,应该把它们当作一个整体看待。在比对的空位罚分中,生物学家认为,在每一条链上新开一个空位,应罚重分,而空位的延续,罚分应较少:
    解决问题的方法是:开三个矩阵,每个矩阵代表一种方向。在→、↓方向上行走,代表产生空位。故每当从↘转移到→、↓,或者→、↓间转移,代表在某链上产生新空位,重罚,而在→、↓内转移,代表空位延续,轻罚。
     
                         
      1 '''
      2 Code Challenge: Solve the Alignment with Affine Gap Penalties Problem.
      3    >>Input: Two amino acid strings v and w (each of length at most 100).
      4    >>Output: The maximum alignment score between v and w, followed by an alignment of v and w achieving this maximum score. Use the
      5      BLOSUM62 scoring matrix, a gap opening penalty of 11, and a gap extension penalty of 1.
      6 ---------------------
      7 Sample Input:
      8 PRTEINS
      9 PRTWPSEIN
     10 ---------------------
     11 Sample Output:
     12 8
     13 PRT---EINS
     14 PRTWPSEIN-
     15 ---------------------
     16 coder: Lo Kwongho
     17 '''
     18 import numpy
     19 from os.path import dirname
     20 negINFINITY = -999
     21 #Penalties
     22 gs = -10 #gap_Start
     23 ge = -1  #gap_Extend
     24 #
     25 def Grade(Symb1,Symb2):
     26     Indx1 = symbolList[Symb1]
     27     Indx2 = symbolList[Symb2]
     28     return matrix[Indx1][Indx2]
     29 
     30 def initGraph(l1,l2):
     31     Graph = [numpy.zeros([l1,l2]    ,dtype=int) for i in range(3)]
     32 
     33     Graph[1][0][0] = negINFINITY
     34     Graph[2][0][0] = negINFINITY
     35     for x in range(1,l1):
     36         Graph[0][x][0]=negINFINITY
     37         if x==1:
     38             Graph[1][x][0]=ge+gs
     39         else:
     40             Graph[1][x][0]=Graph[1][x-1][0]+ge
     41         Graph[2][x][0]=negINFINITY
     42     for y in range(1,l2):
     43         Graph[0][0][y]=negINFINITY
     44         if y ==1:
     45             Graph[2][0][y]=ge+gs
     46         else:
     47             Graph[2][0][y]=Graph[2][0][y-1]+ge
     48         Graph[1][0][y]=negINFINITY
     49     return Graph
     50     
     51 def Init_Path(l1,l2):
     52     Path = [numpy.ones([l1,l2])*-1 for i in range(3)]
     53     '''for x in range(1,l1):
     54         Path[x][0] = 1
     55     for y in range(1,l2):
     56         Path[0][y] = 2'''
     57     return Path
     58     
     59 def buildAlignmentGraph(text1,text2,l1,l2):
     60 
     61     Graph = initGraph(l1,l2)
     62     #Path = #Init_Path(l1,l2)
     63     for x in range(1,l1):
     64         for y in range(1,l2):                
     65             # X ######
     66             Graph[1][x][y]=max(gs+ge+Graph[0][x-1][y],gs+ge+Graph[2][x-1][y],ge+Graph[1][x-1][y])
     67 
     68                 
     69             # Y ###### 
     70             Graph[2][x][y]=max(gs+ge+Graph[0][x][y-1],gs+ge+Graph[1][x][y-1],ge+Graph[2][x][y-1])
     71 
     72             # M ######
     73             Graph[0][x][y]=Grade(text1[x], text2[y])+max(Graph[0][x-1][y-1],Graph[1][x-1][y-1],Graph[2][x-1][y-1])
     74 
     75     maxVal = 0
     76     maxIndx = -1
     77     for i in range(3):
     78         if Graph[i][l1-1][l2-1]>maxVal:
     79             maxVal=Graph[i][l1-1][l2-1]
     80             maxIndx=i
     81     return [Graph,maxIndx,maxVal]
     82 
     83 def trackBack(Graph,maxIndx,text1,text2):
     84     x = len(text1)-1
     85     y = len(text2)-1
     86     otext1 = ''
     87     otext2 = ''
     88     Indx = maxIndx
     89     while(1):
     90         if Indx==0:
     91             otext1 += text1[x]
     92             otext2 += text2[y]
     93             if x ==1:
     94                 break
     95             if Graph[0][x][y]==Graph[1][x-1][y-1]+Grade(text1[x], text2[y]):
     96                 Indx = 1
     97             elif Graph[0][x][y]==Graph[2][x-1][y-1]+Grade(text1[x], text2[y]):
     98                 Indx = 2
     99             else:
    100                 Indx = 0
    101             x -= 1
    102             y -= 1
    103         elif Indx==1:
    104             otext1 += text1[x]
    105             otext2 += '-'
    106             if x == 1:
    107                 break
    108             if Graph[1][x][y]==Graph[0][x-1][y]+ge+gs:
    109                 Indx = 0
    110             elif Graph[1][x][y]==Graph[2][x-1][y]+ge+gs:
    111                 Indx = 2
    112             else:
    113                 Indx = 1
    114             x -= 1
    115         else:
    116             otext1 += '-'
    117             otext2 += text2[y]
    118             if y == 1:
    119                 break
    120             if Graph[2][x][y]==Graph[0][x][y-1]+ge+gs:
    121                 Indx = 0
    122             elif Graph[2][x][y]==Graph[1][x][y-1]+ge+gs:
    123                 Indx = 1
    124             else:
    125                 Indx = 2
    126             y -= 1
    127                 
    128     return [otext1[::-1],otext2[::-1]]
    129         
    130 def ImportScoreMatrix():
    131     dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('
    ')
    132     symbolList = dataset[0].split()
    133     for i in range(len(symbolList)):
    134         symbolList[i]=[symbolList[i],i]
    135     symbolList = dict(symbolList)
    136     matrix = []
    137     for i in range(1,len(dataset)):
    138         matrix.append(dataset[i].split()[1:])
    139     for l in range(len(matrix)):
    140         for i in range(len(matrix[l])):
    141             matrix[l][i]=int(matrix[l][i])
    142     return [matrix,symbolList]
    143 
    144 
    145 if __name__ == '__main__':
    146     [matrix,symbolList] = ImportScoreMatrix() # 打分矩阵
    147     
    148     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
    149     text1 = '0'+dataset[0]
    150     text2 = '0'+dataset[1]
    151     l1 = len(text1)
    152     l2 = len(text2)
    153     [Graph,maxIndx,maxVal] = buildAlignmentGraph(text1, text2, l1, l2)
    154     #print(Graph)
    155     
    156     [output_text1,output_text2] = trackBack(Graph,maxIndx,text1,text2)
    157     print(maxVal)
    158     print(output_text1)
    159     print(output_text2)
    160     
    Alignment with Affine Gap Penalties
    6 * 一种线性空间的比对方法 Space-Efficient Sequence Alignment(分治+动态规划)
    https://www.cs.rit.edu/~rlaz/algorithms20082/slides/SpaceEfficientAlignment.pdf
      1 '''
      2 Code Challenge: Implement LinearSpaceAlignment to solve the Global Alignment Problem for a large dataset.
      3 >>>Input: Two long (10000 amino acid) protein strings written in the single-letter amino acid alphabet.
      4 >>>Output: The maximum alignment score of these strings, followed by an alignment achieving this maximum score. Use the
      5 
      6 BLOSUM62 scoring matrix and indel penalty σ = 5.
      7 ------------
      8 Sample Input:
      9 PLEASANTLY
     10 MEANLY
     11 ------------
     12 Sample Output:
     13 8
     14 PLEASANTLY
     15 -MEA--N-LY
     16 ------------
     17 coder: Lo Kwongho in 2018.9
     18 '''
     19 from os.path import dirname
     20 import numpy
     21 #
     22 indel = -5
     23 negINF = -9999
     24 #
     25 #
     26 def Grade(Symb1,Symb2):
     27     Indx1 = symbolList[Symb1]
     28     Indx2 = symbolList[Symb2]
     29     return matrix[Indx1][Indx2]
     30 
     31 def ImportScoreMatrix():
     32     dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('
    ')
     33     symbolList = dataset[0].split()
     34     for i in range(len(symbolList)):
     35         symbolList[i]=[symbolList[i],i]
     36     symbolList = dict(symbolList)
     37     matrix = []
     38     for i in range(1,len(dataset)):
     39         matrix.append(dataset[i].split()[1:])
     40     for l in range(len(matrix)):
     41         for i in range(len(matrix[l])):
     42             matrix[l][i]=int(matrix[l][i])
     43     return [matrix,symbolList]
     44 #
     45 def half_Alignment(v,w):
     46     nv = len(v)
     47     mw = len(w)
     48     s = numpy.zeros(shape=(nv+1,2),dtype=int)
     49     for i in range(nv+1):
     50         s[i,0] = indel*i
     51     if mw==0:
     52         return s[:,0] #
     53     for j in range(mw):
     54         s[0,1]=s[0,0]+indel
     55         for i in range(nv):
     56             s[i+1,1]=max(s[i,1]+indel,s[i+1,0]+indel,s[i,0]+Grade(w[j],v[i]))
     57         s[:,0]=s[:,1]
     58     return s[:,1]
     59 
     60 def midEdge(v,w):
     61     nv = len(v)
     62     mw = len(w)
     63     mid = int((mw-1)/2)
     64     wl = w[:mid]
     65     wr = w[mid+1:]
     66     pre = half_Alignment(v,wl)
     67     suf = half_Alignment(v[::-1],wr[::-1])[::-1]
     68     hs = [pre[i]+suf[i]+indel  for i in range(nv+1)]
     69     ds = [pre[i]+suf[i+1]+Grade(w[mid],v[i])  for i in range(nv)]
     70     maxhs = max(hs)
     71     maxds = max(ds)
     72     if maxhs>maxds:
     73         return ( (hs.index(maxhs),mid) ,(hs.index(maxhs),mid+1) )
     74     else:
     75         return ( (ds.index(maxds),mid) ,(ds.index(maxds)+1,mid+1) )
     76 
     77 def build_Alignment_track(v,w):
     78     vn = len(v)
     79     wm = len(w)
     80     if vn==0 and wm==0:
     81         return []
     82     elif vn==0:
     83         return ['-']*wm
     84     elif wm==0:
     85         return ['|']*vn
     86     ((x1,y1),(x2,y2)) = midEdge(v,w)
     87     if x1==x2:
     88         edge = ['-']
     89     else:
     90         edge = ['\']
     91     wleft = w[:y1]
     92     wright = w[y2:]
     93     vupper = v[:x1]
     94     vbotm = v[x2:]
     95 
     96     upper_left_track = build_Alignment_track(vupper,wleft)
     97     bottom_right_track = build_Alignment_track(vbotm,wright)
     98     return upper_left_track+edge+bottom_right_track
     99 
    100 def trackToString(v,w,track):
    101     vi = 0
    102     wj = 0
    103     outv = ''
    104     outw = ''
    105     score = 0
    106     for i in track:
    107         if i == '|':
    108             outv += v[vi]
    109             outw += '-'
    110             score += indel
    111             vi += 1    
    112         elif i == '-':
    113             outv += '-'
    114             outw += w[wj]
    115             score += indel
    116             wj += 1            
    117         else:
    118             outv += v[vi]
    119             outw += w[wj]
    120             score += Grade(v[vi], w[wj])
    121             vi += 1
    122             wj += 1
    123             
    124     return [score,outv,outw]
    125 
    126 def LinearSpaceAlignment(v,w):
    127     track = build_Alignment_track(v,w)
    128     [score,outv, outw] = trackToString(v,w,track)
    129     print(score)
    130     print(outv)
    131     print(outw)
    132 
    133 if __name__ == '__main__':
    134     dataset = open(dirname(__file__)+'dataset.txt').read().strip().split()
    135     [matrix,symbolList] = ImportScoreMatrix()
    136     v = dataset[0]
    137     w = dataset[1]
    138     LinearSpaceAlignment(v,w)
    Linear-Space Alignment
  • 相关阅读:
    cmd命令之set详解
    微信公众号之推送消息
    总结js(1)
    本地文件夹变远程仓库并且提交Github
    npm之使用淘宝源
    页面倒计时返回
    在线sass编译器
    js条件语句之职责链数组
    【轉】靜
    css 實現微信聊天類似的氣泡
  • 原文地址:https://www.cnblogs.com/lokwongho/p/9688047.html
Copyright © 2011-2022 走看看