zoukankan      html  css  js  c++  java
  • 数模国赛——致病基因

    ## 读取数据
    setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件")
    phenotype <- read.table("phenotype.txt",header = F,encoding = "UTF-8")
    multi_phenos <- read.table("multi_phenos.txt",header = F,encoding = "UTF-8")
    genotype <- read.table("genotype.dat",header = T,encoding = "UTF-8",colClasses ='character')
    # genotype <- read.table("genotype.dat",header = T,encoding = "UTF-8",stringsAsFactors = F)
    #gene_info <- read.table("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/gene_info/gene_1.dat")
    
    setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/gene_info")
    # filelist <- list.files(pattern=".*.dat")
    # datalist <- lapply(filelist, function(x) read.table(x, header=F, stringsAsFactors = F)) 
    #datafr <- do.call("rbind", datalist)
    # gene_info <- c()
    # for(i in 1:length(filelist)){
    #   gene_info = rbind(gene_info, data.frame(genotype=datalist[[i]],geneinfo=i))
    # }
    # colnames(gene_info) <- c('genotype','geneinfo')
    gene_info <- c()
    for(i in 1:300){
      filename = paste('gene_',i,'.dat',sep ="")
      temp = read.table(filename, header=F, stringsAsFactors = F)
      gene_info = rbind(gene_info, data.frame(genotype=temp,geneinfo=i))
    }
    
    colnames(gene_info) <- c('genotype','geneinfo')
    unique(gene_info$geneinfo)
    temp = c()
    for(i in 1:ncol(genotype)){
      temp[i] = which(colnames(genotype)==gene_info$genotype[i])
    }
    # gene_info <- gene_info[temp,]
    gene_info <- cbind(gene_info,data.frame(column=temp))
    unique(1:9445-gene_info$column)
    unique(gene_info$geneinfo)
    
    # rownames(gene_info) <- 1:ncol(genotype)
    # sort(unique(gene_info$geneinfo));length(unique(gene_info$geneinfo))
    # head(gene_info)
    
    #datafr <- as.data.frame(datafr)
    
    ## 数据预处理
    ## 数据更正
    setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件")
    aa=as.character(genotype[1,]); unique(aa)
    unique(c(as.matrix(genotype))) #查看数据类型及异常值检测
    ch1 <- c('II','ID','DD')
    ch2 <- c('TT','TC','CC')
    (ss=which(aa %in% ch1,arr.ind = T));
    colnames(genotype[,ss]);rm(aa) ##定位异常值数据
    # ss=which(genotype=='II',arr.ind = T)
    # ss=unique(ss[,2])
    genotype[1:10,ss] #c(6070,8473)
    for(i in ss){
      for(j in 1:3){
        genotype[[i]][genotype[[i]]==ch1[j]]=ch2[j]
      }
    }
    genotype[1:10,ss]
    #修正后数据保存
    save.image("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/data.RData")
    ## 数据编码
    (ch1 <- paste(rep(c('A','T','C','G'),4), rep(c('A','T','C','G'),each=4), sep = ""))
    (ch2 <- paste(rep(1:4,4), rep(1:4,each=4), sep = ""))
    genotype[1:10,1:10] #c(6070,8473)
    for(i in 1:9445){
      for(j in 1:16){
        genotype[[i]][genotype[[i]]==ch1[j]] <- ch2[j]
      }
    }
    genotype[1:10,1:10];class(genotype[[1]])
    # genotype=apply(genotype, MARGIN = 2, FUN = as.integer)
    genotype=apply(as.matrix(genotype), MARGIN = 2, FUN = as.integer)
    genotype <- as.data.frame(genotype)
    genotype[1:10,1:10];class(genotype[[1]])
    ## 编码后数据保存
    save.image("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/sdata.RData")
    
    #############################################################################################
    #问题二求解
    ##临界值法
    row=1000;col=9445
    alpha=0.999   #c(0.95,0.99,0.999)
    cc=qchisq(alpha, df=2)  
    wh=c()
    ## 卡方检验
    for(i in 1:col){
      genotype_i = genotype[[i]]
      me = table(genotype_i)/2
      hb = rep(0,length(me)); names(hb)=names(me)
      temp = table(genotype_i[501:1000])
      for(j in 1:length(temp)){
        hb[names(hb)==names(temp)[j]]=temp[j]
      }
      
    #   jk = rep(0,length(aa)); names(jk)=names(me)
    #   temp = table(genotype_i[1:500])
    #   for(j in 1:length(temp)){
    #     jk[names(jk)==names(temp)[j]]=temp[j]
    #   }
      
      jk=me*2-hb
      chisq = 2*sum((hb-me)^2/me)
      if(chisq > cc){
        wh=c(wh,i)
      }
    
    }
    #length(wh);wh;colnames(genotype)[wh]
    
    ## 优比检验
    cc=qchisq(alpha, df=1)  #c(0.95,0.99,0.999)
    wh1=c()
    for(i in 1:length(wh)){
      genotype_i = genotype[,wh[i]]
      me = table(genotype_i)/2
      hb = rep(0,length(me)); names(hb)=names(me)
      temp = table(genotype_i[501:1000])
      for(j in 1:length(temp)){
        hb[names(hb)==names(temp)[j]]=temp[j]
      }
      
      jk=me*2-hb
      or <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2]))
      a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2])
      u2 <- (log(or))^2/a
      if(u2 > cc){
        wh1=c(wh1,wh[i])
      }
    }
    
    #length(wh1);wh[wh1];colnames(genotype)[wh[wh1]]
    
    #结果整理导出
    wh1=data.frame(column=wh1,genotype=colnames(genotype)[wh1],stringsAsFactors = F)
    wh=data.frame(column=wh,genotype=colnames(genotype)[wh],stringsAsFactors = F)
    write.csv(wh, 'wh.csv')
    write.csv(wh1, 'wh1.csv')
    
    
    
    
    
    ## p值法
    row=1000;col=9445
    pvalue=pvalue1=MAF=c()
    for(i in 1:col){
      genotype_i = genotype[[i]]
      me = table(genotype_i)/2
      ## 最小等位基因频率MAF
      MAF[i] = (min(me[1],me[3])+me[2]/2)/sum(me)
      hb = rep(0,length(me)); names(hb)=names(me)
      temp = table(genotype_i[501:1000])
      for(j in 1:length(temp)){
        hb[names(hb)==names(temp)[j]]=temp[j]
      }
      
      #jk = rep(0,length(aa)); names(jk)=names(me)
      #temp = table(genotype_i[1:500])
      #for(j in 1:length(temp)){
      #  jk[names(jk)==names(temp)[j]]=temp[j]
      #}
      
      jk=me*2-hb
      
      ## 卡方检验p值
      chisq = 2*sum((hb-me)^2/me)
      pvalue[i] = pchisq(chisq, df=2, lower.tail = F)
      
      ## 优比检验p值
      or <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2]))
      a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2])
      u2 <- (log(or))^2/a
      pvalue1[i] = pchisq(u2, df=1, lower.tail = F)
      
    }
    rm(genotype_i,me,hb,temp,jk,chisq,or,a,u2)
    
    ##最小等位基因频率MAF范围
    c(min(MAF),max(MAF))
    
    ##结果存储
    p_chisq <- data.frame(genotype=colnames(genotype), p_value=pvalue, stringsAsFactors = F)
    p_or <- data.frame(genotype=colnames(genotype), p_value=pvalue1, stringsAsFactors = F)
    write.csv(p_chisq, 'p_chisq.csv')
    write.csv(p_or, 'p_or.csv')
    
    ##显著性检验
    alpha <- 0.001
    (chisq <- p_chisq[p_chisq[[2]]<alpha,])
    #p_or[p_or[[2]]<alpha,]
    temp <- p_or[p_chisq$p_value<alpha,]
    (or <- temp[temp$p_value<alpha,])
    write.csv(chisq, 'chisq.csv')
    write.csv(or, 'or.csv')
    
    
    ############################################################################################
    
    ##第三问求解
    ## 基因定位问题
    ## 对于临界值准则
    # xx=yy=c()
    # for(i in 1:nrow(wh)){
    #   xx[i]=gene_info$geneinfo[which(gene_info$genotype==wh$genotype[i])]
    # }
    # for(i in 1:nrow(wh1)){
    #   yy[i]=gene_info$geneinfo[which(gene_info$genotype==wh1$genotype[i])]
    # }
    # names(xx)=wh$genotype;xx
    # names(yy)=wh1$genotype;yy
    
    chisq_gene = or_gene = c()
    for(i in 1:nrow(wh)){
      chisq_gene=rbind(chisq_gene, gene_info[which(gene_info$column==wh$column[i]),])
    }
    for(i in 1:nrow(wh1)){
      or_gene=rbind(or_gene, gene_info[which(gene_info$column==wh1$column[i]),])
    }
    chisq_gene;or_gene
    head(genotype[,chisq_gene$column])
    # chisq_gene <- chisq_gene[order(chisq_gene$geneinfo),];chisq_gene
    # or_gene <- or_gene[order(or_gene$geneinfo),];or_gene
    length(chisq_gene$geneinfo);length(unique(chisq_gene$geneinfo))
    length(or_gene$geneinfo);length(unique(or_gene$geneinfo))
    
    
    ## 对于p值准则
    # xx=yy=c()
    # for(i in 1:nrow(chisq)){
    #   xx[i]=gene_info$geneinfo[which(gene_info$genotype==chisq$genotype[i])]
    # }
    # for(i in 1:nrow(or)){
    #   yy[i]=gene_info$geneinfo[which(gene_info$genotype==or$genotype[i])]
    # }
    # xx;yy
    # length(xx);length(unique(xx))
    # length(yy);length(unique(yy))
    
    chisq_gene = or_gene = c()
    for(i in 1:nrow(chisq)){
      chisq_gene=rbind(chisq_gene, gene_info[which(gene_info$genotype==chisq$genotype[i]),])
    }
    for(i in 1:nrow(or)){
      or_gene=rbind(or_gene, gene_info[which(gene_info$genotype==or$genotype[i]),])
    }
    # chisq_gene <- chisq_gene[order(chisq_gene$geneinfo),];chisq_gene
    # or_gene <- or_gene[order(or_gene$geneinfo),];or_gene
    chisq_gene;or_gene
    length(chisq_gene$geneinfo);length(unique(chisq_gene$geneinfo))
    length(or_gene$geneinfo);length(unique(or_gene$geneinfo))
    write.csv(or_gene, 'or_gene.csv')
    
    
    ## 单倍体检测
    ## 连锁不平衡(LD)检验
    ld_test <- c()
    alpha = 0.001
    id <- as.integer(or_gene$column)
    for(j in 1:nrow(or_gene)){
      # ww = gene_info[gene_info$geneinfo == or_gene[j,2],]
      cc = gene_info$column[gene_info$geneinfo == or_gene[j,2]]
      # ww = ww[-which(cc==id[j])]
      cc = cc[-which(cc==id[j])]
      for(i in cc){
        if(i<id[j]){
          genotype_i = genotype[[i]]
          genotype_j = genotype[,id[j]]
          
        }else{
          genotype_j = genotype[[i]]
          genotype_i = genotype[,id[j]]
    #     genotype_ij = floor(genotype_i/10)*10+floor(genotype_j/10)
    #     temp = (genotype_i%%10)*10+genotype_i%%10
    #     genotype_ij = c(genotype_ij,temp)
        }
        
        a1 = floor(genotype_i/10)
        b1 = floor(genotype_j/10)
        a2 = genotype_i%%10
        b2 = genotype_j%%10
        nij = a1*10+b1
        temp = a2*10+b2
        nij = table(c(nij,temp))
        Aa = table(c(a1,a2))
        Bb = table(c(b1,b2))
        temp = outer(Aa,Bb,'*')
        n = sum(nij)
        mij = c(t(temp))/n
        ##计算检验统计量
        chisq2 = sum((nij-mij)^2/mij)
        l2 = -2*sum(log(mij/nij)*nij)
        chisq2 = pchisq(chisq2, df=1, lower.tail = F)
        l2 = pchisq(l2, df=1, lower.tail = F)
        r2 = (nij[1]-mij[1])^2/(Aa[1]/n*Bb[2]/n*Aa[2]*Bb[1])
        if(chisq2<alpha && l2<alpha && r2>0.22){
          ##bootstrap求CI置信区间
          CI = c()
          for(k in 1:1000){
            samp = sample(1:1000, 1000, replace = TRUE)
            if(i<id[j]){
              genotype_i = genotype[samp,i]
              genotype_j = genotype[samp,id[j]]
              
            }else{
              genotype_j = genotype[samp,i]
              genotype_i = genotype[samp,id[j]]
              #     genotype_ij = floor(genotype_i/10)*10+floor(genotype_j/10)
              #     temp = (genotype_i%%10)*10+genotype_i%%10
              #     genotype_ij = c(genotype_ij,temp)
            }
            
            a1 = floor(genotype_i/10)
            b1 = floor(genotype_j/10)
            a2 = genotype_i%%10
            b2 = genotype_j%%10
            nij = a1*10+b1
            temp = a2*10+b2
            nij = table(c(nij,temp))
            Aa = table(c(a1,a2))
            Bb = table(c(b1,b2))
            temp = outer(Aa,Bb,'*')
            n = sum(nij)
            mij = c(t(temp))/n
            if(nij[1]>mij[1]){
              D1 = (nij[1]-mij[1])*n/min(Aa[1]*Bb[2],Aa[2]*Bb[1])
            }else{
              D1 = (nij[1]-mij[1])*n/min(Aa[1]*Bb[1],Aa[2]*Bb[2])
            }
            CI=c(CI,D1)
          }
          CI = sort(CI)
    #       if(D1){ #(CI[25]>0.7 && CI[975]>0.98)
    #         temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975])
    #         ld_test = rbind(ld_test,temp)
    #         # temp = c(or_gene[j,1],or_gene[j,2],id[j],ww[i,1],ww[i,2],i,chisq2,l2,D1,r2)
    #       }
          temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975])
          ld_test = rbind(ld_test,temp)
          
        }
    #     if(r2>0.2){
    #       temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975])
    #       ld_test = rbind(ld_test,temp)
    #     }
        
        
      }
    }
    #ld_test <- as.data.frame(ld_test)
    #rm(temp)
    colnames(ld_test) <- c('geneinfo','columni','columnj','chisq2_p','l_p','r2','CI.25','CI.975')
    ld_test
    write.csv(ld_test, 'ld_test.csv')
    # colnames(ld_test) <- c('genotype_i','geneinfo_i','column_i','genotype_j',
    #                        'geneinfo_j','column_j','chisq_p','l_p','D','r2')
    
    
    ## 治病基因定位
    ## 卡方检验p值,二倍体
    # id <- c(ld_test$columni[1],ld_test$columnj[which.max(ld_test$r2)])
    # genotype_i = genotype[,id[1]]
    # genotype_j = genotype[,id[2]]
    # genotype_i = floor(genotype_i/10)*10+floor(genotype_j/10)
    # genotype_j = genotype_i%%10*10+genotype_j%%10
    # me = table(c(genotype_i,genotype_j))/2           
    # hb = rep(0,length(me)); names(hb)=names(me)
    # temp = table(c(genotype_i[501:1000],genotype_i[501:1000]))
    # for(j in 1:length(temp)){
    #   hb[names(hb)==names(temp)[j]]=temp[j]
    # }             
    # 
    # jk=me*2-hb
    #             
    # chisq3 = 2*sum((hb-me)^2/me)
    # pvalue = pchisq(chisq3, df=3, lower.tail = F)
    # c(chisq3,pvalue)
    id <- id <- c(ld_test$columni[1],ld_test$columnj[which.max(ld_test$r2)])
    id <- sort(id); leng <- length(id)
    genotype_i = genotype_j = 0
    for(i in 1:leng){
      genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
      genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
    }
    me = table(rbind(genotype_i,genotype_j))/2           
    hb = rep(0,length(me)); names(hb)=names(me)
    temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
    for(j in 1:length(temp)){
      hb[names(hb)==names(temp)[j]]=temp[j]
    }             
    
    jk=me*2-hb
    
    chisq_3 = 2*sum((hb-me)^2/me)
    pvalue = pchisq(chisq_3, df=3, lower.tail = F)
    length(me); c(chisq_3,pvalue)
    
    ## 卡方检验p值,三倍体
    id <- c(ld_test$columni[1],ld_test$columnj[c(3,4)])
    id <- sort(id); leng <- length(id)
    genotype_i = genotype_j = 0
    for(i in 1:leng){
      genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
      genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
    }
    me = table(c(genotype_i,genotype_j))/2           
    hb = rep(0,length(me)); names(hb)=names(me)
    temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
    for(j in 1:length(temp)){
      hb[names(hb)==names(temp)[j]]=temp[j]
    }             
    
    jk=me*2-hb
    df = 2^(leng+1)-2^leng-1
    chisq_3 = 2*sum((hb-me)^2/me)
    pvalue = pchisq(chisq_3, df=df, lower.tail = F)
    length(me); c(chisq_3,pvalue)
    
    ## 卡方检验p值,四倍体
    id <- c(ld_test$columni[1],ld_test$columnj[-2])
    id <- sort(id); leng <- length(id)
    genotype_i = genotype_j = 0
    for(i in 1:leng){
      genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
      genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
    }
    me = table(c(genotype_i,genotype_j))/2           
    hb = rep(0,length(me)); names(hb)=names(me)
    temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
    for(j in 1:length(temp)){
      hb[names(hb)==names(temp)[j]]=temp[j]
    }             
    
    jk=me*2-hb
    df = 2^(leng+1)-2^leng-1
    chisq_3 = 2*sum((hb-me)^2/me)
    pvalue = pchisq(chisq_3, df=df, lower.tail = F)
    length(me); c(chisq_3,pvalue)
    
    ## 卡方检验p值,五倍体
    id <- c(ld_test$columni[1],ld_test$columnj)
    id <- sort(id); leng <- length(id)
    genotype_i = genotype_j = 0
    for(i in 1:leng){
      genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i)
      genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i)
    }
    me = table(c(genotype_i,genotype_j))/2           
    hb = rep(0,length(me)); names(hb)=names(me)
    temp = table(c(genotype_i[501:1000],genotype_j[501:1000]))
    for(j in 1:length(temp)){
      hb[names(hb)==names(temp)[j]]=temp[j]
    }             
    
    jk=me*2-hb
    df = 2^(leng+1)-2^leng-1
    chisq_3 = 2*sum((hb-me)^2/me)
    pvalue = pchisq(chisq_3, df=df, lower.tail = F)
    length(me); c(chisq_3,pvalue)
    
    
    
    ## 优比检验p值
    # genotype_i = floor(genotype_i/10)*10+floor(genotype_j/10)
    # genotype_j = temp
    # me = table(c(genotype_i,genotype_j))/2           
    # hb = rep(0,length(me)); names(hb)=names(me)
    # temp = table(c(genotype_i[501:1000],genotype_i[501:1000]))
    # for(j in 1:length(temp)){
    #   hb[names(hb)==names(temp)[j]]=temp[j]
    # }             
    # 
    # jk=me*2-hb
    # or_3 <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2]))
    # a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2])
    # u2 <- (log(or_3))^2/a
    # pvalue = pchisq(u2, df=1, lower.tail = F)
    # c(or_3,pvalue)              
    #rm(genotype_i,me,hb,temp,jk,chisq,or,a,u2)
      
    

      

  • 相关阅读:
    c#命名空间
    MUTC 2 B Meeting point1 二分
    高斯消元模板
    MUTC 2 C Meeting point2 切比雪夫距离orz
    MUTC 2 E Save the dwarfs DP?
    Uva 10859 Placing Lampposts 树形dp
    Uva 11552 Fewest Flops 字符串dp
    Uva 10891 Game of Sum dp博弈
    MUTC 2 D Matrix 并查集
    Uva 1456 Cellular Network 概率dp
  • 原文地址:https://www.cnblogs.com/iupoint/p/9770066.html
Copyright © 2011-2022 走看看