## 读取数据 setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件") phenotype <- read.table("phenotype.txt",header = F,encoding = "UTF-8") multi_phenos <- read.table("multi_phenos.txt",header = F,encoding = "UTF-8") genotype <- read.table("genotype.dat",header = T,encoding = "UTF-8",colClasses ='character') # genotype <- read.table("genotype.dat",header = T,encoding = "UTF-8",stringsAsFactors = F) #gene_info <- read.table("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/gene_info/gene_1.dat") setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/gene_info") # filelist <- list.files(pattern=".*.dat") # datalist <- lapply(filelist, function(x) read.table(x, header=F, stringsAsFactors = F)) #datafr <- do.call("rbind", datalist) # gene_info <- c() # for(i in 1:length(filelist)){ # gene_info = rbind(gene_info, data.frame(genotype=datalist[[i]],geneinfo=i)) # } # colnames(gene_info) <- c('genotype','geneinfo') gene_info <- c() for(i in 1:300){ filename = paste('gene_',i,'.dat',sep ="") temp = read.table(filename, header=F, stringsAsFactors = F) gene_info = rbind(gene_info, data.frame(genotype=temp,geneinfo=i)) } colnames(gene_info) <- c('genotype','geneinfo') unique(gene_info$geneinfo) temp = c() for(i in 1:ncol(genotype)){ temp[i] = which(colnames(genotype)==gene_info$genotype[i]) } # gene_info <- gene_info[temp,] gene_info <- cbind(gene_info,data.frame(column=temp)) unique(1:9445-gene_info$column) unique(gene_info$geneinfo) # rownames(gene_info) <- 1:ncol(genotype) # sort(unique(gene_info$geneinfo));length(unique(gene_info$geneinfo)) # head(gene_info) #datafr <- as.data.frame(datafr) ## 数据预处理 ## 数据更正 setwd("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件") aa=as.character(genotype[1,]); unique(aa) unique(c(as.matrix(genotype))) #查看数据类型及异常值检测 ch1 <- c('II','ID','DD') ch2 <- c('TT','TC','CC') (ss=which(aa %in% ch1,arr.ind = T)); colnames(genotype[,ss]);rm(aa) ##定位异常值数据 # ss=which(genotype=='II',arr.ind = T) # ss=unique(ss[,2]) genotype[1:10,ss] #c(6070,8473) for(i in ss){ for(j in 1:3){ genotype[[i]][genotype[[i]]==ch1[j]]=ch2[j] } } genotype[1:10,ss] #修正后数据保存 save.image("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/data.RData") ## 数据编码 (ch1 <- paste(rep(c('A','T','C','G'),4), rep(c('A','T','C','G'),each=4), sep = "")) (ch2 <- paste(rep(1:4,4), rep(1:4,each=4), sep = "")) genotype[1:10,1:10] #c(6070,8473) for(i in 1:9445){ for(j in 1:16){ genotype[[i]][genotype[[i]]==ch1[j]] <- ch2[j] } } genotype[1:10,1:10];class(genotype[[1]]) # genotype=apply(genotype, MARGIN = 2, FUN = as.integer) genotype=apply(as.matrix(genotype), MARGIN = 2, FUN = as.integer) genotype <- as.data.frame(genotype) genotype[1:10,1:10];class(genotype[[1]]) ## 编码后数据保存 save.image("D:/Dasktop/数学建模/国赛2016/2016试题/B/B题附件/sdata.RData") ############################################################################################# #问题二求解 ##临界值法 row=1000;col=9445 alpha=0.999 #c(0.95,0.99,0.999) cc=qchisq(alpha, df=2) wh=c() ## 卡方检验 for(i in 1:col){ genotype_i = genotype[[i]] me = table(genotype_i)/2 hb = rep(0,length(me)); names(hb)=names(me) temp = table(genotype_i[501:1000]) for(j in 1:length(temp)){ hb[names(hb)==names(temp)[j]]=temp[j] } # jk = rep(0,length(aa)); names(jk)=names(me) # temp = table(genotype_i[1:500]) # for(j in 1:length(temp)){ # jk[names(jk)==names(temp)[j]]=temp[j] # } jk=me*2-hb chisq = 2*sum((hb-me)^2/me) if(chisq > cc){ wh=c(wh,i) } } #length(wh);wh;colnames(genotype)[wh] ## 优比检验 cc=qchisq(alpha, df=1) #c(0.95,0.99,0.999) wh1=c() for(i in 1:length(wh)){ genotype_i = genotype[,wh[i]] me = table(genotype_i)/2 hb = rep(0,length(me)); names(hb)=names(me) temp = table(genotype_i[501:1000]) for(j in 1:length(temp)){ hb[names(hb)==names(temp)[j]]=temp[j] } jk=me*2-hb or <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2])) a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2]) u2 <- (log(or))^2/a if(u2 > cc){ wh1=c(wh1,wh[i]) } } #length(wh1);wh[wh1];colnames(genotype)[wh[wh1]] #结果整理导出 wh1=data.frame(column=wh1,genotype=colnames(genotype)[wh1],stringsAsFactors = F) wh=data.frame(column=wh,genotype=colnames(genotype)[wh],stringsAsFactors = F) write.csv(wh, 'wh.csv') write.csv(wh1, 'wh1.csv') ## p值法 row=1000;col=9445 pvalue=pvalue1=MAF=c() for(i in 1:col){ genotype_i = genotype[[i]] me = table(genotype_i)/2 ## 最小等位基因频率MAF MAF[i] = (min(me[1],me[3])+me[2]/2)/sum(me) hb = rep(0,length(me)); names(hb)=names(me) temp = table(genotype_i[501:1000]) for(j in 1:length(temp)){ hb[names(hb)==names(temp)[j]]=temp[j] } #jk = rep(0,length(aa)); names(jk)=names(me) #temp = table(genotype_i[1:500]) #for(j in 1:length(temp)){ # jk[names(jk)==names(temp)[j]]=temp[j] #} jk=me*2-hb ## 卡方检验p值 chisq = 2*sum((hb-me)^2/me) pvalue[i] = pchisq(chisq, df=2, lower.tail = F) ## 优比检验p值 or <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2])) a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2]) u2 <- (log(or))^2/a pvalue1[i] = pchisq(u2, df=1, lower.tail = F) } rm(genotype_i,me,hb,temp,jk,chisq,or,a,u2) ##最小等位基因频率MAF范围 c(min(MAF),max(MAF)) ##结果存储 p_chisq <- data.frame(genotype=colnames(genotype), p_value=pvalue, stringsAsFactors = F) p_or <- data.frame(genotype=colnames(genotype), p_value=pvalue1, stringsAsFactors = F) write.csv(p_chisq, 'p_chisq.csv') write.csv(p_or, 'p_or.csv') ##显著性检验 alpha <- 0.001 (chisq <- p_chisq[p_chisq[[2]]<alpha,]) #p_or[p_or[[2]]<alpha,] temp <- p_or[p_chisq$p_value<alpha,] (or <- temp[temp$p_value<alpha,]) write.csv(chisq, 'chisq.csv') write.csv(or, 'or.csv') ############################################################################################ ##第三问求解 ## 基因定位问题 ## 对于临界值准则 # xx=yy=c() # for(i in 1:nrow(wh)){ # xx[i]=gene_info$geneinfo[which(gene_info$genotype==wh$genotype[i])] # } # for(i in 1:nrow(wh1)){ # yy[i]=gene_info$geneinfo[which(gene_info$genotype==wh1$genotype[i])] # } # names(xx)=wh$genotype;xx # names(yy)=wh1$genotype;yy chisq_gene = or_gene = c() for(i in 1:nrow(wh)){ chisq_gene=rbind(chisq_gene, gene_info[which(gene_info$column==wh$column[i]),]) } for(i in 1:nrow(wh1)){ or_gene=rbind(or_gene, gene_info[which(gene_info$column==wh1$column[i]),]) } chisq_gene;or_gene head(genotype[,chisq_gene$column]) # chisq_gene <- chisq_gene[order(chisq_gene$geneinfo),];chisq_gene # or_gene <- or_gene[order(or_gene$geneinfo),];or_gene length(chisq_gene$geneinfo);length(unique(chisq_gene$geneinfo)) length(or_gene$geneinfo);length(unique(or_gene$geneinfo)) ## 对于p值准则 # xx=yy=c() # for(i in 1:nrow(chisq)){ # xx[i]=gene_info$geneinfo[which(gene_info$genotype==chisq$genotype[i])] # } # for(i in 1:nrow(or)){ # yy[i]=gene_info$geneinfo[which(gene_info$genotype==or$genotype[i])] # } # xx;yy # length(xx);length(unique(xx)) # length(yy);length(unique(yy)) chisq_gene = or_gene = c() for(i in 1:nrow(chisq)){ chisq_gene=rbind(chisq_gene, gene_info[which(gene_info$genotype==chisq$genotype[i]),]) } for(i in 1:nrow(or)){ or_gene=rbind(or_gene, gene_info[which(gene_info$genotype==or$genotype[i]),]) } # chisq_gene <- chisq_gene[order(chisq_gene$geneinfo),];chisq_gene # or_gene <- or_gene[order(or_gene$geneinfo),];or_gene chisq_gene;or_gene length(chisq_gene$geneinfo);length(unique(chisq_gene$geneinfo)) length(or_gene$geneinfo);length(unique(or_gene$geneinfo)) write.csv(or_gene, 'or_gene.csv') ## 单倍体检测 ## 连锁不平衡(LD)检验 ld_test <- c() alpha = 0.001 id <- as.integer(or_gene$column) for(j in 1:nrow(or_gene)){ # ww = gene_info[gene_info$geneinfo == or_gene[j,2],] cc = gene_info$column[gene_info$geneinfo == or_gene[j,2]] # ww = ww[-which(cc==id[j])] cc = cc[-which(cc==id[j])] for(i in cc){ if(i<id[j]){ genotype_i = genotype[[i]] genotype_j = genotype[,id[j]] }else{ genotype_j = genotype[[i]] genotype_i = genotype[,id[j]] # genotype_ij = floor(genotype_i/10)*10+floor(genotype_j/10) # temp = (genotype_i%%10)*10+genotype_i%%10 # genotype_ij = c(genotype_ij,temp) } a1 = floor(genotype_i/10) b1 = floor(genotype_j/10) a2 = genotype_i%%10 b2 = genotype_j%%10 nij = a1*10+b1 temp = a2*10+b2 nij = table(c(nij,temp)) Aa = table(c(a1,a2)) Bb = table(c(b1,b2)) temp = outer(Aa,Bb,'*') n = sum(nij) mij = c(t(temp))/n ##计算检验统计量 chisq2 = sum((nij-mij)^2/mij) l2 = -2*sum(log(mij/nij)*nij) chisq2 = pchisq(chisq2, df=1, lower.tail = F) l2 = pchisq(l2, df=1, lower.tail = F) r2 = (nij[1]-mij[1])^2/(Aa[1]/n*Bb[2]/n*Aa[2]*Bb[1]) if(chisq2<alpha && l2<alpha && r2>0.22){ ##bootstrap求CI置信区间 CI = c() for(k in 1:1000){ samp = sample(1:1000, 1000, replace = TRUE) if(i<id[j]){ genotype_i = genotype[samp,i] genotype_j = genotype[samp,id[j]] }else{ genotype_j = genotype[samp,i] genotype_i = genotype[samp,id[j]] # genotype_ij = floor(genotype_i/10)*10+floor(genotype_j/10) # temp = (genotype_i%%10)*10+genotype_i%%10 # genotype_ij = c(genotype_ij,temp) } a1 = floor(genotype_i/10) b1 = floor(genotype_j/10) a2 = genotype_i%%10 b2 = genotype_j%%10 nij = a1*10+b1 temp = a2*10+b2 nij = table(c(nij,temp)) Aa = table(c(a1,a2)) Bb = table(c(b1,b2)) temp = outer(Aa,Bb,'*') n = sum(nij) mij = c(t(temp))/n if(nij[1]>mij[1]){ D1 = (nij[1]-mij[1])*n/min(Aa[1]*Bb[2],Aa[2]*Bb[1]) }else{ D1 = (nij[1]-mij[1])*n/min(Aa[1]*Bb[1],Aa[2]*Bb[2]) } CI=c(CI,D1) } CI = sort(CI) # if(D1){ #(CI[25]>0.7 && CI[975]>0.98) # temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975]) # ld_test = rbind(ld_test,temp) # # temp = c(or_gene[j,1],or_gene[j,2],id[j],ww[i,1],ww[i,2],i,chisq2,l2,D1,r2) # } temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975]) ld_test = rbind(ld_test,temp) } # if(r2>0.2){ # temp = data.frame(or_gene[j,2],id[j],i,chisq2,l2,r2,CI[25],CI[975]) # ld_test = rbind(ld_test,temp) # } } } #ld_test <- as.data.frame(ld_test) #rm(temp) colnames(ld_test) <- c('geneinfo','columni','columnj','chisq2_p','l_p','r2','CI.25','CI.975') ld_test write.csv(ld_test, 'ld_test.csv') # colnames(ld_test) <- c('genotype_i','geneinfo_i','column_i','genotype_j', # 'geneinfo_j','column_j','chisq_p','l_p','D','r2') ## 治病基因定位 ## 卡方检验p值,二倍体 # id <- c(ld_test$columni[1],ld_test$columnj[which.max(ld_test$r2)]) # genotype_i = genotype[,id[1]] # genotype_j = genotype[,id[2]] # genotype_i = floor(genotype_i/10)*10+floor(genotype_j/10) # genotype_j = genotype_i%%10*10+genotype_j%%10 # me = table(c(genotype_i,genotype_j))/2 # hb = rep(0,length(me)); names(hb)=names(me) # temp = table(c(genotype_i[501:1000],genotype_i[501:1000])) # for(j in 1:length(temp)){ # hb[names(hb)==names(temp)[j]]=temp[j] # } # # jk=me*2-hb # # chisq3 = 2*sum((hb-me)^2/me) # pvalue = pchisq(chisq3, df=3, lower.tail = F) # c(chisq3,pvalue) id <- id <- c(ld_test$columni[1],ld_test$columnj[which.max(ld_test$r2)]) id <- sort(id); leng <- length(id) genotype_i = genotype_j = 0 for(i in 1:leng){ genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i) genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i) } me = table(rbind(genotype_i,genotype_j))/2 hb = rep(0,length(me)); names(hb)=names(me) temp = table(c(genotype_i[501:1000],genotype_j[501:1000])) for(j in 1:length(temp)){ hb[names(hb)==names(temp)[j]]=temp[j] } jk=me*2-hb chisq_3 = 2*sum((hb-me)^2/me) pvalue = pchisq(chisq_3, df=3, lower.tail = F) length(me); c(chisq_3,pvalue) ## 卡方检验p值,三倍体 id <- c(ld_test$columni[1],ld_test$columnj[c(3,4)]) id <- sort(id); leng <- length(id) genotype_i = genotype_j = 0 for(i in 1:leng){ genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i) genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i) } me = table(c(genotype_i,genotype_j))/2 hb = rep(0,length(me)); names(hb)=names(me) temp = table(c(genotype_i[501:1000],genotype_j[501:1000])) for(j in 1:length(temp)){ hb[names(hb)==names(temp)[j]]=temp[j] } jk=me*2-hb df = 2^(leng+1)-2^leng-1 chisq_3 = 2*sum((hb-me)^2/me) pvalue = pchisq(chisq_3, df=df, lower.tail = F) length(me); c(chisq_3,pvalue) ## 卡方检验p值,四倍体 id <- c(ld_test$columni[1],ld_test$columnj[-2]) id <- sort(id); leng <- length(id) genotype_i = genotype_j = 0 for(i in 1:leng){ genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i) genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i) } me = table(c(genotype_i,genotype_j))/2 hb = rep(0,length(me)); names(hb)=names(me) temp = table(c(genotype_i[501:1000],genotype_j[501:1000])) for(j in 1:length(temp)){ hb[names(hb)==names(temp)[j]]=temp[j] } jk=me*2-hb df = 2^(leng+1)-2^leng-1 chisq_3 = 2*sum((hb-me)^2/me) pvalue = pchisq(chisq_3, df=df, lower.tail = F) length(me); c(chisq_3,pvalue) ## 卡方检验p值,五倍体 id <- c(ld_test$columni[1],ld_test$columnj) id <- sort(id); leng <- length(id) genotype_i = genotype_j = 0 for(i in 1:leng){ genotype_i = genotype_i + floor(genotype[,id[i]]/10)*10^(leng-i) genotype_j = genotype_j + (genotype[,id[i]]%%10)*10^(leng-i) } me = table(c(genotype_i,genotype_j))/2 hb = rep(0,length(me)); names(hb)=names(me) temp = table(c(genotype_i[501:1000],genotype_j[501:1000])) for(j in 1:length(temp)){ hb[names(hb)==names(temp)[j]]=temp[j] } jk=me*2-hb df = 2^(leng+1)-2^leng-1 chisq_3 = 2*sum((hb-me)^2/me) pvalue = pchisq(chisq_3, df=df, lower.tail = F) length(me); c(chisq_3,pvalue) ## 优比检验p值 # genotype_i = floor(genotype_i/10)*10+floor(genotype_j/10) # genotype_j = temp # me = table(c(genotype_i,genotype_j))/2 # hb = rep(0,length(me)); names(hb)=names(me) # temp = table(c(genotype_i[501:1000],genotype_i[501:1000])) # for(j in 1:length(temp)){ # hb[names(hb)==names(temp)[j]]=temp[j] # } # # jk=me*2-hb # or_3 <- ((hb[1]*2+hb[2])*(jk[3]*2+jk[2]))/((hb[3]*2+hb[2])*(jk[1]*2+jk[2])) # a <- 1/(hb[1]*2+hb[2])+1/(hb[3]*2+hb[2])+1/(jk[1]*2+jk[2])+1/(jk[3]*2+jk[2]) # u2 <- (log(or_3))^2/a # pvalue = pchisq(u2, df=1, lower.tail = F) # c(or_3,pvalue) #rm(genotype_i,me,hb,temp,jk,chisq,or,a,u2)