题意: 输入 n 个 DNA 序列,求出长度最大的字符串,使得它作为连续子串出现在超过一半的序列中。如果有多解,按照字典序从小到大输出所有解;如果无解,输出 "?"。
分析: 这道题的关键是将多个字符串连接成一个串,方法是用互不相同的分隔符把所有原串拼接起来。接下来,就可以求这个新串的后缀数组和 height 数组,然后二分答案:每次只需判断是否有一个长度为 p 的串在超过一半的原串中出现过。判断方法是扫描一遍 height 数组,把它分成若干段:每当 height[i] < p 时,开辟一个新段,并判断之前那一段是否包含了来自超过 n/2 个不同原串的后缀;如果是,那么当前的 p 值满足条件(注意 n = 1 时要特判)。
详见代码:
#include <iostream>
#include <cstdio>
#include <algorithm>
#include <cstring>
#include <map>
#include <vector>
using namespace std;

// Finds the longest substring(s) that occur in more than half of the input
// strings. All inputs are concatenated with pairwise-distinct separator
// characters, a suffix array and LCP ("height") array are built over the
// result, and the answer length is binary-searched.

const int maxn = 104;   // capacity for the number of input strings
const int maxm = 1005;  // capacity for a single input string (with terminator)

char s[maxn*maxm];      // concatenation: str0 sep0 str1 sep1 ... '\0'
int sa[maxn*maxm], t[maxn*maxm], t2[maxn*maxm], c[maxn*maxm];
int N;                  // number of characters in s (terminator excluded)

// Builds the suffix array `sa` of s[0..N-1] by prefix doubling, using a
// counting sort at every round.
// m: exclusive upper bound on the character codes occurring in s.
void build_sa(int m) {
    int* x = t, *y = t2;
    // Round 0: counting sort of the suffixes by their first character.
    for(int i = 0; i < m; i++) c[i] = 0;
    for(int i = 0; i < N; i++) c[x[i] = s[i]]++;
    for(int i = 1; i < m; i++) c[i] += c[i-1];
    for(int i = N-1; i >= 0; i--) sa[--c[x[i]]] = i;
    for(int k = 1; k <= N; k <<= 1) {
        int p = 0;
        // y: suffixes ordered by their second key (rank of the suffix k
        // positions further). Suffixes shorter than k have an empty second
        // key and therefore come first.
        for(int i = N-k; i < N; i++) y[p++] = i;
        for(int i = 0; i < N; i++) if(sa[i] >= k) y[p++] = sa[i] - k;
        // Stable counting sort by the first key.
        for(int i = 0; i < m; i++) c[i] = 0;
        for(int i = 0; i < N; i++) c[x[y[i]]]++;
        for(int i = 1; i < m; i++) c[i] += c[i-1];
        for(int i = N-1; i >= 0; i--) sa[--c[x[y[i]]]] = y[i];
        // Recompute ranks into x; y now holds the previous ranks.
        swap(x, y);
        p = 1;
        x[sa[0]] = 0;
        for(int i = 1; i < N; i++)
            x[sa[i]] = (y[sa[i-1]] == y[sa[i]] && y[sa[i-1]+k] == y[sa[i]+k] ? p-1 : p++);
        if(p >= N) break;   // all ranks distinct: sorting is complete
        m = p;
    }
}

int rnk[maxn*maxm], height[maxn*maxm];

// Fills rnk (inverse of sa) and height, where height[r] is the length of the
// longest common prefix of the suffixes at ranks r-1 and r. height[0] is
// never written. The comparison loop relies on s[N] == '\0'.
void get_height() {
    int k = 0;
    for(int i = 0; i < N; i++) rnk[sa[i]] = i;
    for(int i = 0; i < N; i++) {
        if(!rnk[i]) continue;
        int j = sa[rnk[i]-1];
        if(k) k--;
        while(s[i+k] == s[j+k]) k++;
        height[rnk[i]] = k;
    }
}

int n;                  // number of input strings in the current test case
char s2[maxm];          // buffer for one input string
int sign[maxn];         // sign[i]: position in s of the separator ending string i
int flag[maxn];         // flag[i]: last rank at which a suffix of string i was seen
map<char, int> Map;     // separator characters used in the current test case

// Checks whether some substring of length p occurs in more than n/2 of the
// input strings.
// Ranks are split into maximal groups in which every adjacent pair shares at
// least p leading characters; a group qualifies when its suffixes come from
// more than n/2 distinct strings. For each qualifying group, the start
// position (in s) of its first suffix is appended to `starts`, in rank order.
// Returns true if at least one group qualified.
bool find(int p, vector<int> &starts) {
    // -1 means "not seen yet"; 0 cannot be used because it is a valid rank.
    memset(flag, -1, sizeof flag);
    bool found = false;
    int start = 0;      // rank at which the current group begins
    int cnt = 0;        // distinct source strings seen in the current group
    for(int i = 0; i < N; i++) {
        if(i > 0 && height[i] < p) {
            // Rank i opens a new group; settle the one that just ended.
            if(cnt > n/2) { found = true; starts.push_back(sa[start]); }
            start = i;
            cnt = 0;
        }
        // Suffixes that begin with a separator belong to no string.
        if(!Map.count(s[sa[i]])) {
            // The first separator at or after sa[i] identifies the owner.
            int owner = lower_bound(sign, sign+n, sa[i]) - sign;
            if(flag[owner] < start) cnt++;
            flag[owner] = i;
        }
    }
    if(cnt > n/2) { found = true; starts.push_back(sa[start]); }
    return found;
}

int cnt;                // number of separators generated so far

// Generates a separator and records it in Map: the smallest character code
// >= 1 that is not a lowercase letter and has not been used yet.
// Only 101 such codes exist below 128, so at most 101 calls are valid per
// test case.
char gen_sign() {
    int i = 1;
    for(; i < 128; i++)
        if(!Map.count(static_cast<char>(i)) && (i < 'a' || i > 'z')) break;
    Map[static_cast<char>(i)] = ++cnt;
    return static_cast<char>(i);
}

// Reads test cases until n == 0 (or end of input). For each case prints the
// longest common substrings, one per line in lexicographic order, or "?" if
// there are none; consecutive cases are separated by a blank line.
// The arrays and the separator alphabet are sized for n <= 100.
int main() {
    int tt = 0;
    while(scanf("%d", &n) == 1 && n) {
        if(tt++) puts("");
        if(n == 1) {
            // A single string trivially appears in more than half the inputs.
            if(scanf("%s", s) != 1) return 0;
            printf("%s\n", s);
            continue;
        }
        cnt = 0;
        Map.clear();
        N = 0;
        for(int i = 0; i < n; i++) {
            // The field width matches the size of s2 (maxm - 1 characters).
            if(scanf("%1004s", s2) != 1) return 0;
            strcpy(s+N, s2);
            N += strlen(s2);
            s[N++] = gen_sign();
            sign[i] = N-1;
        }
        s[N] = '\0';
        build_sa(128);  // every character code in s is below 128
        get_height();

        // Binary search for the largest feasible length; 0 means no answer.
        int mlen = 0;
        int L = 1, R = N-1;
        vector<int> best, cur;
        while(R >= L) {
            int M = L + (R-L+1)/2;
            cur.clear();
            if(find(M, cur)) {
                mlen = M;
                best = cur;
                L = M+1;
            }
            else R = M-1;
        }
        if(best.empty()) printf("?\n");
        for(size_t i = 0; i < best.size(); i++)
            printf("%.*s\n", mlen, s + best[i]);
    }
}