All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
Example:
Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT" Output: ["AAAAACCCCC", "CCCCCAAAAA"]
所有的DNA都是由一系列核苷酸组成的,简写为 A、C、G 和 T,比如:"ACGAATTCCG"。当研究DNA时,识别DNA里的重复子序列有时是很有帮助的。写一个函数,找出DNA分子中所有出现超过一次的10个字母长的子序列(子串)。
解法1:hash table + hash set
解法2: hash set
解法3:hash table + bit manipulation
Java:
/**
 * Collects every 10-letter substring of {@code s} that appears more than once.
 *
 * @param s the DNA string to scan; may be {@code null}
 * @return the distinct repeated 10-letter substrings, in no particular order;
 *         an empty list when {@code s} is {@code null} or too short to hold one
 */
public List<String> findRepeatedDnaSequences(String s) {
    if (s == null || s.length() < 2) {
        return new ArrayList<>();
    }

    final Set<String> seenWindows = new HashSet<>();
    final Set<String> repeatedWindows = new HashSet<>();

    // Slide a 10-character window over the string; strings shorter than 10
    // never enter the loop and fall through to an empty result.
    int start = 0;
    while (start + 10 <= s.length()) {
        final String window = s.substring(start, start + 10);
        // add() reports false when the window has been seen before.
        if (!seenWindows.add(window)) {
            repeatedWindows.add(window);
        }
        start++;
    }
    return new ArrayList<>(repeatedWindows);
}
Java:
/**
 * Finds every 10-letter substring that occurs more than once in {@code s}.
 *
 * @param s the DNA string to scan; {@code null} is treated like an empty string
 * @return the distinct repeated 10-letter substrings, in no particular order
 */
public List<String> findRepeatedDnaSequences(String s) {
    if (s == null) {
        return new ArrayList<>();
    }
    Set<String> seen = new HashSet<>();
    Set<String> repeated = new HashSet<>();
    for (int i = 0; i + 9 < s.length(); i++) {
        String ten = s.substring(i, i + 10);
        // add() returns false when the window was already in the set, i.e. it is a repeat.
        if (!seen.add(ten)) {
            repeated.add(ten);
        }
    }
    return new ArrayList<>(repeated);
}
Java: hash set + bit manipulation
public List<String> findRepeatedDnaSequences(String s) { Set<Integer> words = new HashSet<>(); Set<Integer> doubleWords = new HashSet<>(); List<String> rv = new ArrayList<>(); char[] map = new char[26]; //map['A' - 'A'] = 0; map['C' - 'A'] = 1; map['G' - 'A'] = 2; map['T' - 'A'] = 3; for(int i = 0; i < s.length() - 9; i++) { int v = 0; for(int j = i; j < i + 10; j++) { v <<= 2; v |= map[s.charAt(j) - 'A']; } if(!words.add(v) && doubleWords.add(v)) { rv.add(s.substring(i, i + 10)); } } return rv; }
Python:
class Solution(object):
    def findRepeatedDnaSequences(self, s):
        """
        Return every 10-letter substring of s that occurs more than once,
        in the order in which each one's second occurrence is encountered.

        Each window is identified by a 30-bit rolling hash built from the low
        3 bits of every character's code (A -> 1, C -> 3, G -> 7, T -> 4).
        Other characters may share a 3-bit code, so the result is only
        reliable for input made up of A, C, G and T.

        :type s: str
        :rtype: List[str]
        """
        # seen maps a window hash to True (seen once, not yet reported)
        # or False (already reported as a repeat).
        seen, rolling_hash, res = {}, 0, []
        for i in range(len(s)):
            # Shift in the new character and keep only the last 10 (30 bits).
            rolling_hash = ((rolling_hash << 3) & 0x3fffffff) | (ord(s[i]) & 7)
            if i < 9:
                # The window is not full yet; nothing to record.
                continue
            if rolling_hash not in seen:
                seen[rolling_hash] = True
            elif seen[rolling_hash]:
                res.append(s[i - 9: i + 1])
                seen[rolling_hash] = False
        return res
Python:
def findRepeatedDnaSequences2(self, s):
    """
    Return the 10-letter substrings of s that occur more than once,
    found by counting every window with collections.Counter.

    :type s: str
    :rtype: List[str]
    """
    width = 10
    if len(s) < width:
        return []

    # Count each 10-character window, visiting start positions left to right.
    counts = collections.Counter(
        s[start:start + width] for start in range(len(s) - width + 1)
    )

    repeated = []
    for sequence, occurrences in counts.items():
        if occurrences > 1:
            repeated.append(sequence)
    return repeated
C++:
class Solution {
public:
    // Returns every 10-letter substring of s that occurs more than once, in the
    // order in which each one's second occurrence is encountered. Each window is
    // packed into an int (2 bits per nucleotide, 20 bits in total) so the sets
    // hold integers rather than strings.
    vector<string> findRepeatedDnaSequences(string s) {
        // 2-bit code per nucleotide, indexed by (letter - 'A'); 'A' stays 0.
        int code[26] = {0};
        code['C' - 'A'] = 1;
        code['G' - 'A'] = 2;
        code['T' - 'A'] = 3;

        unordered_set<int> once;   // windows seen at least once
        unordered_set<int> twice;  // windows already reported as repeated
        vector<string> repeats;

        const int n = static_cast<int>(s.size());
        for (int start = 0; start <= n - 10; ++start) {
            // Encode the window beginning at start.
            int key = 0;
            for (int k = 0; k < 10; ++k) {
                key = (key << 2) | code[s[start + k] - 'A'];
            }
            // insert().second is false when the key was already present, so a
            // window is reported exactly once: on its second occurrence.
            if (!once.insert(key).second && twice.insert(key).second) {
                repeats.push_back(s.substr(start, 10));
            }
        }
        return repeats;
    }
};