-
Notifications
You must be signed in to change notification settings - Fork 4.9k
/
repeatedDNASequences.cpp
109 lines (94 loc) · 2.54 KB
/
repeatedDNASequences.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
// Source : https://oj.leetcode.com/problems/repeated-dna-sequences/
// Author : Hao Chen
// Date : 2015-02-09
/**********************************************************************************
*
* All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T,
*
* For example: "ACGAATTCCG".
* When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
*
* Write a function to find all the 10-letter-long sequences (substrings) that
* occur more than once in a DNA molecule.
*
* For example,
*
* Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",
*
* Return:
* ["AAAAACCCCC", "CCCCCAAAAA"].
*
*
**********************************************************************************/
#include <stdlib.h>
#include <iostream>
#include <string>
#include <vector>
#include <functional>
#include <unordered_map>
using namespace std;
const int MAX_LEN = 10;
int ACGT2Int(char ch){
switch(ch){
case 'A': return 0;
case 'C': return 1;
case 'G': return 2;
case 'T': return 3;
}
return -1;
}
int DNASeqs2Int(string &s, int begin){
int result=0;
for(int i=0; i<MAX_LEN; i++){
result = result*4 + ACGT2Int(s[i+begin]);
}
return result;
}
vector<string> findRepeatedDnaSequences_01(string s) {
unordered_map<int, int> stat;
vector<string> result;
for( int i=0; i+MAX_LEN<=s.size(); i++ ){
int hash_code = DNASeqs2Int(s, i);
stat[hash_code]++;
if (stat[hash_code]==2){
result.push_back(s.substr(i, MAX_LEN));
}
}
return result;
}
vector<string> findRepeatedDnaSequences_02(string s) {
unordered_map<size_t, int> stat;
hash<string> hash_func;
vector<string> result;
for( int i=0; i+MAX_LEN<=s.size(); i++ ){
string word = s.substr(i, MAX_LEN);
size_t hash_code = hash_func(word);
stat[hash_code]++;
if (stat[hash_code]==2){
result.push_back(word);
}
}
return result;
}
vector<string> findRepeatedDnaSequences(string s) {
srand(time(0));
if (random()%2){
return findRepeatedDnaSequences_01(s);
}
return findRepeatedDnaSequences_02(s);
}
void printVector( vector<string> v ) {
cout << "[ " ;
for(int i=0; i<v.size(); i++ ){
cout << v[i] << (i<v.size()-1 ? ", " : "");
}
cout << " ]" << endl;
}
int main(int argc, char** argv)
{
string s = "GAGAGAGAGAGAG" ;
if (argc > 1){
s = argv[1];
}
printVector(findRepeatedDnaSequences(s));
}