Skip to content

Commit

Permalink
Oct 12, 2024: IUPAC allowed in -p
Browse files Browse the repository at this point in the history
  • Loading branch information
AldhairMedico committed Oct 12, 2024
1 parent 4eae9f5 commit dd86e72
Show file tree
Hide file tree
Showing 7 changed files with 172 additions and 64 deletions.
2 changes: 1 addition & 1 deletion gfalibs
9 changes: 9 additions & 0 deletions include/functions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#ifndef FUNCTIONS_H
#define FUNCTIONS_H

#include <vector>
#include <string>

void generate_combinations(const std::string &pattern, std::string &current, size_t index, std::vector<std::string> &combinations);

#endif // FUNCTIONS_H
2 changes: 2 additions & 0 deletions include/input.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ struct UserInputTeloscope : UserInput {
uint32_t windowSize = 1000;
uint8_t kmerLen = 21;
uint32_t step = 500;
std::pair<std::string, std::string> canonicalPatterns;

double maxMem = 0;
std::string prefix = ".", outFile = "";
bool keepWindowData = false; // Memory intensive
Expand Down
1 change: 1 addition & 0 deletions include/main.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,6 @@
#include "input-gfa.h"

#include "input.h" // was in Mac's code
#include "functions.h"

#endif /* MAIN_H */
36 changes: 36 additions & 0 deletions src/functions.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include "functions.h"
#include <unordered_map>

std::unordered_map<char, std::vector<char>> IUPAC_DNA_map = {
{'A', {'A'}},
{'C', {'C'}},
{'G', {'G'}},
{'T', {'T'}},
{'R', {'A', 'G'}},
{'Y', {'C', 'T'}},
{'M', {'A', 'C'}},
{'K', {'G', 'T'}},
{'S', {'C', 'G'}},
{'W', {'A', 'T'}},
{'H', {'A', 'C', 'T'}},
{'B', {'C', 'G', 'T'}},
{'V', {'A', 'C', 'G'}},
{'D', {'A', 'G', 'T'}},
{'N', {'A', 'C', 'G', 'T'}}
};


void generate_combinations(const std::string &pattern, std::string &current, size_t index, std::vector<std::string> &combinations) {
if (index == pattern.size()) {
combinations.push_back(current);
return;
}

char base = pattern[index];
for (char c : IUPAC_DNA_map[base]) {
current[index] = c;
generate_combinations(pattern, current, index + 1, combinations);
}
}


156 changes: 107 additions & 49 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,92 +118,118 @@ int main(int argc, char **argv) {
break;


case 'o':
{
outRoute = optarg;

if (outRoute.empty()) {
fprintf(stderr, "Error: Output route is required. Use --output or -o to specify it.\n"); // Jack: we have to define output as default
exit(EXIT_FAILURE);
}
}
break;


case 'j': // max threads
maxThreads = atoi(optarg);
userInput.stats_flag = 1;
break;


// case 'c': // canonical pattern
// userInput.canPatterns = optarg;
// break;


case 'm': {
std::istringstream modeStream(optarg);
std::string mode;
std::set<std::string> providedModes;

while (std::getline(modeStream, mode, ',')) {
if (mode.empty()) continue;

if (std::any_of(mode.begin(), mode.end(), ::isdigit)) {
std::cerr << "Error: Mode '" << mode << "' contains numerical characters.\n";
exit(EXIT_FAILURE);
}
case 'c': { // Handle canonical pattern
std::string canonicalPattern = optarg;
unmaskSequence(canonicalPattern);

std::transform(mode.begin(), mode.end(), mode.begin(), ::tolower);
if (canonicalPattern.empty()) {
canonicalPattern = "TTAGGG";
}

if (mode == "all") {
providedModes = {"match", "entropy", "gc"};
break;
} else {
providedModes.insert(mode);
}
// Check for numerical characters
if (std::any_of(canonicalPattern.begin(), canonicalPattern.end(), ::isdigit)) {
std::cerr << "Error: Canonical pattern '" << canonicalPattern << "' contains numerical characters.\n";
exit(EXIT_FAILURE);
}

// Only disable modes that were not provided
if (providedModes.find("match") == providedModes.end()) userInput.modeMatch = false;
if (providedModes.find("entropy") == providedModes.end()) userInput.modeEntropy = false;
if (providedModes.find("gc") == providedModes.end()) userInput.modeGC = false;
// Store canonical pattern and its reverse complement
userInput.canonicalPatterns.first = canonicalPattern;
userInput.canonicalPatterns.second = revCom(canonicalPattern);

break;
std::cout << "Setting canonical pattern: " << canonicalPattern << " and its reverse complement: " << userInput.canonicalPatterns.second << "\n";
}
break;


case 'o':
{
outRoute = optarg;
// case 'p':
// {
// std::istringstream patternStream(optarg);
// std::string pattern;

if (outRoute.empty()) {
fprintf(stderr, "Error: Output route is required. Use --output or -o to specify it.\n"); // Jack: we have to define output as default
exit(EXIT_FAILURE);
}
}
break;

// while (std::getline(patternStream, pattern, ',')) {
// if (pattern.empty()) continue;

// if (std::any_of(pattern.begin(), pattern.end(), ::isdigit)) {
// std::cerr << "Error: Pattern '" << pattern << "' contains numerical characters.\n";
// exit(EXIT_FAILURE);
// }

// unmaskSequence(pattern);

// std::cout << "Adding pattern: " << pattern << " and its reverse complement" << "\n";
// userInput.patterns.emplace_back(pattern);
// userInput.patterns.emplace_back(revCom(pattern));
// }

// if (userInput.patterns.empty()) {
// userInput.patterns = {"TTAGGG", "CCCTAA"};
// std::cout << "No patterns provided. Only scanning for canonical patterns: TTAGGG, CCCTAA" << "\n";
// } else {
// // Remove duplicates
// std::sort(userInput.patterns.begin(), userInput.patterns.end());
// auto last = std::unique(userInput.patterns.begin(), userInput.patterns.end());
// userInput.patterns.erase(last, userInput.patterns.end());
// }
// }
// break;

case 'p':
{
std::istringstream patternStream(optarg);
std::string pattern;

while (std::getline(patternStream, pattern, ',')) {
if (pattern.empty()) continue;

if (std::any_of(pattern.begin(), pattern.end(), ::isdigit)) {
std::cerr << "Error: Pattern '" << pattern << "' contains numerical characters.\n";
exit(EXIT_FAILURE);
}

unmaskSequence(pattern);

std::cout << "Adding pattern: " << pattern << " and its reverse complement" << "\n";
userInput.patterns.emplace_back(pattern);
userInput.patterns.emplace_back(revCom(pattern));

// Generate all combinations for the pattern based on IUPAC codes
std::vector<std::string> combinations;
std::string current_pattern = pattern;
generate_combinations(pattern, current_pattern, 0, combinations);

// Add each combination and its reverse complement to userInput.patterns
for (const std::string &comb : combinations) {
std::cout << "Adding pattern: " << comb << " and its reverse complement" << "\n";
userInput.patterns.emplace_back(comb);
userInput.patterns.emplace_back(revCom(comb));
}
}

if (userInput.patterns.empty()) {
userInput.patterns = {"TTAGGG", "CCCTAA"};
std::cout << "No patterns provided. Using canonical patterns: TTAGGG, CCCTAA" << "\n";
std::cout << "No patterns provided. Only scanning for canonical patterns: TTAGGG, CCCTAA" << "\n";
} else {
// Remove duplicates
std::sort(userInput.patterns.begin(), userInput.patterns.end());
auto last = std::unique(userInput.patterns.begin(), userInput.patterns.end());
userInput.patterns.erase(last, userInput.patterns.end());
}
}
break;
break;


case 'w':
Expand All @@ -230,6 +256,38 @@ int main(int argc, char **argv) {
break;


case 'm': {
std::istringstream modeStream(optarg);
std::string mode;
std::set<std::string> providedModes;

while (std::getline(modeStream, mode, ',')) {
if (mode.empty()) continue;

if (std::any_of(mode.begin(), mode.end(), ::isdigit)) {
std::cerr << "Error: Mode '" << mode << "' contains numerical characters.\n";
exit(EXIT_FAILURE);
}

std::transform(mode.begin(), mode.end(), mode.begin(), ::tolower);

if (mode == "all") {
providedModes = {"match", "entropy", "gc"};
break;
} else {
providedModes.insert(mode);
}
}

// Only disable modes that were not provided
if (providedModes.find("match") == providedModes.end()) userInput.modeMatch = false;
if (providedModes.find("entropy") == providedModes.end()) userInput.modeEntropy = false;
if (providedModes.find("gc") == providedModes.end()) userInput.modeGC = false;

break;
}


case 'v': // software version
printf("/// Teloscope v%s\n", version.c_str());
printf("\nDeveloped by:\nGiulio Formenti [email protected]\n");
Expand Down
30 changes: 16 additions & 14 deletions src/teloscope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ std::vector<TelomereBlock> Teloscope::mergeTelomereBlocks(const std::vector<Telo
uint64_t currentEnd = currentBlock.start + currentBlock.blockLen;
uint64_t distance = nextBlock.start - currentEnd;

if (distance <= D) {
if (distance <= 2 * D) {
// Merge the blocks by extending the current block
uint64_t newEnd = nextBlock.start + nextBlock.blockLen;
currentBlock.blockLen = newEnd - currentBlock.start;
Expand Down Expand Up @@ -225,7 +225,8 @@ void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, W

if (current->isEndOfWord) {
std::string pattern = window.substr(i, j - i + 1);
bool isCanonical = (pattern == "TTAGGG" || pattern == "CCCTAA"); // Check canonical patterns
// bool isCanonical = (pattern == "TTAGGG" || pattern == "CCCTAA"); // Check canonical patterns
bool isCanonical = (pattern == userInput.canonicalPatterns.first || pattern == userInput.canonicalPatterns.second); // Check canonical patterns

// Update windowData from prevOverlapData
if (j >= overlapSize || overlapSize == 0 || windowStart == 0 ) {
Expand Down Expand Up @@ -343,16 +344,25 @@ void Teloscope::writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcConten
std::unordered_map<std::string, std::ofstream>& patternCountFiles,
std::unordered_map<std::string, std::ofstream>& patternDensityFiles,
std::ofstream& telomereBlocksFile) {

if (!userInput.keepWindowData) { // If windowData is not stored, return
return;
}

for (const auto& pathData : allPathData) {
// const auto& seqPos = pathData.seqPos;
const auto& header = pathData.header;
const auto& windows = pathData.windows;

// Process telomere blocks
for (const auto& [groupName, blocks] : pathData.mergedBlocks) {
for (const auto& block : blocks) {
uint64_t blockEnd = block.start + block.blockLen;
telomereBlocksFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << groupName << "\n";
}
}

if (!userInput.keepWindowData) { // If windowData is not stored, return
continue;
}

// Process window data
for (const auto& window : windows) {
totalNWindows++; // Update total window count
uint32_t windowEnd = window.windowStart + window.currentWindowSize; // Start is already 0-based
Expand Down Expand Up @@ -394,14 +404,6 @@ void Teloscope::writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcConten
}
}
}

// Handle telomere blocks
for (const auto& [groupName, blocks] : pathData.mergedBlocks) {
for (const auto& block : blocks) {
uint64_t blockEnd = block.start + block.blockLen;
telomereBlocksFile << header << "\t" << block.start << "\t" << blockEnd << "\t" << groupName << "\n";
}
}
}
}

Expand Down

0 comments on commit dd86e72

Please sign in to comment.