Skip to content

Commit

Permalink
Sep 18, 2024: Header cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
AldhairMedico committed Sep 18, 2024
1 parent 53c302a commit 5907084
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 147 deletions.
148 changes: 3 additions & 145 deletions include/teloscope.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
#include <memory>
#include <unordered_map>

// std::string removeCarriageReturns(const std::string& input);

class Trie {
struct TrieNode {
std::unordered_map<char, std::shared_ptr<TrieNode>> children;
Expand Down Expand Up @@ -106,7 +104,6 @@ class Teloscope {
}
}


bool walkPath(InPath* path, std::vector<InSegment*> &inSegments, std::vector<InGap> &inGaps);

void analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData);
Expand All @@ -117,153 +114,14 @@ class Teloscope {

void sortWindowsBySeqPos();

void annotateTelomeres(); // CHECK

// Writes the stored per-window metrics to the supplied BED/BEDgraph streams.
// Returns immediately when userInput.keepWindowData is false.
// Rows are tab-separated: header, start, end, value (or the pattern for match rows).
// Side effects: increments totalNWindows and patternCounts, appends to
// entropyValues / gcContentValues, and calls annotateTelomeres() once per
// allWindows entry.
void writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
// std::ofstream& telomereBEDFile, std::ofstream& telomereCountFile, // CHECK
std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
std::unordered_map<std::string, std::ofstream>& patternCountFiles,
std::unordered_map<std::string, std::ofstream>& patternDensityFiles) {

if (!userInput.keepWindowData) { // If windowData is not stored, return
return;
}

for (const auto& windowData : allWindows) {
// The std::tie assignment below copies the header string and the whole
// window vector out of the (const) tuple; seqPos is unpacked but not used.
unsigned int seqPos;
std::string header;
std::vector<WindowData> windows;
std::tie(seqPos, header, windows) = windowData; // Unpack the tuple

for (const auto& window : windows) {
totalNWindows++; // Update total window count
uint32_t windowEnd = window.windowStart + window.currentWindowSize; // Start is already 0-based

// Write window Shannon entropy if enabled
if (userInput.modeEntropy) {
shannonFile << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< window.shannonEntropy << "\n";
entropyValues.push_back(window.shannonEntropy); // Update entropy values
}

// Write window GC content if enabled
if (userInput.modeGC) {
gcContentFile << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< window.gcContent << "\n";
gcContentValues.push_back(window.gcContent);
}

// Write pattern data if enabled
if (userInput.modeMatch) {
for (const auto& [pattern, data] : window.patternMap) {
// One BED row per match; pos is added to windowStart to get the row's start coordinate.
for (auto pos : data.wMatches) {
patternMatchFiles[pattern] << header << "\t"
<< window.windowStart + pos << "\t"
<< window.windowStart + pos + pattern.length() << "\t" // Start is already 0-based
<< pattern << "\n";
patternCounts[pattern]++; // Update total pattern counts
}

// One row per window and pattern with the match count.
patternCountFiles[pattern] << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< data.count << "\n";

// One row per window and pattern with the match density.
patternDensityFiles[pattern] << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< data.density << "\n";
}
}
}

// Runs after all windows of the current allWindows entry have been written.
annotateTelomeres(); // CHECK

}
}


// Opens the output streams for the enabled modes under outRoute, delegates the
// actual writing to writeBEDFile(), then closes every stream it opened.
void handleBEDFile() {
std::ofstream shannonFile;
std::ofstream gcContentFile;
// The two telomere streams are declared but every use of them below is commented out.
std::ofstream telomereBEDFile; // CHECK
std::ofstream telomereCountFile; // CHECK
std::unordered_map<std::string, std::ofstream> patternMatchFiles; // Jack: replace with vector to improve cache locality?
std::unordered_map<std::string, std::ofstream> patternCountFiles;
std::unordered_map<std::string, std::ofstream> patternDensityFiles;
std::cout << "Reporting window matches and metrics in BED/BEDgraphs...\n";

// Open files once if their modes are enabled
if (userInput.modeEntropy) {
shannonFile.open(outRoute + "/shannonEntropy.bedgraph");
}

if (userInput.modeGC) {
gcContentFile.open(outRoute + "/gcContent.bedgraph");
}
// NOTE(review): the next line looks like a stray declaration fragment, not part of this body - confirm.
std::unordered_map<std::string, std::ofstream>& patternDensityFiles);

if (userInput.modeMatch) {
// telomereBEDFile.open(outRoute + "/telomere_blocks.bed"); // CHECK
// telomereCountFile.open(outRoute + "/telomere_block_counts.txt"); // CHECK
// NOTE(review): the next line looks like a stray declaration, not part of this body - confirm.
void handleBEDFile();

// Three output files per user-supplied pattern: matches, counts and densities.
for (const auto& pattern : userInput.patterns) {
patternMatchFiles[pattern].open(outRoute + "/" + pattern + "_matches.bed");
patternCountFiles[pattern].open(outRoute + "/" + pattern + "_count.bedgraph");
patternDensityFiles[pattern].open(outRoute + "/" + pattern + "_density.bedgraph");
}
}

// Write data for each window
writeBEDFile(shannonFile, gcContentFile,
// telomereBEDFile, telomereCountFile, // CHECK
patternMatchFiles, patternCountFiles, patternDensityFiles);

// Close all files once
if (userInput.modeEntropy) {
shannonFile.close();
}
if (userInput.modeGC) {
gcContentFile.close();
}
if (userInput.modeMatch) {
// telomereBEDFile.close(); // CHECK
// telomereCountFile.close(); // CHECK

for (auto& [pattern, file] : patternMatchFiles) {
file.close();
}
for (auto& [pattern, file] : patternCountFiles) {
file.close();
}
for (auto& [pattern, file] : patternDensityFiles) {
file.close();
}
}
}


// Prints the summary report to std::cout: the total window count, the match
// total recorded for each pattern in patternCounts and, when
// userInput.keepWindowData is set, max/mean/median/min of the collected
// Shannon entropy and GC content values.
void printSummary() {
std::cout << "\n+++Summary Report+++\n";
std::cout << "Total windows analyzed:\t" << totalNWindows << "\n";
std::cout << "Total input patterns found:\n";
for (const auto& [pattern, count] : patternCounts) {
std::cout << "Pattern:\t" << pattern << "\t" << count << "\n";
}

// For each pattern, print the path header with the highest number of matches - PENDING
// For each pattern, print the path header with the lowest number of matches - PENDING
// The statistics are only printed when window data was kept.
if (userInput.keepWindowData) {
std::cout << "Max Shannon Entropy:\t" << getMax(entropyValues) << "\n";
std::cout << "Mean Shannon Entropy:\t" << getMean(entropyValues) << "\n";
std::cout << "Median Shannon Entropy:\t" << getMedian(entropyValues) << "\n";
std::cout << "Min Shannon Entropy:\t" << getMin(entropyValues) << "\n";

std::cout << "Max GC Content:\t" << getMax(gcContentValues) << "\n";
std::cout << "Mean GC Content:\t" << getMean(gcContentValues) << "\n";
std::cout << "Median GC Content:\t" << getMedian(gcContentValues) << "\n";
std::cout << "Min GC Content:\t" << getMin(gcContentValues) << "\n";
}
}
void printSummary();
};

#endif // TELOSCOPE_H
140 changes: 138 additions & 2 deletions src/teloscope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,143 @@ std::vector<WindowData> Teloscope::analyzeSegment(std::string &sequence, UserInp
return windows;
}

void Teloscope::annotateTelomeres() {
// uint8_t d = userInput.blockDistance;

// Writes the stored per-window metrics to the BED/BEDgraph output streams.
//
// Does nothing unless userInput.keepWindowData is set. For every window of
// every entry in allWindows, one tab-separated row is written per enabled mode:
//   - modeEntropy: header, start, end, Shannon entropy -> shannonFile
//   - modeGC:      header, start, end, GC content      -> gcContentFile
//   - modeMatch:   header, start, end, pattern (one row per match)
//                                                      -> patternMatchFiles[pattern]
//                  header, start, end, count           -> patternCountFiles[pattern]
//                  header, start, end, density         -> patternDensityFiles[pattern]
//
// Side effects on the object: increments totalNWindows per window, appends to
// entropyValues / gcContentValues, and adds every match to patternCounts.
//
// Parameters:
//   shannonFile, gcContentFile - streams for the per-window entropy and GC rows.
//   patternMatchFiles, patternCountFiles, patternDensityFiles - per-pattern
//       streams keyed by pattern. They are accessed with operator[], so a key
//       that is missing gets a default-constructed (unopened) stream.
void Teloscope::writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
                             std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
                             std::unordered_map<std::string, std::ofstream>& patternCountFiles,
                             std::unordered_map<std::string, std::ofstream>& patternDensityFiles) {

    if (!userInput.keepWindowData) { // If windowData is not stored, return
        return;
    }

    for (const auto& pathEntry : allWindows) {
        // Bind to the tuple elements instead of unpacking with std::tie: the
        // assignment copied the header string and the whole window vector for
        // every entry. The first tuple element (sequence position) is not needed here.
        const std::string& header = std::get<1>(pathEntry);
        const std::vector<WindowData>& windows = std::get<2>(pathEntry);

        for (const auto& window : windows) {
            ++totalNWindows; // Update total window count
            const uint32_t windowEnd = window.windowStart + window.currentWindowSize; // Start is already 0-based

            // Write window Shannon entropy if enabled
            if (userInput.modeEntropy) {
                shannonFile << header << "\t" << window.windowStart << "\t"
                            << windowEnd << "\t"
                            << window.shannonEntropy << "\n";
                entropyValues.push_back(window.shannonEntropy); // Update entropy values
            }

            // Write window GC content if enabled
            if (userInput.modeGC) {
                gcContentFile << header << "\t" << window.windowStart << "\t"
                              << windowEnd << "\t"
                              << window.gcContent << "\n";
                gcContentValues.push_back(window.gcContent);
            }

            // Write pattern data if enabled
            if (userInput.modeMatch) {
                for (const auto& [pattern, data] : window.patternMap) {
                    // Look the per-pattern stream and counter up once per window
                    // rather than once per match. The guard keeps the original
                    // behaviour of only creating these map entries when the
                    // window actually contains a match.
                    if (!data.wMatches.empty()) {
                        std::ofstream& matchFile = patternMatchFiles[pattern];
                        auto& patternTotal = patternCounts[pattern];

                        for (const auto pos : data.wMatches) {
                            const auto matchStart = window.windowStart + pos; // Start is already 0-based
                            matchFile << header << "\t"
                                      << matchStart << "\t"
                                      << matchStart + pattern.length() << "\t"
                                      << pattern << "\n";
                            ++patternTotal; // Update total pattern counts
                        }
                    }

                    patternCountFiles[pattern] << header << "\t" << window.windowStart << "\t"
                                               << windowEnd << "\t"
                                               << data.count << "\n";

                    patternDensityFiles[pattern] << header << "\t" << window.windowStart << "\t"
                                                 << windowEnd << "\t"
                                                 << data.density << "\n";
                }
            }
        }
    }
}

// Creates the BED/BEDgraph output files for the enabled modes and fills them
// through writeBEDFile().
//
// Files are created under outRoute:
//   - shannonEntropy.bedgraph                       when userInput.modeEntropy is set
//   - gcContent.bedgraph                            when userInput.modeGC is set
//   - <pattern>_matches.bed, <pattern>_count.bedgraph and
//     <pattern>_density.bedgraph for every entry of userInput.patterns
//                                                   when userInput.modeMatch is set
//
// A file that cannot be opened is reported on std::cerr; processing continues
// and writes to that stream are dropped by the stream itself.
void Teloscope::handleBEDFile() {
    std::ofstream shannonFile;
    std::ofstream gcContentFile;
    std::unordered_map<std::string, std::ofstream> patternMatchFiles; // Jack: replace with vector to improve cache locality?
    std::unordered_map<std::string, std::ofstream> patternCountFiles;
    std::unordered_map<std::string, std::ofstream> patternDensityFiles;
    std::cout << "Reporting window matches and metrics in BED/BEDgraphs...\n";

    // Opens `file` at `filePath` and warns instead of failing silently.
    const auto openOrWarn = [](std::ofstream& file, const std::string& filePath) {
        file.open(filePath);
        if (!file.is_open()) {
            std::cerr << "Warning: could not open output file " << filePath << "\n";
        }
    };

    // Open files once if their modes are enabled
    if (userInput.modeEntropy) {
        openOrWarn(shannonFile, outRoute + "/shannonEntropy.bedgraph");
    }

    if (userInput.modeGC) {
        openOrWarn(gcContentFile, outRoute + "/gcContent.bedgraph");
    }

    if (userInput.modeMatch) {
        // CHECK: telomere block outputs (telomere_blocks.bed,
        // telomere_block_counts.txt) are not written yet.
        for (const auto& pattern : userInput.patterns) {
            const std::string patternPrefix = outRoute + "/" + pattern;
            openOrWarn(patternMatchFiles[pattern], patternPrefix + "_matches.bed");
            openOrWarn(patternCountFiles[pattern], patternPrefix + "_count.bedgraph");
            openOrWarn(patternDensityFiles[pattern], patternPrefix + "_density.bedgraph");
        }
    }

    // Write data for each window
    writeBEDFile(shannonFile, gcContentFile,
                 patternMatchFiles, patternCountFiles, patternDensityFiles);

    // No explicit close() calls: every std::ofstream above is flushed and
    // closed by its destructor when this function returns.
}

// Prints the end-of-run summary report to standard output.
//
// Always printed: the total number of windows and, for every entry of
// patternCounts, the pattern with its accumulated match total.
// Printed only when userInput.keepWindowData is set: max, mean, median and min
// of the collected Shannon entropy values, followed by the same four
// statistics for the collected GC content values.
void Teloscope::printSummary() {
    std::ostream& report = std::cout;

    report << "\n+++Summary Report+++\n";
    report << "Total windows analyzed:\t" << totalNWindows << "\n";
    report << "Total input patterns found:\n";
    for (const auto& tally : patternCounts) {
        report << "Pattern:\t" << tally.first << "\t" << tally.second << "\n";
    }

    // PENDING: for each pattern, print the path header with the highest number of matches
    // PENDING: for each pattern, print the path header with the lowest number of matches

    // The per-window statistics only exist when window data was kept.
    if (!userInput.keepWindowData) {
        return;
    }

    report << "Max Shannon Entropy:\t" << getMax(entropyValues) << "\n";
    report << "Mean Shannon Entropy:\t" << getMean(entropyValues) << "\n";
    report << "Median Shannon Entropy:\t" << getMedian(entropyValues) << "\n";
    report << "Min Shannon Entropy:\t" << getMin(entropyValues) << "\n";

    report << "Max GC Content:\t" << getMax(gcContentValues) << "\n";
    report << "Mean GC Content:\t" << getMean(gcContentValues) << "\n";
    report << "Median GC Content:\t" << getMedian(gcContentValues) << "\n";
    report << "Min GC Content:\t" << getMin(gcContentValues) << "\n";
}

0 comments on commit 5907084

Please sign in to comment.