Skip to content

Commit

Permalink
Sep 11, 2024: BED Update
Browse files Browse the repository at this point in the history
  • Loading branch information
AldhairMedico committed Sep 12, 2024
1 parent 08216bc commit e6bb8d4
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 31 deletions.
73 changes: 46 additions & 27 deletions include/teloscope.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,14 @@ class Teloscope {
std::vector<std::tuple<unsigned int, std::string, std::vector<WindowData>>> allWindows; // Assembly windows

int totalNWindows = 0; // Total windows analyzed
std::map<std::string, int> patternCounts; // Total counts
std::unordered_map<std::string, int> patternCounts; // Total counts
std::vector<float> entropyValues; // Total entropy values
std::vector<float> gcContentValues; // Total GC content values

float getShannonEntropy(const std::unordered_map<char, uint64_t>& nucleotideCounts, uint32_t windowSize);
float getGCContent(const std::unordered_map<char, uint64_t>& nucleotideCounts, uint32_t windowSize);


float getMean(const std::vector<float>& values);
float getMedian(std::vector<float> values);
float getMin(std::vector<float> values);
Expand All @@ -100,49 +101,33 @@ class Teloscope {
}
}


bool walkPath(InPath* path, std::vector<InSegment*> &inSegments, std::vector<InGap> &inGaps);


void analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData);


std::vector<WindowData> analyzeSegment(std::string &sequence, UserInputTeloscope userInput, uint64_t absPos);


void insertWindowData(unsigned int seqPos, const std::string& header, std::vector<WindowData>& pathWindows) {
allWindows.push_back(std::make_tuple(seqPos, header, pathWindows)); // Giulio: cleaner with struct
}


void sortWindowsBySeqPos() {
std::sort(allWindows.begin(), allWindows.end(), [](const auto& one, const auto& two) {
return std::get<0>(one) < std::get<0>(two);
});
}

void generateBEDFile() {
std::ofstream shannonFile; // Declare file streams
std::ofstream gcContentFile;

std::unordered_map<std::string, std::ofstream> patternMatchFiles; // Hold file streams for pattern data
std::unordered_map<std::string, std::ofstream> patternCountFiles;
std::unordered_map<std::string, std::ofstream> patternDensityFiles;
std::cout << "Reporting window matches and metrics in BED/BEDgraphs...\n";

// Only create and write to files if their modes are enabled
if (userInput.modeEntropy) {
shannonFile.open(outRoute + "/shannonEntropy.bedgraph");
}
void writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
std::unordered_map<std::string, std::ofstream>& patternCountFiles,
std::unordered_map<std::string, std::ofstream>& patternDensityFiles) {

if (userInput.modeGC) {
gcContentFile.open(outRoute + "/gcContent.bedgraph");
}

if (userInput.modeMatch) {
for (const auto& pattern : userInput.patterns) {
patternMatchFiles[pattern].open(outRoute + "/" + pattern + "_matches.bed");
patternCountFiles[pattern].open(outRoute + "/" + pattern + "_count.bedgraph");
patternDensityFiles[pattern].open(outRoute + "/" + pattern + "_density.bedgraph");
}
}

// Write data for each window
for (const auto& windowData : allWindows) {
unsigned int seqPos;
std::string header;
Expand Down Expand Up @@ -179,18 +164,50 @@ class Teloscope {
<< pattern << "\n";
patternCounts[pattern]++; // Update total pattern counts
}

patternCountFiles[pattern] << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< data.count << "\n";

patternDensityFiles[pattern] << header << "\t" << window.windowStart << "\t"
<< windowEnd << "\t"
<< data.density << "\n";
}
}
}
}
}


void handleBEDFile() {
std::ofstream shannonFile;
std::ofstream gcContentFile;
std::unordered_map<std::string, std::ofstream> patternMatchFiles; // Jack: replace with vector to reduce cache locality?
std::unordered_map<std::string, std::ofstream> patternCountFiles;
std::unordered_map<std::string, std::ofstream> patternDensityFiles;
std::cout << "Reporting window matches and metrics in BED/BEDgraphs...\n";

// Close all files that were opened
// Open files once if their modes are enabled
if (userInput.modeEntropy) {
shannonFile.open(outRoute + "/shannonEntropy.bedgraph");
}

if (userInput.modeGC) {
gcContentFile.open(outRoute + "/gcContent.bedgraph");
}

if (userInput.modeMatch) {
for (const auto& pattern : userInput.patterns) {
patternMatchFiles[pattern].open(outRoute + "/" + pattern + "_matches.bed");
patternCountFiles[pattern].open(outRoute + "/" + pattern + "_count.bedgraph");
patternDensityFiles[pattern].open(outRoute + "/" + pattern + "_density.bedgraph");
}
}

// Write data for each window
writeBEDFile(shannonFile, gcContentFile, patternMatchFiles, patternCountFiles, patternDensityFiles);

// Close all files once
if (userInput.modeEntropy) {
shannonFile.close();
}
Expand All @@ -210,6 +227,7 @@ class Teloscope {
}
}


void printSummary() {
std::cout << "\n+++Summary Report+++\n";
std::cout << "Total windows analyzed:\t" << totalNWindows << "\n";
Expand All @@ -232,6 +250,7 @@ class Teloscope {
std::cout << "Min GC Content:\t" << getMin(gcContentValues) << "\n";
}


void annotateTelomeres() {
/// For each path we need two telomeric coordinates: p (start) and q (end)
/// For p telomere: Start to last semi-continous repeat
Expand Down
4 changes: 1 addition & 3 deletions src/input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void Input::read(InSequences &inSequences) {
teloscope.sortWindowsBySeqPos();
lg.verbose("\nPaths sorted by original position");

teloscope.generateBEDFile();
teloscope.handleBEDFile();
lg.verbose("\nBED/BEDgraph files generated");

teloscope.printSummary();
Expand Down Expand Up @@ -108,12 +108,10 @@ bool Teloscope::walkPath(InPath* path, std::vector<InSegment*> &inSegments, std:

}

// std::unique_lock<std::mutex> lck (mtx);
std::lock_guard<std::mutex> lck(mtx);
insertWindowData(seqPos, header, pathWindows);

threadLog.add("\tCompleted walking path:\t" + path->getHeader());
// std::lock_guard<std::mutex> lck(mtx); // Jack: does the first lock_guard protect both?
logs.push_back(threadLog);

return true;
Expand Down
8 changes: 7 additions & 1 deletion src/teloscope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include <algorithm> // removeCarriageReturns
#include <array> // check
#include <cmath>
#include <type_traits> // generateBEDFile
#include <type_traits> // handlesBEDFile
#include <chrono> // check

#include "log.h"
Expand Down Expand Up @@ -56,6 +56,7 @@ std::string removeCarriageReturns(const std::string& input) {
// input.erase(std::remove(input.begin(), input.end(), rmChar), input.end());
// }


float Teloscope::getShannonEntropy(const std::unordered_map<char, uint64_t>& nucleotideCounts, uint32_t windowSize) {
float entropy = 0.0;
for (auto &[nucleotide, count] : nucleotideCounts) {
Expand All @@ -79,6 +80,7 @@ float Teloscope::getMean(const std::vector<float>& values) {
return sum / values.size();
}


float Teloscope::getMedian(std::vector<float> values) {
if (values.empty()) return 0.0;
std::sort(values.begin(), values.end());
Expand All @@ -89,16 +91,20 @@ float Teloscope::getMedian(std::vector<float> values) {
return values[size / 2];
}
}


float Teloscope::getMin(const std::vector<float> values) {
if (values.empty()) return 0.0;
return *std::min_element(values.begin(), values.end());
}


float Teloscope::getMax(const std::vector<float> values) {
if (values.empty()) return 0.0;
return *std::max_element(values.begin(), values.end());
}


void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData) {

windowData.windowStart = windowStart;
Expand Down

0 comments on commit e6bb8d4

Please sign in to comment.