Skip to content

Commit

Permalink
Sep 18, 2024: Keep windowData optional
Browse files Browse the repository at this point in the history
  • Loading branch information
AldhairMedico committed Sep 18, 2024
1 parent bbf4da8 commit d8cd621
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 40 deletions.
2 changes: 1 addition & 1 deletion gfalibs
10 changes: 7 additions & 3 deletions include/input.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,14 @@ struct UserInputTeloscope : UserInput {
uint32_t step = 500;
double maxMem = 0;
std::string prefix = ".", outFile = "";
bool modeMatch = true, modeEntropy = true, modeGC = true; // Change to: de novo, user-defined
// uint8_t blockDistance = 6;
};
bool keepWindowData = false;
bool modeMatch = true, modeEntropy = true, modeGC = true; // Change to: de novo, user-defined
bool storeWindowData = false; // Memory intensive
float densityThreshold = 0.5f; // Threshold for telomere detection
uint32_t maxGaps = 2; // Allowed gaps in telomere blocks
uint8_t mergeDistance = 6;

};

class Input {

Expand Down
48 changes: 23 additions & 25 deletions include/teloscope.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,17 @@ struct WindowData {
WindowData() : windowStart(0), gcContent(0.0f), shannonEntropy(0.0f), nucleotideCounts{{'A', 0}, {'C', 0}, {'G', 0}, {'T', 0}} {}
};

struct TelomereBlock {
uint64_t start;
uint64_t end;
};

class Teloscope {

Trie trie; // Declare trie instance
UserInputTeloscope userInput; // Declare user input instance
std::vector<std::tuple<unsigned int, std::string, std::vector<WindowData>>> allWindows; // Assembly windows
std::vector<std::tuple<unsigned int, std::string, std::vector<TelomereBlock>>> allTelomereBlocks; // Assembly elomere blocks

int totalNWindows = 0; // Total windows analyzed
std::unordered_map<std::string, int> patternCounts; // Total counts
Expand All @@ -103,33 +108,25 @@ class Teloscope {

bool walkPath(InPath* path, std::vector<InSegment*> &inSegments, std::vector<InGap> &inGaps);


void analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData);

void analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData);

std::vector<WindowData> analyzeSegment(std::string &sequence, UserInputTeloscope userInput, uint64_t absPos);

void insertWindowData(unsigned int seqPos, const std::string& header, std::vector<WindowData>& pathWindows);

void insertWindowData(unsigned int seqPos, const std::string& header, std::vector<WindowData>& pathWindows) {
allWindows.push_back(std::make_tuple(seqPos, header, pathWindows)); // Giulio: cleaner with struct
}


void sortWindowsBySeqPos() {
std::sort(allWindows.begin(), allWindows.end(), [](const auto& one, const auto& two) {
return std::get<0>(one) < std::get<0>(two);
});
}

void sortWindowsBySeqPos();

void annotateTelomeres(); // CHECK


void writeBEDFile(std::ofstream& shannonFile, std::ofstream& gcContentFile,
// std::ofstream& telomereBEDFile, std::ofstream& telomereCountFile, // CHECK
std::unordered_map<std::string, std::ofstream>& patternMatchFiles,
std::unordered_map<std::string, std::ofstream>& patternCountFiles,
std::unordered_map<std::string, std::ofstream>& patternDensityFiles) {

if (!userInput.storeWindowData) { // If windowData is not stored, return
return;
}

for (const auto& windowData : allWindows) {
unsigned int seqPos;
Expand Down Expand Up @@ -254,16 +251,17 @@ class Teloscope {

// For each pattern, print the path header with the highest number of matches - PENDING
// For each pattern, print the path header with the lowest number of matches - PENDING

std::cout << "Max Shannon Entropy:\t" << getMax(entropyValues) << "\n";
std::cout << "Mean Shannon Entropy:\t" << getMean(entropyValues) << "\n";
std::cout << "Median Shannon Entropy:\t" << getMedian(entropyValues) << "\n";
std::cout << "Min Shannon Entropy:\t" << getMin(entropyValues) << "\n";

std::cout << "Max GC Content:\t" << getMax(gcContentValues) << "\n";
std::cout << "Mean GC Content:\t" << getMean(gcContentValues) << "\n";
std::cout << "Median GC Content:\t" << getMedian(gcContentValues) << "\n";
std::cout << "Min GC Content:\t" << getMin(gcContentValues) << "\n";
if (userInput.storeWindowData) {
std::cout << "Max Shannon Entropy:\t" << getMax(entropyValues) << "\n";
std::cout << "Mean Shannon Entropy:\t" << getMean(entropyValues) << "\n";
std::cout << "Median Shannon Entropy:\t" << getMedian(entropyValues) << "\n";
std::cout << "Min Shannon Entropy:\t" << getMin(entropyValues) << "\n";

std::cout << "Max GC Content:\t" << getMax(gcContentValues) << "\n";
std::cout << "Mean GC Content:\t" << getMean(gcContentValues) << "\n";
std::cout << "Median GC Content:\t" << getMedian(gcContentValues) << "\n";
std::cout << "Min GC Content:\t" << getMin(gcContentValues) << "\n";
}
}
};

Expand Down
18 changes: 15 additions & 3 deletions src/input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,16 @@ void Input::read(InSequences &inSequences) {
bool Teloscope::walkPath(InPath* path, std::vector<InSegment*> &inSegments, std::vector<InGap> &inGaps) {

Log threadLog;

uint64_t absPos = 0;
unsigned int cUId = 0, gapLen = 0, seqPos = path->getSeqPos();
std::vector<PathComponent> pathComponents = path->getComponents();
uint64_t absPos = 0;
std::vector<WindowData> pathWindows;

std::string header = removeCarriageReturns(path->getHeader());
threadLog.add("\n\tWalking path:\t" + path->getHeader());

std::vector<WindowData> pathWindows;
std::vector<TelomereBlock> pathTelomereBlocks;

// std::string header = path->getHeader();
// eraseChar(header, '\r');

Expand All @@ -91,6 +93,10 @@ bool Teloscope::walkPath(InPath* path, std::vector<InSegment*> &inSegments, std:
std::vector<WindowData> segmentWindows = analyzeSegment(sequence, userInput, absPos);
pathWindows.insert(pathWindows.end(), segmentWindows.begin(), segmentWindows.end());

// std::vector<TelomereBlock> segmentTelomereBlocks = analyzeSegment(sequence, userInput, absPos);
// pathTelomereBlocks.insert(pathTelomereBlocks.end(), segmentTelomereBlocks.begin(), segmentTelomereBlocks.end());


} else {
}

Expand All @@ -111,6 +117,12 @@ bool Teloscope::walkPath(InPath* path, std::vector<InSegment*> &inSegments, std:
std::lock_guard<std::mutex> lck(mtx);
insertWindowData(seqPos, header, pathWindows);

allTelomereBlocks.push_back({seqPos, header, pathTelomereBlocks});

// uint8_t mergeDistance = userInput.mergeDistance;
// mergeTelomereBlocks(pathTelomereBlocks, mergeDistance);


threadLog.add("\tCompleted walking path:\t" + path->getHeader());
logs.push_back(threadLog);

Expand Down
22 changes: 16 additions & 6 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,20 @@ int main(int argc, char **argv) {

if (argc == 1) { // case: with no arguments

printf("teloscope -f input.[fa/fa.gz] \n-h for additional help. Use -f to initiate the tool.\n");
printf("teloscope -f input.[fa/fa.gz] \nUse-h for additional help. \nUse -f to initiate the tool.");
exit(0);

}

static struct option long_options[] = { // struct mapping long options
{"input-sequence", required_argument, 0, 'f'},
{"output", required_argument, 0, 'o'},
{"patterns", required_argument, 0, 'p'},
{"window", required_argument, 0, 'w'},
{"step", required_argument, 0, 's'},
{"mode", required_argument, 0, 'm'},
{"output", required_argument, 0, 'o'},
{"threads", required_argument, 0, 'j'},
{"keep-window-data", no_argument, 0, 'k'},

{"verbose", no_argument, &verbose_flag, 1},
{"cmd", no_argument, &cmd_flag, 1},
Expand Down Expand Up @@ -213,22 +214,31 @@ int main(int argc, char **argv) {

case 'v': // software version
printf("/// Teloscope v%s\n", version.c_str());
printf("Giulio Formenti [email protected]\n");
printf("\nDeveloped by:\nGiulio Formenti [email protected]\n");
printf("Jack A. Medico [email protected]\n");
exit(0);

case 'h': // help
printf("teloscope [commands]\n");
printf("\nRequired Parameters:\n");
printf("\t'-f'\t--input-sequence\tInitiate tool with fasta file.\n");
printf("\t'-p'\t--patterns\tSet patterns to explore.\n");
printf("\t'-w'\t--window\tSet sliding window size.\n");
printf("\t'-s'\t--step\tSet sliding window step.\n");
printf("\t'-o'\t--output\tSet output route.\n");
printf("\t'-p'\t--patterns\tSet patterns to explore, separate them by commas [Default: TTAGGG]\n");
printf("\t'-w'\t--window\tSet sliding window size. [Default: 1000]\n");
printf("\t'-s'\t--step\tSet sliding window step. [Default: 500]\n");
printf("\t'-j'\t--threads\tSet maximum number of threads. [Default: max. available]\n");

printf("\nOptional Parameters:\n");
printf("\t'-m'\t--mode\tSet analysis modes, separate them by commas. [Options: all,match,gc,entropy]\n");
printf("\t'-k'\t--keep-window-data\tKeep window data for analysis, memory-intensive. [Default: false]\n");
printf("\t'-v'\t--version\tPrint current software version.\n");
printf("\t'-h'\t--help\tPrint current software options.\n");
printf("\t--verbose\tverbose output.\n");
exit(0);

case 'k':
userInput.keepWindowData = true;
break;
}

if (argc == 2 || // handle various cases in which the output should include summary stats
Expand Down
36 changes: 34 additions & 2 deletions src/teloscope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,41 @@ float Teloscope::getMax(const std::vector<float> values) {
}


void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData) {
void Teloscope::insertWindowData(unsigned int seqPos, const std::string& header, std::vector<WindowData>& pathWindows) {
if (userInput.storeWindowData) {
allWindows.push_back(std::make_tuple(seqPos, header, pathWindows));
}
}


void Teloscope::sortWindowsBySeqPos() {
std::sort(allWindows.begin(), allWindows.end(), [](const auto& one, const auto& two) {
return std::get<0>(one) < std::get<0>(two);
});
}


// JACK: TO IMPLEMENT!
// void Teloscope::mergeTelomereBlocks(std::vector<TelomereBlock>& blocks, uint8_t mergeDistance) {
// if (blocks.empty()) return;

// std::vector<TelomereBlock> mergedBlocks;
// TelomereBlock currentBlock = blocks[0];

// for (size_t i = 1; i < blocks.size(); ++i) {
// if (blocks[i].start - currentBlock.end <= mergeDistance) {
// currentBlock.end = blocks[i].end;
// } else {
// mergedBlocks.push_back(currentBlock);
// currentBlock = blocks[i];
// }
// }
// mergedBlocks.push_back(currentBlock);
// blocks = mergedBlocks;
// }


void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, WindowData& windowData) {
windowData.windowStart = windowStart;
unsigned short int longestPatternSize = this->trie.getLongestPatternSize();

Expand Down Expand Up @@ -157,7 +190,6 @@ void Teloscope::analyzeWindow(const std::string &window, uint32_t windowStart, W


std::vector<WindowData> Teloscope::analyzeSegment(std::string &sequence, UserInputTeloscope userInput, uint64_t absPos) {

std::vector<WindowData> windows;
uint32_t windowStart = 0;
uint32_t currentWindowSize = std::min(userInput.windowSize, static_cast<uint32_t>(sequence.size())); // In case segment is short
Expand Down

0 comments on commit d8cd621

Please sign in to comment.