diff --git a/ExtractOfftargets/ExtractOfftargets.cpp b/ExtractOfftargets/ExtractOfftargets.cpp index edc40b6..19d442f 100644 --- a/ExtractOfftargets/ExtractOfftargets.cpp +++ b/ExtractOfftargets/ExtractOfftargets.cpp @@ -26,13 +26,13 @@ int main(int argc, char** argv) } auto startTime = std::chrono::steady_clock::now(); - fs::path tempWorkingDir = fs::path(fs::temp_directory_path() / "Crackling-extractOfftargets"); + fs::path tempWorkingDir = fs::path("/mnt/ssd1/Crackling-extractOfftargets"); fs::create_directory(tempWorkingDir); std::atomic_ullong fileCounter = 0; std::cout << "Spliting input(s)" << std::endl; // Split each sequence to it's own file -#pragma omp parallel for schedule(static,1) + #pragma omp parallel for schedule(static,1) for (int i = 2; i < argc; i++) { // Command line input @@ -108,16 +108,19 @@ int main(int argc, char** argv) } std::cout << "Done" << std::endl; - std::cout << "Sorting intermediate files" << std::endl; - // Mulithread process each extact and sort -#pragma omp parallel for schedule(static,1) + std::cout << "Identifying off-targets" << std::endl; + // Mulithread process each extacts off targets + std::atomic_ullong batchFileCounter = 0; + uint64_t offTargetBatchSize = 30000000; + #pragma omp parallel for schedule(static,1) for (int i = 0; i < fileCounter; i++) { - ofstream tempOutFile; + ofstream outFile; ifstream inFile; string inputLine; - vector offTargets; + uint64_t offtargetsFound = 0; inFile.open((tempWorkingDir / fmt::format("{}.txt", i)).string(), std::ios::binary | std::ios::in); + outFile.open((tempWorkingDir / fmt::format("{}_batch.txt", batchFileCounter++)).string(), std::ios::binary | std::ios::out); for (inputLine; std::getline(inFile, inputLine);) { std::getline(inFile, inputLine); @@ -125,24 +128,67 @@ int main(int argc, char** argv) // Add forward matches for (sregex_iterator regexItr(inputLine.begin(), inputLine.end(), fwdExp); regexItr != sregex_iterator(); regexItr++) { - offTargets.push_back((*regexItr)[1].str().substr(0, 20)); + if (offtargetsFound < offTargetBatchSize) + { + outFile << (*regexItr)[1].str().substr(0, 20) << "\n"; + ++offtargetsFound; + } + else + { + outFile.close(); + outFile.open((tempWorkingDir / fmt::format("{}_batch.txt", batchFileCounter++)).string(), std::ios::binary | std::ios::out); + outFile << (*regexItr)[1].str().substr(0, 20) << "\n"; + offtargetsFound = 1; + } } // Add reverse matches for (sregex_iterator regexItr(inputLine.begin(), inputLine.end(), bwdExp); regexItr != sregex_iterator(); regexItr++) { - offTargets.push_back(rc((*regexItr)[1].str()).substr(0, 20)); + if (offtargetsFound < offTargetBatchSize) + { + outFile << rc((*regexItr)[1].str()).substr(0, 20) << "\n"; + ++offtargetsFound; + } + else + { + outFile.close(); + outFile.open((tempWorkingDir / fmt::format("{}_batch.txt", batchFileCounter++)).string(), std::ios::binary | std::ios::out); + outFile << rc((*regexItr)[1].str()).substr(0, 20) << "\n"; + offtargetsFound = 1; + } } } inFile.close(); + outFile.close(); fs::remove((tempWorkingDir / fmt::format("{}.txt", i)).string()); + } + std::cout << "Done" << std::endl; + + std::cout << "Sorting batch files" << std::endl; + #pragma omp parallel for schedule(static,1) + for (int i = 0; i < batchFileCounter; i++) + { + + std::vector offTargets; + ifstream inFile; + ofstream outFile; + string inputLine; + inFile.open((tempWorkingDir / fmt::format("{}_batch.txt", i)).string(), std::ios::binary | std::ios::in); + offTargets.reserve(offTargetBatchSize); + while(std::getline(inFile, inputLine)) + { + trim(inputLine); + offTargets.push_back(inputLine); + } + inFile.close(); std::sort(offTargets.begin(), offTargets.end()); - tempOutFile.open((tempWorkingDir / fmt::format("{}_sorted.txt", i)).string(), std::ios::binary | std::ios::out); + outFile.open((tempWorkingDir / fmt::format("{}_sorted.txt", i)).string(), std::ios::binary | std::ios::out); for (const string& s : offTargets) { - tempOutFile << s << "\n"; + outFile << s << "\n"; } - tempOutFile.close(); - + outFile.close(); + fs::remove((tempWorkingDir / fmt::format("{}_batch.txt", i)).string()); } std::cout << "Done" << std::endl; @@ -168,7 +214,7 @@ int main(int argc, char** argv) #pragma omp parallel for schedule(static,1) for (int i = 0; i < threads; ++i) { - std::cout << "Merging thread " << i << std::endl; + // std::cout << "Merging thread " << i << std::endl; vector fileNames; for (int j = i; j < fileCounter; j += threads) @@ -180,7 +226,7 @@ int main(int argc, char** argv) vector offTargets(fileNames.size()); for (int j = 0; j < fileNames.size(); ++j) { - std::cout << fileNames[j] << std::endl; + // std::cout << fileNames[j] << std::endl; sortedFiles[j].open(fileNames[j], std::ios::binary | std::ios::in); std::getline(sortedFiles[j], offTargets[j]); } @@ -206,7 +252,7 @@ int main(int argc, char** argv) } } - std::cout << "Merging thread " << i << std::endl; + // std::cout << "Merging thread " << i << std::endl; ofstream mergedFile; mergedFile.open((tempWorkingDir / fmt::format("{}_merged.txt", i)).string(), std::ios::binary | std::ios::out); diff --git a/ISSLCreateIndex/CMakeLists.txt b/ISSLCreateIndex/CMakeLists.txt index 94ed997..6edeb25 100644 --- a/ISSLCreateIndex/CMakeLists.txt +++ b/ISSLCreateIndex/CMakeLists.txt @@ -1,9 +1,9 @@ +# Boost +find_package(Boost REQUIRED COMPONENTS iostreams) + +# OpenMP +find_package(OpenMP REQUIRED) + # Add ISSLCreateIndex executable add_executable (ISSLCreateIndex ISSLCreateIndex.cpp ISSLCreateIndex.hpp) - -# Find OpenMP -find_package(OpenMP) -# Link OpenMP if available -if(OpenMP_CXX_FOUND) - target_link_libraries(ISSLCreateIndex PRIVATE OpenMP::OpenMP_CXX) -endif() \ No newline at end of file +target_link_libraries(ISSLCreateIndex PRIVATE Boost::dynamic_linking Boost::iostreams OpenMP::OpenMP_CXX) \ No newline at end of file diff --git a/ISSLCreateIndex/ISSLCreateIndex.cpp b/ISSLCreateIndex/ISSLCreateIndex.cpp index ed0b86e..5b6be63 100644 --- a/ISSLCreateIndex/ISSLCreateIndex.cpp +++ b/ISSLCreateIndex/ISSLCreateIndex.cpp @@ -4,12 +4,14 @@ using std::vector; using std::map; using std::pair; using std::string; +using std::ofstream; using std::ifstream; using std::regex; using std::regex_iterator; using std::filesystem::path; using std::filesystem::exists; using std::filesystem::file_size; +using std::filesystem::remove; const std::regex extractNumbers("[1234567890]+"); const std::vector nucleotideIndex{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,3 }; @@ -369,22 +371,30 @@ int main(int argc, char** argv) uint64_t globalCount = 0; uint64_t offtargetsCount = 0; - vector seqSignatures; - vector seqSignaturesOccurrences; + // vector seqSignatures; + // vector seqSignaturesOccurrences; // Read off targets into memory - ifstream otInFile; - otInFile.open(argv[1], std::ios::in | std::ios::binary); - vector entireDataSet(otFileSize); - otInFile.read(entireDataSet.data(), otFileSize); - otInFile.close(); + // Does not work for extremely large files + // ifstream otInFile; + // otInFile.open(argv[1], std::ios::in | std::ios::binary); + // vector entireDataSet(otFileSize); + // otInFile.read(entireDataSet.data(), otFileSize); + // otInFile.close(); + + boost::iostreams::mapped_file_source entireDataSet; + entireDataSet.open(argv[1]); + + ofstream tempSeqSignatures; + ofstream tempSeqSignaturesOccurrences; + tempSeqSignatures.open(fmt::format("{}.tmp.1", argv[4]), std::ios::out | std::ios::binary); + tempSeqSignaturesOccurrences.open(fmt::format("{}.tmp.2", argv[4]), std::ios::out | std::ios::binary); uint64_t progressCount = 0; uint64_t offtargetId = 0; - std::cout << "Counting occurrences..." << std::endl; while (progressCount < seqCount) { - char* ptr = &entireDataSet[progressCount * seqLineLength]; + const char* ptr = entireDataSet.data() + (progressCount * seqLineLength); uint64_t signature = sequenceToSignature(ptr); // check how many times the off-target appears // (assumed the list is sorted) @@ -395,15 +405,26 @@ int main(int argc, char** argv) std::cout << fmt::format("{}/{} : {}", (progressCount + occurrences), seqCount, offtargetsCount) << std::endl; } - seqSignatures.push_back(signature); - seqSignaturesOccurrences.push_back(occurrences); + tempSeqSignatures.write(reinterpret_cast(&signature), sizeof(signature)); + tempSeqSignaturesOccurrences.write(reinterpret_cast(&occurrences), sizeof(occurrences)); + // seqSignatures.push_back(signature); + // seqSignaturesOccurrences.push_back(occurrences); offtargetsCount++; if (progressCount % 10000 == 0) std::cout << fmt::format("{}/{} : {}", progressCount, seqCount, offtargetsCount) << std::endl; progressCount += occurrences; } + entireDataSet.close(); + tempSeqSignatures.close(); + tempSeqSignaturesOccurrences.close(); std::cout << "Finished!" << std::endl; + boost::iostreams::mapped_file_source seqSignatures; + boost::iostreams::mapped_file_source seqSignaturesOccurrences; + seqSignatures.open(fmt::format("{}.tmp.1", argv[4])); + seqSignaturesOccurrences.open(fmt::format("{}.tmp.2", argv[4])); + uint64_t seqSignaturesCount = std::distance(reinterpret_cast(seqSignatures.begin()), reinterpret_cast(seqSignatures.end())); + std::cout << "Writing index header to file..." << std::endl; std::ofstream isslIndex; isslIndex.open(argv[4], std::ios::out | std::ios::binary); @@ -413,7 +434,7 @@ int main(int argc, char** argv) std::cout << "Finished!" << std::endl; std::cout << "Writing offtargets to file..." << std::endl; - isslIndex.write(reinterpret_cast(seqSignatures.data()), sizeof(uint64_t) * seqSignatures.size()); + isslIndex.write(seqSignatures.data(), seqSignatures.size()); std::cout << "Finished!" << std::endl; std::cout << "Writing slice masks to file..." << std::endl; @@ -431,20 +452,19 @@ int main(int argc, char** argv) std::cout << fmt::format("\tBuilding slice list {}", i+1) << std::endl; size_t sliceListSize = 1ULL << (sliceMasks[i].size() * 2); vector> sliceList(sliceListSize); - uint32_t signatureId = 0; - for (uint64_t signature : seqSignatures) { - uint32_t occurrences = seqSignaturesOccurrences[signatureId]; + for (uint32_t signatureId = 0; signatureId < seqSignaturesCount; signatureId++) { + const uint64_t* signature = reinterpret_cast(seqSignatures.data()) + signatureId; + const uint32_t* occurrences = reinterpret_cast(seqSignaturesOccurrences.data()) + signatureId; uint32_t sliceVal = 0ULL; for (size_t j = 0; j < sliceMasks[i].size(); j++) { - sliceVal |= ((signature >> (sliceMasks[i][j] * 2)) & 3ULL) << (j * 2); + sliceVal |= ((*signature >> (sliceMasks[i][j] * 2)) & 3ULL) << (j * 2); } // seqSigIdVal represnets the sequence signature ID and number of occurrences of the associated sequence. // (((uint64_t)occurrences) << 32), the most significant 32 bits is the count of the occurrences. // (uint64_t)signatureId, the index of the sequence in `seqSignatures` - uint64_t seqSigIdVal = (static_cast(occurrences) << 32) | static_cast(signatureId); + uint64_t seqSigIdVal = (static_cast(*occurrences) << 32) | static_cast(signatureId); sliceList[sliceVal].push_back(seqSigIdVal); - signatureId++; } std::cout << "\tFinished!" << std::endl; @@ -462,6 +482,11 @@ int main(int argc, char** argv) isslIndex.close(); std::cout << "\tFinished!" << std::endl; } + seqSignatures.close(); + seqSignaturesOccurrences.close(); + remove(fmt::format("{}.tmp.1", argv[4])); + remove(fmt::format("{}.tmp.2", argv[4])); + std::cout << "Finished!" << std::endl; std::cout << "Done" << std::endl; return 0; diff --git a/ISSLCreateIndex/ISSLCreateIndex.hpp b/ISSLCreateIndex/ISSLCreateIndex.hpp index 7ad78c8..a8122a3 100644 --- a/ISSLCreateIndex/ISSLCreateIndex.hpp +++ b/ISSLCreateIndex/ISSLCreateIndex.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #define FMT_HEADER_ONLY #include "../include/fmt/format.h" #include "../include/libpopcnt/libpopcnt.h" \ No newline at end of file diff --git a/ISSLScoreOfftargets/ISSLScoreOfftargets.cpp b/ISSLScoreOfftargets/ISSLScoreOfftargets.cpp index 1921ad0..9f82522 100644 --- a/ISSLScoreOfftargets/ISSLScoreOfftargets.cpp +++ b/ISSLScoreOfftargets/ISSLScoreOfftargets.cpp @@ -263,9 +263,19 @@ int main(int argc, char** argv) } } + uint64_t GtotalFalseSites = 0; + uint64_t GtotalTrueSites = 0; + uint64_t GuniqueFalseSites = 0; + uint64_t GuniqueTrueSites = 0; + std::mutex countMutex; + /** Begin scoring */ #pragma omp parallel { + uint64_t totalFalseSites = 0; + uint64_t totalTrueSites = 0; + uint64_t uniqueFalseSites = 0; + uint64_t uniqueTrueSites = 0; vector offtargetToggles(numOfftargetToggles); uint64_t* offtargetTogglesTail = offtargetToggles.data() + numOfftargetToggles - 1; /** For each candidate guide */ @@ -310,6 +320,19 @@ int main(int argc, char** argv) uint64_t* ptrOfftargetFlag = (offtargetTogglesTail - (signatureId / 64)); seenOfftargetAlready = (*ptrOfftargetFlag >> (signatureId % 64)) & 1ULL; + uint64_t xoredSignatures = searchSignature ^ offtargets[signatureId]; + uint64_t evenBits = xoredSignatures & 0xAAAAAAAAAAAAAAAAULL; + uint64_t oddBits = xoredSignatures & 0x5555555555555555ULL; + uint64_t mismatches = (evenBits >> 1) | oddBits; + uint64_t dist = popcount64(mismatches); + + if (dist >= 0 && dist <= 4) { + totalTrueSites++; + } + else { + totalFalseSites++; + } + if (!seenOfftargetAlready) { *ptrOfftargetFlag |= (1ULL << (signatureId % 64)); @@ -339,11 +362,19 @@ int main(int argc, char** argv) * * popcount(mismatches): 4 */ - uint64_t xoredSignatures = searchSignature ^ offtargets[signatureId]; - uint64_t evenBits = xoredSignatures & 0xAAAAAAAAAAAAAAAAULL; - uint64_t oddBits = xoredSignatures & 0x5555555555555555ULL; - uint64_t mismatches = (evenBits >> 1) | oddBits; - uint64_t dist = popcount64(mismatches); + // uint64_t xoredSignatures = searchSignature ^ offtargets[signatureId]; + // uint64_t evenBits = xoredSignatures & 0xAAAAAAAAAAAAAAAAULL; + // uint64_t oddBits = xoredSignatures & 0x5555555555555555ULL; + // uint64_t mismatches = (evenBits >> 1) | oddBits; + // uint64_t dist = popcount64(mismatches); + if (dist >= 0 && dist <= 4) { + uniqueTrueSites++; + } + else + { + uniqueFalseSites++; + } + if (dist >= 0 && dist <= 4) { // Begin calculating MIT score @@ -461,6 +492,12 @@ int main(int argc, char** argv) memset(offtargetToggles.data(), 0, sizeof(uint64_t) * offtargetToggles.size()); } + countMutex.lock(); + GtotalFalseSites += totalFalseSites; + GtotalTrueSites += totalTrueSites; + GuniqueFalseSites += uniqueFalseSites; + GuniqueTrueSites += uniqueTrueSites; + countMutex.unlock(); } auto endProcessing = std::chrono::high_resolution_clock::now(); @@ -476,6 +513,8 @@ int main(int argc, char** argv) printf("%02ld:%02ld\t", minutes.count(), seconds.count()); } printf("\n"); + std::cout << GuniqueTrueSites << "\t" << GuniqueFalseSites << "\t" << GtotalTrueSites << "\t" << GtotalFalseSites << std::endl; + /** Print global scores to stdout */ for (size_t searchIdx = 0; searchIdx < querySignatures.size(); searchIdx++) { diff --git a/ISSLScoreOfftargets/ISSLScoreOfftargets.hpp b/ISSLScoreOfftargets/ISSLScoreOfftargets.hpp index 4d928d7..6b0283c 100644 --- a/ISSLScoreOfftargets/ISSLScoreOfftargets.hpp +++ b/ISSLScoreOfftargets/ISSLScoreOfftargets.hpp @@ -1,6 +1,7 @@ #ifndef ISSLScoreOfftargetsInclude #define ISSLScoreOfftargetsInclude #include +#include #include #include #include diff --git a/ISSLScoreOfftargetsMMF/ISSLScoreOfftargetsMMF.cpp b/ISSLScoreOfftargetsMMF/ISSLScoreOfftargetsMMF.cpp index 42c0f5d..7805517 100644 --- a/ISSLScoreOfftargetsMMF/ISSLScoreOfftargetsMMF.cpp +++ b/ISSLScoreOfftargetsMMF/ISSLScoreOfftargetsMMF.cpp @@ -238,9 +238,19 @@ int main(int argc, char** argv) } } + uint64_t GtotalFalseSites = 0; + uint64_t GtotalTrueSites = 0; + uint64_t GuniqueFalseSites = 0; + uint64_t GuniqueTrueSites = 0; + std::mutex countMutex; + /** Begin scoring */ #pragma omp parallel { + uint64_t totalFalseSites = 0; + uint64_t totalTrueSites = 0; + uint64_t uniqueFalseSites = 0; + uint64_t uniqueTrueSites = 0; vector offtargetToggles(numOfftargetToggles); uint64_t* offtargetTogglesTail = offtargetToggles.data() + numOfftargetToggles - 1; /** For each candidate guide */ @@ -285,6 +295,19 @@ int main(int argc, char** argv) uint64_t* ptrOfftargetFlag = (offtargetTogglesTail - (signatureId / 64)); seenOfftargetAlready = (*ptrOfftargetFlag >> (signatureId % 64)) & 1ULL; + uint64_t xoredSignatures = searchSignature ^ offtargetsPtr[signatureId]; + uint64_t evenBits = xoredSignatures & 0xAAAAAAAAAAAAAAAAULL; + uint64_t oddBits = xoredSignatures & 0x5555555555555555ULL; + uint64_t mismatches = (evenBits >> 1) | oddBits; + uint64_t dist = popcount64(mismatches); + + if (dist >= 0 && dist <= 4) { + totalTrueSites++; + } + else { + totalFalseSites++; + } + if (!seenOfftargetAlready) { *ptrOfftargetFlag |= (1ULL << (signatureId % 64)); @@ -314,11 +337,19 @@ int main(int argc, char** argv) * * popcount(mismatches): 4 */ - uint64_t xoredSignatures = searchSignature ^ offtargetsPtr[signatureId]; - uint64_t evenBits = xoredSignatures & 0xAAAAAAAAAAAAAAAAULL; - uint64_t oddBits = xoredSignatures & 0x5555555555555555ULL; - uint64_t mismatches = (evenBits >> 1) | oddBits; - uint64_t dist = popcount64(mismatches); + // uint64_t xoredSignatures = searchSignature ^ offtargetsPtr[signatureId]; + // uint64_t evenBits = xoredSignatures & 0xAAAAAAAAAAAAAAAAULL; + // uint64_t oddBits = xoredSignatures & 0x5555555555555555ULL; + // uint64_t mismatches = (evenBits >> 1) | oddBits; + // uint64_t dist = popcount64(mismatches); + + if (dist >= 0 && dist <= 4) { + uniqueTrueSites++; + } + else + { + uniqueFalseSites++; + } if (dist >= 0 && dist <= 4) { // Begin calculating MIT score @@ -436,6 +467,12 @@ int main(int argc, char** argv) memset(offtargetToggles.data(), 0, sizeof(uint64_t) * offtargetToggles.size()); } + countMutex.lock(); + GtotalFalseSites += totalFalseSites; + GtotalTrueSites += totalTrueSites; + GuniqueFalseSites += uniqueFalseSites; + GuniqueTrueSites += uniqueTrueSites; + countMutex.unlock(); } auto endProcessing = std::chrono::high_resolution_clock::now(); @@ -451,6 +488,7 @@ int main(int argc, char** argv) printf("%02ld:%02ld\t", minutes.count(), seconds.count()); } printf("\n"); + std::cout << GuniqueTrueSites << "\t" << GuniqueFalseSites << "\t" << GtotalTrueSites << "\t" << GtotalFalseSites << std::endl; /** Print global scores to stdout */ for (size_t searchIdx = 0; searchIdx < querySignatures.size(); searchIdx++) { diff --git a/ISSLScoreOfftargetsMMF/ISSLScoreOfftargetsMMF.hpp b/ISSLScoreOfftargetsMMF/ISSLScoreOfftargetsMMF.hpp index a4fa29d..6e64f7e 100644 --- a/ISSLScoreOfftargetsMMF/ISSLScoreOfftargetsMMF.hpp +++ b/ISSLScoreOfftargetsMMF/ISSLScoreOfftargetsMMF.hpp @@ -1,6 +1,7 @@ #ifndef ISSLScoreOfftargetsMMFInclude #define ISSLScoreOfftargetsMMFInclude #include +#include #include #include #include