Skip to content

Commit

Permalink
Added code to count off targets processed
Browse files Browse the repository at this point in the history
  • Loading branch information
CeSchmitz committed Aug 17, 2023
1 parent 82b2cfd commit 3423f12
Show file tree
Hide file tree
Showing 8 changed files with 202 additions and 51 deletions.
78 changes: 62 additions & 16 deletions ExtractOfftargets/ExtractOfftargets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ int main(int argc, char** argv)
}

auto startTime = std::chrono::steady_clock::now();
fs::path tempWorkingDir = fs::path(fs::temp_directory_path() / "Crackling-extractOfftargets");
fs::path tempWorkingDir = fs::path("/mnt/ssd1/Crackling-extractOfftargets");
fs::create_directory(tempWorkingDir);
std::atomic_ullong fileCounter = 0;

std::cout << "Spliting input(s)" << std::endl;
// Split each sequence to it's own file
#pragma omp parallel for schedule(static,1)
#pragma omp parallel for schedule(static,1)
for (int i = 2; i < argc; i++)
{
// Command line input
Expand Down Expand Up @@ -108,41 +108,87 @@ int main(int argc, char** argv)
}
std::cout << "Done" << std::endl;

std::cout << "Sorting intermediate files" << std::endl;
// Mulithread process each extact and sort
#pragma omp parallel for schedule(static,1)
std::cout << "Identifying off-targets" << std::endl;
// Mulithread process each extacts off targets
std::atomic_ullong batchFileCounter = 0;
uint64_t offTargetBatchSize = 30000000;
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < fileCounter; i++)
{
ofstream tempOutFile;
ofstream outFile;
ifstream inFile;
string inputLine;
vector<string> offTargets;
uint64_t offtargetsFound = 0;
inFile.open((tempWorkingDir / fmt::format("{}.txt", i)).string(), std::ios::binary | std::ios::in);
outFile.open((tempWorkingDir / fmt::format("{}_batch.txt", batchFileCounter++)).string(), std::ios::binary | std::ios::out);
for (inputLine; std::getline(inFile, inputLine);)
{
std::getline(inFile, inputLine);
trim(inputLine);
// Add forward matches
for (sregex_iterator regexItr(inputLine.begin(), inputLine.end(), fwdExp); regexItr != sregex_iterator(); regexItr++)
{
offTargets.push_back((*regexItr)[1].str().substr(0, 20));
if (offtargetsFound < offTargetBatchSize)
{
outFile << (*regexItr)[1].str().substr(0, 20) << "\n";
++offtargetsFound;
}
else
{
outFile.close();
outFile.open((tempWorkingDir / fmt::format("{}_batch.txt", batchFileCounter++)).string(), std::ios::binary | std::ios::out);
outFile << (*regexItr)[1].str().substr(0, 20) << "\n";
offtargetsFound = 1;
}
}
// Add reverse matches
for (sregex_iterator regexItr(inputLine.begin(), inputLine.end(), bwdExp); regexItr != sregex_iterator(); regexItr++)
{
offTargets.push_back(rc((*regexItr)[1].str()).substr(0, 20));
if (offtargetsFound < offTargetBatchSize)
{
outFile << rc((*regexItr)[1].str()).substr(0, 20) << "\n";
++offtargetsFound;
}
else
{
outFile.close();
outFile.open((tempWorkingDir / fmt::format("{}_batch.txt", batchFileCounter++)).string(), std::ios::binary | std::ios::out);
outFile << rc((*regexItr)[1].str()).substr(0, 20) << "\n";
offtargetsFound = 1;
}
}
}
inFile.close();
outFile.close();
fs::remove((tempWorkingDir / fmt::format("{}.txt", i)).string());
}
std::cout << "Done" << std::endl;

std::cout << "Sorting batch files" << std::endl;
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < batchFileCounter; i++)
{

std::vector<string> offTargets;
ifstream inFile;
ofstream outFile;
string inputLine;
inFile.open((tempWorkingDir / fmt::format("{}_batch.txt", i)).string(), std::ios::binary | std::ios::in);
offTargets.reserve(offTargetBatchSize);
while(std::getline(inFile, inputLine))
{
trim(inputLine);
offTargets.push_back(inputLine);
}
inFile.close();
std::sort(offTargets.begin(), offTargets.end());
tempOutFile.open((tempWorkingDir / fmt::format("{}_sorted.txt", i)).string(), std::ios::binary | std::ios::out);
outFile.open((tempWorkingDir / fmt::format("{}_sorted.txt", i)).string(), std::ios::binary | std::ios::out);
for (const string& s : offTargets)
{
tempOutFile << s << "\n";
outFile << s << "\n";
}
tempOutFile.close();

outFile.close();
fs::remove((tempWorkingDir / fmt::format("{}_batch.txt", i)).string());
}
std::cout << "Done" << std::endl;

Expand All @@ -168,7 +214,7 @@ int main(int argc, char** argv)
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < threads; ++i)
{
std::cout << "Merging thread " << i << std::endl;
// std::cout << "Merging thread " << i << std::endl;

vector<string> fileNames;
for (int j = i; j < fileCounter; j += threads)
Expand All @@ -180,7 +226,7 @@ int main(int argc, char** argv)
vector<string> offTargets(fileNames.size());
for (int j = 0; j < fileNames.size(); ++j)
{
std::cout << fileNames[j] << std::endl;
// std::cout << fileNames[j] << std::endl;
sortedFiles[j].open(fileNames[j], std::ios::binary | std::ios::in);
std::getline(sortedFiles[j], offTargets[j]);
}
Expand All @@ -206,7 +252,7 @@ int main(int argc, char** argv)
}
}

std::cout << "Merging thread " << i << std::endl;
// std::cout << "Merging thread " << i << std::endl;

ofstream mergedFile;
mergedFile.open((tempWorkingDir / fmt::format("{}_merged.txt", i)).string(), std::ios::binary | std::ios::out);
Expand Down
14 changes: 7 additions & 7 deletions ISSLCreateIndex/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Boost
find_package(Boost REQUIRED COMPONENTS iostreams)

# OpenMP
find_package(OpenMP REQUIRED)

# Add ISSLCreateIndex executable
add_executable (ISSLCreateIndex ISSLCreateIndex.cpp ISSLCreateIndex.hpp)

# Find OpenMP
find_package(OpenMP)
# Link OpenMP if available
if(OpenMP_CXX_FOUND)
target_link_libraries(ISSLCreateIndex PRIVATE OpenMP::OpenMP_CXX)
endif()
target_link_libraries(ISSLCreateIndex PRIVATE Boost::dynamic_linking Boost::iostreams OpenMP::OpenMP_CXX)
61 changes: 43 additions & 18 deletions ISSLCreateIndex/ISSLCreateIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ using std::vector;
using std::map;
using std::pair;
using std::string;
using std::ofstream;
using std::ifstream;
using std::regex;
using std::regex_iterator;
using std::filesystem::path;
using std::filesystem::exists;
using std::filesystem::file_size;
using std::filesystem::remove;

const std::regex extractNumbers("[1234567890]+");
const std::vector<uint8_t> nucleotideIndex{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,3 };
Expand Down Expand Up @@ -369,22 +371,30 @@ int main(int argc, char** argv)

uint64_t globalCount = 0;
uint64_t offtargetsCount = 0;
vector<uint64_t> seqSignatures;
vector<uint32_t> seqSignaturesOccurrences;
// vector<uint64_t> seqSignatures;
// vector<uint32_t> seqSignaturesOccurrences;

// Read off targets into memory
ifstream otInFile;
otInFile.open(argv[1], std::ios::in | std::ios::binary);
vector<char> entireDataSet(otFileSize);
otInFile.read(entireDataSet.data(), otFileSize);
otInFile.close();
// Does not work for extremely large files
// ifstream otInFile;
// otInFile.open(argv[1], std::ios::in | std::ios::binary);
// vector<char> entireDataSet(otFileSize);
// otInFile.read(entireDataSet.data(), otFileSize);
// otInFile.close();

boost::iostreams::mapped_file_source entireDataSet;
entireDataSet.open(argv[1]);

ofstream tempSeqSignatures;
ofstream tempSeqSignaturesOccurrences;
tempSeqSignatures.open(fmt::format("{}.tmp.1", argv[4]), std::ios::out | std::ios::binary);
tempSeqSignaturesOccurrences.open(fmt::format("{}.tmp.2", argv[4]), std::ios::out | std::ios::binary);

uint64_t progressCount = 0;
uint64_t offtargetId = 0;

std::cout << "Counting occurrences..." << std::endl;
while (progressCount < seqCount) {
char* ptr = &entireDataSet[progressCount * seqLineLength];
const char* ptr = entireDataSet.data() + (progressCount * seqLineLength);
uint64_t signature = sequenceToSignature(ptr);
// check how many times the off-target appears
// (assumed the list is sorted)
Expand All @@ -395,15 +405,26 @@ int main(int argc, char** argv)
std::cout << fmt::format("{}/{} : {}", (progressCount + occurrences), seqCount, offtargetsCount) << std::endl;
}

seqSignatures.push_back(signature);
seqSignaturesOccurrences.push_back(occurrences);
tempSeqSignatures.write(reinterpret_cast<const char *>(&signature), sizeof(signature));
tempSeqSignaturesOccurrences.write(reinterpret_cast<const char *>(&occurrences), sizeof(occurrences));
// seqSignatures.push_back(signature);
// seqSignaturesOccurrences.push_back(occurrences);
offtargetsCount++;
if (progressCount % 10000 == 0)
std::cout << fmt::format("{}/{} : {}", progressCount, seqCount, offtargetsCount) << std::endl;
progressCount += occurrences;
}
entireDataSet.close();
tempSeqSignatures.close();
tempSeqSignaturesOccurrences.close();
std::cout << "Finished!" << std::endl;

boost::iostreams::mapped_file_source seqSignatures;
boost::iostreams::mapped_file_source seqSignaturesOccurrences;
seqSignatures.open(fmt::format("{}.tmp.1", argv[4]));
seqSignaturesOccurrences.open(fmt::format("{}.tmp.2", argv[4]));
uint64_t seqSignaturesCount = std::distance(reinterpret_cast<const uint64_t*>(seqSignatures.begin()), reinterpret_cast<const uint64_t*>(seqSignatures.end()));

std::cout << "Writing index header to file..." << std::endl;
std::ofstream isslIndex;
isslIndex.open(argv[4], std::ios::out | std::ios::binary);
Expand All @@ -413,7 +434,7 @@ int main(int argc, char** argv)
std::cout << "Finished!" << std::endl;

std::cout << "Writing offtargets to file..." << std::endl;
isslIndex.write(reinterpret_cast<char*>(seqSignatures.data()), sizeof(uint64_t) * seqSignatures.size());
isslIndex.write(seqSignatures.data(), seqSignatures.size());
std::cout << "Finished!" << std::endl;

std::cout << "Writing slice masks to file..." << std::endl;
Expand All @@ -431,20 +452,19 @@ int main(int argc, char** argv)
std::cout << fmt::format("\tBuilding slice list {}", i+1) << std::endl;
size_t sliceListSize = 1ULL << (sliceMasks[i].size() * 2);
vector<vector<uint64_t>> sliceList(sliceListSize);
uint32_t signatureId = 0;
for (uint64_t signature : seqSignatures) {
uint32_t occurrences = seqSignaturesOccurrences[signatureId];
for (uint32_t signatureId = 0; signatureId < seqSignaturesCount; signatureId++) {
const uint64_t* signature = reinterpret_cast<const uint64_t*>(seqSignatures.data()) + signatureId;
const uint32_t* occurrences = reinterpret_cast<const uint32_t*>(seqSignaturesOccurrences.data()) + signatureId;
uint32_t sliceVal = 0ULL;
for (size_t j = 0; j < sliceMasks[i].size(); j++)
{
sliceVal |= ((signature >> (sliceMasks[i][j] * 2)) & 3ULL) << (j * 2);
sliceVal |= ((*signature >> (sliceMasks[i][j] * 2)) & 3ULL) << (j * 2);
}
// seqSigIdVal represnets the sequence signature ID and number of occurrences of the associated sequence.
// (((uint64_t)occurrences) << 32), the most significant 32 bits is the count of the occurrences.
// (uint64_t)signatureId, the index of the sequence in `seqSignatures`
uint64_t seqSigIdVal = (static_cast<uint64_t>(occurrences) << 32) | static_cast<uint64_t>(signatureId);
uint64_t seqSigIdVal = (static_cast<uint64_t>(*occurrences) << 32) | static_cast<uint64_t>(signatureId);
sliceList[sliceVal].push_back(seqSigIdVal);
signatureId++;
}
std::cout << "\tFinished!" << std::endl;

Expand All @@ -462,6 +482,11 @@ int main(int argc, char** argv)
isslIndex.close();
std::cout << "\tFinished!" << std::endl;
}
seqSignatures.close();
seqSignaturesOccurrences.close();
remove(fmt::format("{}.tmp.1", argv[4]));
remove(fmt::format("{}.tmp.2", argv[4]));

std::cout << "Finished!" << std::endl;
std::cout << "Done" << std::endl;
return 0;
Expand Down
1 change: 1 addition & 0 deletions ISSLCreateIndex/ISSLCreateIndex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <regex>
#include <filesystem>
#include <fstream>
#include <boost/iostreams/device/mapped_file.hpp>
#define FMT_HEADER_ONLY
#include "../include/fmt/format.h"
#include "../include/libpopcnt/libpopcnt.h"
49 changes: 44 additions & 5 deletions ISSLScoreOfftargets/ISSLScoreOfftargets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,9 +263,19 @@ int main(int argc, char** argv)
}
}

uint64_t GtotalFalseSites = 0;
uint64_t GtotalTrueSites = 0;
uint64_t GuniqueFalseSites = 0;
uint64_t GuniqueTrueSites = 0;
std::mutex countMutex;

/** Begin scoring */
#pragma omp parallel
{
uint64_t totalFalseSites = 0;
uint64_t totalTrueSites = 0;
uint64_t uniqueFalseSites = 0;
uint64_t uniqueTrueSites = 0;
vector<uint64_t> offtargetToggles(numOfftargetToggles);
uint64_t* offtargetTogglesTail = offtargetToggles.data() + numOfftargetToggles - 1;
/** For each candidate guide */
Expand Down Expand Up @@ -310,6 +320,19 @@ int main(int argc, char** argv)
uint64_t* ptrOfftargetFlag = (offtargetTogglesTail - (signatureId / 64));
seenOfftargetAlready = (*ptrOfftargetFlag >> (signatureId % 64)) & 1ULL;

uint64_t xoredSignatures = searchSignature ^ offtargets[signatureId];
uint64_t evenBits = xoredSignatures & 0xAAAAAAAAAAAAAAAAULL;
uint64_t oddBits = xoredSignatures & 0x5555555555555555ULL;
uint64_t mismatches = (evenBits >> 1) | oddBits;
uint64_t dist = popcount64(mismatches);

if (dist >= 0 && dist <= 4) {
totalTrueSites++;
}
else {
totalFalseSites++;
}

if (!seenOfftargetAlready) {
*ptrOfftargetFlag |= (1ULL << (signatureId % 64));

Expand Down Expand Up @@ -339,11 +362,19 @@ int main(int argc, char** argv)
*
* popcount(mismatches): 4
*/
uint64_t xoredSignatures = searchSignature ^ offtargets[signatureId];
uint64_t evenBits = xoredSignatures & 0xAAAAAAAAAAAAAAAAULL;
uint64_t oddBits = xoredSignatures & 0x5555555555555555ULL;
uint64_t mismatches = (evenBits >> 1) | oddBits;
uint64_t dist = popcount64(mismatches);
// uint64_t xoredSignatures = searchSignature ^ offtargets[signatureId];
// uint64_t evenBits = xoredSignatures & 0xAAAAAAAAAAAAAAAAULL;
// uint64_t oddBits = xoredSignatures & 0x5555555555555555ULL;
// uint64_t mismatches = (evenBits >> 1) | oddBits;
// uint64_t dist = popcount64(mismatches);
if (dist >= 0 && dist <= 4) {
uniqueTrueSites++;
}
else
{
uniqueFalseSites++;
}


if (dist >= 0 && dist <= 4) {
// Begin calculating MIT score
Expand Down Expand Up @@ -461,6 +492,12 @@ int main(int argc, char** argv)

memset(offtargetToggles.data(), 0, sizeof(uint64_t) * offtargetToggles.size());
}
countMutex.lock();
GtotalFalseSites += totalFalseSites;
GtotalTrueSites += totalTrueSites;
GuniqueFalseSites += uniqueFalseSites;
GuniqueTrueSites += uniqueTrueSites;
countMutex.unlock();
}

auto endProcessing = std::chrono::high_resolution_clock::now();
Expand All @@ -476,6 +513,8 @@ int main(int argc, char** argv)
printf("%02ld:%02ld\t", minutes.count(), seconds.count());
}
printf("\n");
std::cout << GuniqueTrueSites << "\t" << GuniqueFalseSites << "\t" << GtotalTrueSites << "\t" << GtotalFalseSites << std::endl;


/** Print global scores to stdout */
for (size_t searchIdx = 0; searchIdx < querySignatures.size(); searchIdx++) {
Expand Down
1 change: 1 addition & 0 deletions ISSLScoreOfftargets/ISSLScoreOfftargets.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef ISSLScoreOfftargetsInclude
#define ISSLScoreOfftargetsInclude
#include <atomic>
#include <mutex>
#include <filesystem>
#include <string>
#include <unordered_map>
Expand Down
Loading

0 comments on commit 3423f12

Please sign in to comment.