Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
# Check lld exists

LLD_EXISTS := $(shell command -v ld.lld 2> /dev/null)
#XXXFLAGS += -fuse-ld=mold

ifdef LLD_EXISTS
CXXFLAGS += -fuse-ld=lld
endif

SRC_PATH = src
BUILD_PATH = build
Expand Down Expand Up @@ -54,7 +50,8 @@ EXEC_ABS := $(abspath ${EXEC})
MAIN_FILE = tksm.cpp
MAIN_OBJECT = $(OBJ_PATH)/tksm.o

SRC_FILES = tag.cpp truncate.cpp transcribe.cpp scb.cpp sequence.cpp polyA.cpp pcr.cpp model_truncation.cpp abundance.cpp strand_man.cpp filter.cpp random_wgs.cpp shuffle.cpp unsegment.cpp append_noise.cpp mutate.cpp

SRC_FILES = tag.cpp truncate.cpp transcribe.cpp scb.cpp sequence.cpp polyA.cpp pcr.cpp model_truncation.cpp abundance.cpp strand_man.cpp filter.cpp random_wgs.cpp shuffle.cpp unsegment.cpp append_noise.cpp mutate.cpp cut.cpp interval.cpp size_selection.cpp plasmids.cpp fa2mdf.cpp mdf.cpp

#Append SRC_PATH to SRC_FILES
SRC_FILES := $(addprefix $(SRC_PATH)/,$(SRC_FILES))
Expand Down
27 changes: 26 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ There are four types of modules: entry-point modules, core modules, utility modu
| Utility | `Badread`| Build base-level quality and error models using Badread |
| Utility | `KDE` | Build truncation model |
| Entry | `Tsb` | Generate transcripts molecules from GTF and expression data |
| Entry | `Wgs` | Generate fragments with length distribution Whole genome shotgun |
| Entry | `Mrg` | Merge output of one or more pipelines into a single MDF |
| Core | `plA` | Add polyA tail to molecules |
| Core | `Trc` | Truncate the molecules |
Expand Down Expand Up @@ -414,6 +415,24 @@ The `Tsb` outputs an MDF file with a record for each transcript molecule sampled
The cellular barcode for each molecule is added as a comment to the molecule header line:
`CB=<cellular_barcode>;` or `CB;` if the molecule has no cellular barcode.

#### Random Whole Genome Shotgun
```bash
tksm random-wgs [arguments]
```
Generates MDF file with a whole genome shotgun of the given reference sequences.
The `Wgs` module has following arguments:

| short | long | Description |
| - | - | - |
| -r | --reference arg | Whole genome reference file (indexed with samtools faidx) |
| | --frag-len-dist arg | quoted string of distribution parameters formatted as "[distribution] [params]...". Example: "normal 350 50" will simulate normal distribution with mean 350 and stdev 50. Implemented dists: [normal, lognormal, uniform, exponential]. |
| -o | --output arg | Output mdf file |
| | --base-count arg | Number of bases to be simulated|
| | --depth | Genome depth to be simulated |
| | --circular | Simulate circular genomes |

Either `--base-count` or `--depth` should be used.

#### Merging
The `Mrg` module concatenates a list of MDFs into a single MDF.
It is phoney module; it is really just a `cat` Linux command that is run by Snakemake.
Expand Down Expand Up @@ -478,14 +497,18 @@ The `Flt` module has these arguments:
| - | - | - |
| `-t` | `--true-output` | Output MDF file for molecules that pass the filter. |
| `-f` | `--false-output` | Output MDF file for molecules that fail the filter. |
| `-c` | `--condition` | Comma separated conditions to filter (and-ed together). |
| `-c` | `--if` | Comma separated conditions to filter (and-ed together). |
| `-n` | `--not` | Comma separated negated conditions to filter (and-ed together). |
| | `--negate` | Negate the conjunction of the condition(s). |
| | `--or` | Use `or` instead of `and` to combine multiple conditions|

The implemented conditions are:

- `info`: Check if a non-empty info tag exists in a molecule.
- `size`: Filters molecules w.r.t their size [<, >, <=, >= , ==, !=].
- `locus`: is similar to `samtools view` selection. If any of the molecule's intervals overlaps with the specified location or range, TKSM will consider the condition fulfilled.
- `id`: Check if id of a molecule matches the given regular expression ([Ecmascript](https://en.cppreference.com/w/cpp/regex/ecmascript) format)
- `rand`: Randomly split the molecules with the given probability

Examples:

Expand All @@ -496,6 +519,8 @@ Examples:
- `-c "locus chr1:1000-1500"`: Selects molecules that overlap with chr1 between position 1000 and 1500.
- `-c "locus AGATCGGAAGAGCGTCGTGTAG"`: Selects molecules with this "*contig*". While this is not a real contig name, modules such as `Tag` and `SCB` add put the sequence of the tag as the contig name ([see above](#mdf-format))

Multiple conditions can be passed by either separating by comma or calling `--if` or `--not` multiple times.

#### PCR
The `PCR` module is used to simulate PCR amplification.
The module's source code is in `src/pcr.cpp`.
Expand Down
2 changes: 2 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ exprmnts_re = "|".join([re.escape(x) for x in config["TS_experiments"]])

DEBUG = False

if "models" not in config:
config["models"] = {}
for sample in config["samples"]:
for mtype in ["Tsb", "Trc", "Seq"]:
if mtype not in config["models"]:
Expand Down
32 changes: 25 additions & 7 deletions py/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from tqdm import tqdm
import tksm_badread

from random import sample

def set_tksm_models_dicts(env_var="TKSM_MODELS"):
var = os.getenv(env_var)
Expand Down Expand Up @@ -112,6 +112,12 @@ def parse_args():
help="Badread tail model name or file path. "
+ f"Available model names: [{', '.join(tksm_badread.TAIL_NOISE_MODEL_PY.tail_model_names)}]",
)
parser.add_argument(
"--replace-non-agtc",
action='store_true',
default=False,
help="Replace non AGTC characters according to IUPAC list"
)

class ListPrinter(argparse.Action):
def __call__(self, parser, namespace, values, option_string):
Expand Down Expand Up @@ -164,8 +170,17 @@ def __call__(self, parser, namespace, values, option_string):
parser.error("Must specify either --output or --perfect.")
return args


def generate_fasta(fasta):
cm = {'W':['A','T'], 'S':['C','G'],'M':['A','C'],'K':['G','T'],'R':['A','G'],'Y':['C','T'],
'B':['C','G','T'],'D':['A','G','T'],'H':['A','C','T'],'V':['A','C','G'],
'N':['A','C','G','T']}
def charmap(c):
if c in {'A','G','T','C','a','g','t','c','U','u'}:
return c
elif c.upper() in cm:
return sample(cm[c.upper()],1)[0]
else:
return sample(cm['N'],1)[0]
def generate_fasta(fasta, replace_non_agtc):
name = ""
seq = list()
if fasta.endswith(".gz"):
Expand All @@ -182,15 +197,18 @@ def generate_fasta(fasta):
seq = list()
name = l[1:].split(" ")[0]
else:
seq.append(l)
if replace_non_agtc:
seq.append("".join([charmap(c) for c in l]))
else:
seq.append(l)
yield (name, "".join(seq))


def get_reference_seqs(reference):
def get_reference_seqs(reference, replace_non_agtc=False):
reference_seqs = dict()
for ref in reference:
print(f"Loading reference {ref}...")
reference_seqs.update({name: seq for name, seq in generate_fasta(ref)})
reference_seqs.update({name: seq for name, seq in generate_fasta(ref, replace_non_agtc)})
return reference_seqs


Expand Down Expand Up @@ -323,7 +341,7 @@ def mdf_to_seq(mdf, targets=dict()):
if __name__ == "__main__":
set_tksm_models_dicts()
args = parse_args()
reference_seqs = get_reference_seqs(args.references)
reference_seqs = get_reference_seqs(args.references, args.replace_non_agtc)

targets = dict()
target_outfiles = dict()
Expand Down
181 changes: 181 additions & 0 deletions src/cut.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#include "cut.h"

#include <cxxopts.hpp>
#include <fstream>
#include <random>
#include <string>
#include <vector>

#include "interval.h"
#include "mdf.h"
#include "module.h"
#include "util.h"
#include "stats.h"

using std::ifstream;
using std::ofstream;
using std::string;
using std::vector;

#include "pimpl.h"
inline std::pair<molecule_descriptor, molecule_descriptor>
split_molecule(const molecule_descriptor &md, int split_pos) {
int bases_so_far = 0;
molecule_descriptor md1;
molecule_descriptor md2;
for (const einterval &g : md.cget_segments()) {
if (g.size() + bases_so_far < split_pos) {
md1.append_segment(g);
}
else {
if (bases_so_far < split_pos) { // Transition interval
einterval cpy = g;

einterval cpy2 = g;
if (g.plus_strand) {
cpy.truncate(0, split_pos - bases_so_far);
cpy2.truncate(split_pos - bases_so_far, cpy2.size());
}
else {
cpy.truncate(split_pos - bases_so_far, cpy.size());
cpy2.truncate(0, split_pos - bases_so_far);
}
md1.append_segment(cpy);
md2.append_segment(cpy2);
}
else {
md2.append_segment(g);
}
}
bases_so_far += g.size();
}
return {md1, md2};
}
class Cut_module::impl : public tksm_module {

static vector<molecule_descriptor> split_molecule_n(const molecule_descriptor &md, const vector<double> &cuts){
vector<molecule_descriptor> result;
std::pair<molecule_descriptor, molecule_descriptor> mdp = {{}, md};
int index = 0;
for( double d : cuts){
mdp = split_molecule(mdp.second, d);
if(mdp.first.size() == 0 ){
logd("Molecule of size zero generated, skipping");
continue;
}
result.push_back(mdp.first);
result.back().id(fmt::format("{}_C{}",md.get_id(), index++))->depth(1);
}
return result;
}
static vector<molecule_descriptor> cut_molecule(const molecule_descriptor &md, double probability, auto &randgen) {
std::binomial_distribution<size_t> number_of_cuts_distribution{md.size(), probability};

size_t number_of_cuts = number_of_cuts_distribution(randgen);
if (number_of_cuts == 0) {
return {md};
}
vector<int> dirichelet_weights(number_of_cuts + 1, 1); // TODO: Explore non uniform dirichelet
dirichelet_distribution<> dirichelet{
dirichelet_weights, static_cast<double>(md.size())}; // TODO: Maybe add a constructor that accepts number
// of params and sets them to one instead
vector<double> cuts = dirichelet(randgen);
if (md.has_comment("circular")) {
std::uniform_int_distribution<unsigned long> uniform_dist(0, md.size());
size_t pos = uniform_dist(randgen);
auto mdp = split_molecule(md, pos);

// Find start position.
for (const einterval &ee : mdp.first.get_segments()) {
mdp.second.append_segment(ee);
}
mdp.second.id(md.get_id());
cuts.pop_back();
return split_molecule_n(mdp.second, cuts);
}
return split_molecule_n(md, cuts);
}

cxxopts::ParseResult parse(int argc, char **argv) {
// clang-format off
options.add_options("main")
(
"i,input",
"input mdf file",
cxxopts::value<string>()
)(
"o,output",
"output mdf file",
cxxopts::value<string>()
)(
"p,probability",
"Per base probability of fragmentation",
cxxopts::value<double>()
)
;
// clang-format on
return options.parse(argc, argv);
}

cxxopts::ParseResult args;

public:
impl(int argc, char **argv) : tksm_module{"<MODULE>", "<MODULE> description"}, args(parse(argc, argv)) {}

~impl() = default;

int validate_arguments() {
std::vector<string> mandatory = {"input", "output", "probability"};
int missing_parameters = 0;
for (string &param : mandatory) {
if (args.count(param) == 0) {
loge("{} is required!", param);
++missing_parameters;
}
}
// Other parameter checks here

if (missing_parameters > 0) {
fmt::print(stderr, "{}\n", options.help());
return 1;
}
return 0;
}
int run() {
if (process_utility_arguments(args)) {
return 0;
}
if (validate_arguments()) {
return 1;
}
describe_program();

string input_file = args["input"].as<string>();

string output_file = args["output"].as<string>();

double probability = args["probability"].as<double>();

ifstream input(input_file);

ofstream output(output_file);
std::uniform_int_distribution<long> lendist(0, LONG_MAX);
for (auto &md : stream_mdf(input, true)) {
auto mds = cut_molecule(md, probability, rand_gen);
for(const auto &mm : mds){
output << mm;
}
}
return 0;
}

void describe_program() {
logi("Running [MODULE]");
logi("Input file: {}", args["input"].as<string>());
logi("Output file: {}", args["output"].as<string>());
// Other parameters logs are here
fmtlog::poll(true);
}
};

MODULE_IMPLEMENT_PIMPL_CLASS(Cut_module);
7 changes: 7 additions & 0 deletions src/cut.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#ifndef _CUT_H
#define _CUT_H
#include "pimpl.h"

MODULE_DECLARE_PIMPL_CLASS(Cut_module);

#endif
Loading
Loading