vpc-ccg · f0t1h · Apr 3, 2024 · Apr 4, 2024 · Apr 4, 2024 · Apr 4, 2024
diff --git a/Makefile b/Makefile
@@ -1,10 +1,6 @@
-# Check lld exists
 
-LLD_EXISTS := $(shell command -v ld.lld 2> /dev/null)
+#XXXFLAGS += -fuse-ld=mold
 
-ifdef LLD_EXISTS
-	CXXFLAGS += -fuse-ld=lld
-endif
 
 SRC_PATH = src
 BUILD_PATH = build
@@ -54,7 +50,8 @@ EXEC_ABS := $(abspath ${EXEC})
 MAIN_FILE =  tksm.cpp
 MAIN_OBJECT = $(OBJ_PATH)/tksm.o
 
-SRC_FILES =  tag.cpp truncate.cpp transcribe.cpp scb.cpp sequence.cpp polyA.cpp pcr.cpp model_truncation.cpp abundance.cpp strand_man.cpp filter.cpp random_wgs.cpp shuffle.cpp unsegment.cpp append_noise.cpp mutate.cpp
+
+SRC_FILES =  tag.cpp truncate.cpp transcribe.cpp scb.cpp sequence.cpp polyA.cpp pcr.cpp model_truncation.cpp abundance.cpp strand_man.cpp filter.cpp random_wgs.cpp shuffle.cpp unsegment.cpp append_noise.cpp mutate.cpp cut.cpp interval.cpp size_selection.cpp plasmids.cpp fa2mdf.cpp mdf.cpp
 
 #Append SRC_PATH to SRC_FILES
 SRC_FILES := $(addprefix $(SRC_PATH)/,$(SRC_FILES))

diff --git a/README.md b/README.md
@@ -79,6 +79,7 @@ There are four types of modules: entry-point modules, core modules, utility modu
 | Utility | `Badread`| Build base-level quality and error models using Badread     |
 | Utility | `KDE`    | Build truncation model                                      |
 | Entry   | `Tsb`    | Generate transcripts molecules from GTF and expression data |
+| Entry   | `Wgs`    | Generate fragments with length distribution Whole genome shotgun |
 | Entry   | `Mrg`    | Merge output of one or more pipelines into a single MDF     |
 | Core    | `plA`    | Add polyA tail to molecules                                 |
 | Core    | `Trc`    | Truncate the molecules                                      |
@@ -414,6 +415,24 @@ The `Tsb` outputs an MDF file with a record for each transcript molecule sampled
 The cellular barcode for each molecule is added as a comment to the molecule header line:
 `CB=<cellular_barcode>;` or `CB;` if the molecule has no cellular barcode. 
 
+#### Random Whole Genome Shotgun
+```bash
+tksm random-wgs [arguments]
+```
+Generates MDF file with a whole genome shotgun of the given reference sequences.
+The `Wgs` module has following arguments:
+
+| short | long | Description |
+| - | - | - |
+| -r | --reference arg | Whole genome reference file (indexed with samtools faidx) |
+| | --frag-len-dist arg | quoted string of distribution parameters formatted as "[distribution] [params]...". Example: "normal 350 50" will simulate normal distribution with mean 350 and stdev 50. Implemented dists: [normal, lognormal, uniform, exponential]. |
+| -o | --output arg | Output mdf file |
+| | --base-count arg | Number of bases to be simulated|
+| | --depth | Genome depth to be simulated |
+| | --circular | Simulate circular genomes |
+
+Either `--base-count` or `--depth` should be used.
+
 #### Merging
 The `Mrg` module concatenates a list of MDFs into a single MDF.
 It is phoney module; it is really just a `cat` Linux command that is run by Snakemake.
@@ -478,14 +497,18 @@ The `Flt` module has these arguments:
 | -     | -                 | - |
 | `-t`  | `--true-output`   | Output MDF file for molecules that pass the filter. |
 | `-f`  | `--false-output`  | Output MDF file for molecules that fail the filter. |
-| `-c`  | `--condition`     | Comma separated conditions to filter (and-ed together). |
+| `-c`  | `--if`            | Comma separated conditions to filter (and-ed together). |
+| `-n`  | `--not`           | Comma separated negated conditions to filter (and-ed together). |
 |       | `--negate`        | Negate the conjunction of the condition(s). |
+|       | `--or`            | Use `or` instead of `and` to combine multiple conditions|
 
 The implemented conditions are:
 
 - `info`: Check if a non-empty info tag exists in a molecule.
 - `size`: Filters molecules w.r.t their size [<, >, <=, >= , ==, !=].
 - `locus`: is similar to `samtools view` selection. If any of the molecule's intervals overlaps with the specified location or range, TKSM will consider the condition fulfilled.
+- `id`: Check if id of a molecule matches the given regular expression ([Ecmascript](https://en.cppreference.com/w/cpp/regex/ecmascript) format) 
+- `rand`: Randomly split the molecules with the given probability
 
 Examples:
 
@@ -496,6 +519,8 @@ Examples:
 - `-c "locus chr1:1000-1500"`: Selects molecules that overlap with chr1 between position 1000 and 1500.
 - `-c "locus AGATCGGAAGAGCGTCGTGTAG"`: Selects molecules with this "*contig*". While this is not a real contig name, modules such as `Tag` and `SCB` add put the sequence of the tag as the contig name ([see above](#mdf-format))
 
+Multiple conditions can be passed by either separating by comma or calling `--if` or `--not` multiple times.
+
 #### PCR
 The `PCR` module is used to simulate PCR amplification.
 The module's source code is in `src/pcr.cpp`.

diff --git a/Snakefile b/Snakefile
@@ -16,6 +16,8 @@ exprmnts_re = "|".join([re.escape(x) for x in config["TS_experiments"]])
 
 DEBUG = False
 
+if "models" not in config:
+    config["models"] = {}
 for sample in config["samples"]:
     for mtype in ["Tsb", "Trc", "Seq"]:
         if mtype not in config["models"]:

diff --git a/py/sequence.py b/py/sequence.py
@@ -12,7 +12,7 @@
 
 from tqdm import tqdm
 import tksm_badread
-
+from random import sample
 
 def set_tksm_models_dicts(env_var="TKSM_MODELS"):
     var = os.getenv(env_var)
@@ -112,6 +112,12 @@ def parse_args():
         help="Badread tail model name or file path. "
         + f"Available model names: [{', '.join(tksm_badread.TAIL_NOISE_MODEL_PY.tail_model_names)}]",
     )
+    parser.add_argument(
+        "--replace-non-agtc",
+        action='store_true',
+        default=False,
+        help="Replace non AGTC characters according to IUPAC list"
+    )
 
     class ListPrinter(argparse.Action):
         def __call__(self, parser, namespace, values, option_string):
@@ -164,8 +170,17 @@ def __call__(self, parser, namespace, values, option_string):
         parser.error("Must specify either --output or --perfect.")
     return args
 
-
-def generate_fasta(fasta):
+cm = {'W':['A','T'], 'S':['C','G'],'M':['A','C'],'K':['G','T'],'R':['A','G'],'Y':['C','T'],
+      'B':['C','G','T'],'D':['A','G','T'],'H':['A','C','T'],'V':['A','C','G'],
+      'N':['A','C','G','T']}
+def charmap(c):
+    if c in {'A','G','T','C','a','g','t','c','U','u'}:
+        return c
+    elif c.upper() in cm:
+        return sample(cm[c.upper()],1)[0] 
+    else:
+        return sample(cm['N'],1)[0]
+def generate_fasta(fasta, replace_non_agtc):
     name = ""
     seq = list()
     if fasta.endswith(".gz"):
@@ -182,15 +197,18 @@ def generate_fasta(fasta):
             seq = list()
             name = l[1:].split(" ")[0]
         else:
-            seq.append(l)
+            if replace_non_agtc:
+                seq.append("".join([charmap(c) for c in l]))
+            else:
+                seq.append(l)
     yield (name, "".join(seq))
 
 
-def get_reference_seqs(reference):
+def get_reference_seqs(reference, replace_non_agtc=False):
     reference_seqs = dict()
     for ref in reference:
         print(f"Loading reference {ref}...")
-        reference_seqs.update({name: seq for name, seq in generate_fasta(ref)})
+        reference_seqs.update({name: seq for name, seq in generate_fasta(ref, replace_non_agtc)})
     return reference_seqs
 
 
@@ -323,7 +341,7 @@ def mdf_to_seq(mdf, targets=dict()):
 if __name__ == "__main__":
     set_tksm_models_dicts()
     args = parse_args()
-    reference_seqs = get_reference_seqs(args.references)
+    reference_seqs = get_reference_seqs(args.references, args.replace_non_agtc)
 
     targets = dict()
     target_outfiles = dict()

diff --git a/src/cut.cpp b/src/cut.cpp
@@ -0,0 +1,181 @@
+#include "cut.h"
+
+#include <cxxopts.hpp>
+#include <fstream>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "interval.h"
+#include "mdf.h"
+#include "module.h"
+#include "util.h"
+#include "stats.h"
+
+using std::ifstream;
+using std::ofstream;
+using std::string;
+using std::vector;
+
+#include "pimpl.h"
+inline std::pair<molecule_descriptor, molecule_descriptor>
+split_molecule(const molecule_descriptor &md, int split_pos) {
+    int bases_so_far = 0;
+    molecule_descriptor md1;
+    molecule_descriptor md2;
+    for (const einterval &g : md.cget_segments()) {
+        if (g.size() + bases_so_far < split_pos) {
+            md1.append_segment(g);
+        }
+        else {
+            if (bases_so_far < split_pos) {  // Transition interval
+                einterval cpy = g;
+
+                einterval cpy2 = g;
+                if (g.plus_strand) {
+                    cpy.truncate(0, split_pos - bases_so_far);
+                    cpy2.truncate(split_pos - bases_so_far, cpy2.size());
+                }
+                else {
+                    cpy.truncate(split_pos - bases_so_far, cpy.size());
+                    cpy2.truncate(0, split_pos - bases_so_far);
+                }
+                md1.append_segment(cpy);
+                md2.append_segment(cpy2);
+            }
+            else {
+                md2.append_segment(g);
+            }
+        }
+        bases_so_far += g.size();
+    }
+    return {md1, md2};
+}
+class Cut_module::impl : public tksm_module {
+
+    static vector<molecule_descriptor> split_molecule_n(const molecule_descriptor &md, const vector<double> &cuts){
+        vector<molecule_descriptor> result;
+        std::pair<molecule_descriptor, molecule_descriptor> mdp = {{}, md};
+        int index = 0;
+        for( double d : cuts){
+            mdp = split_molecule(mdp.second, d);
+            if(mdp.first.size() == 0 ){
+                logd("Molecule of size zero generated, skipping");
+                continue;
+            }
+            result.push_back(mdp.first);
+            result.back().id(fmt::format("{}_C{}",md.get_id(),  index++))->depth(1);
+        }
+        return result;
+    }
+    static vector<molecule_descriptor> cut_molecule(const molecule_descriptor &md, double probability, auto &randgen) {
+        std::binomial_distribution<size_t> number_of_cuts_distribution{md.size(), probability};
+
+        size_t number_of_cuts = number_of_cuts_distribution(randgen);
+        if (number_of_cuts == 0) {
+            return {md};
+        }
+        vector<int> dirichelet_weights(number_of_cuts + 1, 1);  // TODO: Explore non uniform dirichelet
+        dirichelet_distribution<> dirichelet{
+            dirichelet_weights, static_cast<double>(md.size())};  // TODO: Maybe add a constructor that accepts number
+                                                                  // of params and sets them to one instead
+        vector<double> cuts = dirichelet(randgen);
+        if (md.has_comment("circular")) {
+            std::uniform_int_distribution<unsigned long> uniform_dist(0, md.size());
+            size_t pos = uniform_dist(randgen);
+            auto mdp = split_molecule(md, pos);
+
+            // Find start position.
+            for (const einterval &ee : mdp.first.get_segments()) {
+                mdp.second.append_segment(ee);
+            }
+            mdp.second.id(md.get_id());
+            cuts.pop_back();
+            return split_molecule_n(mdp.second, cuts); 
+        }
+        return split_molecule_n(md, cuts);
+    }
+
+    cxxopts::ParseResult parse(int argc, char **argv) {
+        // clang-format off
+        options.add_options("main")
+            (
+                "i,input",
+                "input mdf file",
+                cxxopts::value<string>()
+            )(
+                "o,output",
+                "output mdf file",
+                cxxopts::value<string>()
+            )(
+                "p,probability",
+                "Per base probability of fragmentation",
+                cxxopts::value<double>()
+             )
+            ;
+        // clang-format on
+        return options.parse(argc, argv);
+    }
+
+    cxxopts::ParseResult args;
+
+public:
+    impl(int argc, char **argv) : tksm_module{"<MODULE>", "<MODULE> description"}, args(parse(argc, argv)) {}
+
+    ~impl() = default;
+
+    int validate_arguments() {
+        std::vector<string> mandatory = {"input", "output", "probability"};
+        int missing_parameters        = 0;
+        for (string &param : mandatory) {
+            if (args.count(param) == 0) {
+                loge("{} is required!", param);
+                ++missing_parameters;
+            }
+        }
+        // Other parameter checks here
+
+        if (missing_parameters > 0) {
+            fmt::print(stderr, "{}\n", options.help());
+            return 1;
+        }
+        return 0;
+    }
+    int run() {
+        if (process_utility_arguments(args)) {
+            return 0;
+        }
+        if (validate_arguments()) {
+            return 1;
+        }
+        describe_program();
+
+        string input_file = args["input"].as<string>();
+
+        string output_file = args["output"].as<string>();
+
+        double probability = args["probability"].as<double>();
+
+        ifstream input(input_file);
+
+        ofstream output(output_file);
+        std::uniform_int_distribution<long> lendist(0, LONG_MAX);
+        for (auto &md : stream_mdf(input, true)) {
+            auto mds = cut_molecule(md, probability, rand_gen);
+            for(const auto &mm : mds){
+                output << mm;
+            }
+        }
+        return 0;
+    }
+
+    void describe_program() {
+        logi("Running [MODULE]");
+        logi("Input file: {}", args["input"].as<string>());
+        logi("Output file: {}", args["output"].as<string>());
+        // Other parameters logs are here
+        fmtlog::poll(true);
+    }
+};
+
+MODULE_IMPLEMENT_PIMPL_CLASS(Cut_module);
diff --git a/src/cut.h b/src/cut.h
@@ -0,0 +1,7 @@
+#ifndef _CUT_H
+#define _CUT_H
+#include "pimpl.h"
+
+MODULE_DECLARE_PIMPL_CLASS(Cut_module);
+
+#endif