Incorporate an evaluate function into model controller & a number of other refactoring steps towards a training and evaluation pipeline. #25
Changes from all commits
Diff: example training configuration (YAML):

@@ -1,25 +1,40 @@
 ---
+model: {
+    non_strand_specific: {
+        use_module: True,
+        mode: mean
+    },
+    import_model_from: models.deepsea,
+    class: DeepSEA
+}
 sampler: !obj:selene.samplers.IntervalsSampler {
+    genome: /scratch/data_hg/male.hg19.fasta,
+    genomic_features: /scratch/data_hg/sorted_sv_aggregate.bed.gz,
+    distinct_features: /scratch/data_hg/distinct_features.txt,
+    sample_from_regions: /scratch/data_hg/TFs_coords_only.txt,
     test_holdout: [8, 9],
     validation_holdout: [6, 7],
     random_seed: 127,
     sequence_length: 1001,
     center_bin_to_predict: 201,
-    default_threshold: 0.5,
+    feature_thresholds: 0.5,
     mode: "train"
 }
 model_controller: !obj:selene.ModelController {
     batch_size: 64,
     max_steps: 500000,
-    report_metrics_every_n_steps: 16000,
-    n_validation_samples: 3200,
+    report_stats_every_n_steps: 16000,
+    n_validation_samples: 32000,
     optional_args: {
         cpu_n_threads: 32,
         use_cuda: True,
         data_parallel: False
     },
-    logging_verbosity: 2
-}
+    checkpoint: {
+        resume: False
+    }
+}
+output_dir: /tigress/TROYANSKAYA/kathy/example_outputs
+evaluate_on_test: True
 ...
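The !obj: tags mark entries that selene's configuration loader turns into deferred constructors rather than plain dictionaries, which is what lets selene.py (diffed below) inspect and re-bind their keyword arguments before anything is built. A minimal sketch of that consumption path, assuming only the load_path/instantiate behavior visible in the selene.py diff below; the file name example_train.yml is hypothetical:

    from selene.utils import load_path, instantiate

    # Parse the YAML without constructing objects; each `!obj:` entry stays
    # a deferred constructor whose keyword arguments can still be changed.
    configs = load_path("example_train.yml", instantiate=False)

    sampler_info = configs["sampler"]    # deferred IntervalsSampler
    sampler = instantiate(sampler_info)  # actually construct it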
This file was deleted.
Diff: models/non_strand_specific_module.py (new file):

@@ -0,0 +1,44 @@
import torch
from torch.nn.modules import Module


def flip(x, dim):
    """Reverses the elements in a given dimension `dim` of the Tensor.

    source: https://github.com/pytorch/pytorch/issues/229
    """
    xsize = x.size()
    dim = x.dim() + dim if dim < 0 else dim
    x = x.contiguous()
    x = x.view(-1, *xsize[dim:])
    x = x.view(
        x.size(0), x.size(1), -1)[:, getattr(
            torch.arange(x.size(1) - 1, -1, -1),
            ('cpu', 'cuda')[x.is_cuda])().long(), :]
    return x.view(xsize)


class NonStrandSpecific(Module):
    def __init__(self, model, mode="mean"):
        super(NonStrandSpecific, self).__init__()

        self.model = model

        if mode != "mean" and mode != "max":
            raise ValueError(f"Mode should be one of 'mean' or 'max' but was "
                             f"{mode!r}.")
        self.mode = mode

    def forward(self, input):
        reverse_input = flip(flip(input, 1), 2)

        output = self.model.forward(input)
        output_from_rev = self.model.forward(reverse_input)
        if self.mode == "mean":
            return (output + output_from_rev) / 2
        else:
            return torch.max(output, output_from_rev)

Review comment on reverse_input = flip(flip(input, 1), 2): This is based on the assumption that the sequence is encoded in such a way that we can just "flip" the indices in the matrix and get the reverse sequence. I'll need to document that encoding.
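The review comment above is worth pinning down, because forward derives the reverse strand purely by flipping tensor indices. A small sanity check of that assumption, written with the modern torch.flip (equivalent to flip(flip(x, 1), 2) here) and assuming an (A, C, G, T) channel order in which complementary bases sit at mirrored channel indices; that ordering is an assumption, not something this PR states:

    import torch

    # One-hot "ACT" as (batch, position, channel) with channel order A, C, G, T.
    seq = torch.tensor([[[1., 0., 0., 0.],   # A
                         [0., 1., 0., 0.],   # C
                         [0., 0., 0., 1.]]]) # T
    seq = seq.transpose(1, 2)  # -> (batch, channel, position), as conv models expect

    # Flipping channels maps A<->T and C<->G (complement); flipping positions
    # reverses the sequence. Together: the reverse complement, "AGT".
    rev_comp = torch.flip(seq, dims=(1, 2))
    print(rev_comp.transpose(1, 2))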
Diff: selene.py (module docstring):

@@ -6,126 +6,61 @@
     Saves model to a user-specified output file.

 Usage:
-    selene.py <import-module> <model-class-name> <lr>
-        <paths-yml> <train-model-yml>
-        [-s | --stdout] [--verbosity=<level>]
+    selene.py <lr> <config-yml>
     selene.py -h | --help

 Options:
     -h --help               Show this screen.

-    <import-module>         Import the module containing the model
-    <model-class-name>      Must be a model class in the imported module
     <lr>                    Choose the optimizer's learning rate
-    <paths-yml>             Input data and output filepaths
-    <train-model-yml>       Model-specific parameters
-    -s --stdout             Will also output logging information to stdout
-                            [default: False]
-    --verbosity=<level>     Logging verbosity level (0=WARN, 1=INFO, 2=DEBUG)
-                            [default: 1]
+    <config-yml>            Model-specific parameters
 """

Review comment on <config-yml>: Improve the documentation here.
Diff: selene.py (continued; imports and run setup):

 import importlib
-import logging
-import os
-from time import strftime, time

 from docopt import docopt
 import torch

-from selene.model_train import ModelController
-from selene.samplers import IntervalsSampler
-from selene.utils import initialize_logger, read_yaml_file
-from selene.utils import load, load_path, instantiate
+from selene.utils import load_path, instantiate

 if __name__ == "__main__":
     arguments = docopt(
         __doc__,
         version="1.0")
-    import_model_from = arguments["<import-module>"]
-    model_class_name = arguments["<model-class-name>"]
-    use_module = importlib.import_module(import_model_from)
-    model_class = getattr(use_module, model_class_name)
-
     lr = float(arguments["<lr>"])

-    paths = read_yaml_file(
-        arguments["<paths-yml>"])
-
-    train_model = load_path(arguments["<train-model-yml>"], instantiate=False)
-
-    ##################################################
-    # PATHS
-    ##################################################
-    dir_path = paths["dir_path"]
-    files = paths["files"]
-    genome_fasta = os.path.join(
-        dir_path, files["genome"])
-    genomic_features = os.path.join(
-        dir_path, files["genomic_features"])
-    coords_only = os.path.join(
-        dir_path, files["sample_from_regions"])
-    distinct_features = os.path.join(
-        dir_path, files["distinct_features"])
-
-    output_dir = paths["output_dir"]
-    os.makedirs(output_dir, exist_ok=True)
-
-    current_run_output_dir = os.path.join(
-        output_dir, strftime("%Y-%m-%d-%H-%M-%S"))
-    os.makedirs(current_run_output_dir)
+    configs = load_path(arguments["<config-yml>"], instantiate=False)

     ##################################################
     # TRAIN MODEL PARAMETERS
     ##################################################
-    sampler_info = train_model["sampler"]
-    model_controller_info = train_model["model_controller"]
+    model_info = configs["model"]
+    sampler_info = configs["sampler"]
+    model_controller_info = configs["model_controller"]

-    ##################################################
-    # OTHER ARGS
-    ##################################################
-    to_stdout = arguments["--stdout"]
-    verbosity_level = int(arguments["--verbosity"])
-
-    initialize_logger(
-        os.path.join(current_run_output_dir, "{0}.log".format(__name__)),
-        verbosity=verbosity_level,
-        stdout_handler=to_stdout)
-    logger = logging.getLogger("selene")
-
-    t_i = time()
-    feature_thresholds = None
-    if "specific_feature_thresholds" in sampler_info.keywords:
-        feature_thresholds = sampler_info.pop("specific_feature_thresholds")
-    else:
-        feature_thresholds = None
-    if "default_threshold" in sampler_info.keywords:
-        if feature_thresholds:
-            feature_thresholds["default"] = sampler_info.pop("default_threshold")
-        else:
-            feature_thresholds = sampler_info.pop("default_threshold")
-
-    if feature_thresholds:
-        sampler_info.bind(feature_thresholds=feature_thresholds)
-    sampler_info.bind(genome=genome_fasta,
-                      query_feature_data=genomic_features,
-                      distinct_features=distinct_features,
-                      intervals_file=coords_only)
     sampler = instantiate(sampler_info)

-    t_i_model = time()
     torch.manual_seed(1337)
     torch.cuda.manual_seed_all(1337)

+    import_model_from = model_info["import_model_from"]
+    model_class_name = model_info["class"]
+    use_module = importlib.import_module(import_model_from)
+    model_class = getattr(use_module, model_class_name)
+
     model = model_class(sampler.sequence_length, sampler.n_features)
     print(model)

+    if model_info["non_strand_specific"]["use_module"]:
+        from models.non_strand_specific_module import NonStrandSpecific
+        model = NonStrandSpecific(
+            model, mode=model_info["non_strand_specific"]["mode"])
+
     checkpoint_info = model_controller_info.pop("checkpoint")
     checkpoint_resume = checkpoint_info.pop("resume")
     checkpoint = None
     if checkpoint_resume:
         checkpoint_file = checkpoint_info.pop("model_file")
-        logger.info("Resuming training from checkpoint {0}.".format(
+        print("Resuming training from checkpoint {0}.".format(
             checkpoint_file))
         checkpoint = torch.load(checkpoint_file)
         model.load_state_dict(checkpoint["state_dict"])
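The resume branch above reads exactly one field, "state_dict", from the loaded checkpoint (plus a model_file path supplied under the config's checkpoint block). A minimal sketch of writing a file this branch could consume; any additional keys selene's ModelController actually saves are not shown in this diff, and the output path is hypothetical:

    import torch

    # Enough for `model.load_state_dict(checkpoint["state_dict"])` above.
    torch.save({"state_dict": model.state_dict()}, "example_checkpoint.pth.tar")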
Diff: selene.py (continued; training and evaluation):

@@ -135,45 +70,26 @@
     criterion = use_module.criterion()
     optimizer_class, optimizer_args = use_module.get_optimizer(lr)

-    t_f_model = time()
-    logger.debug(
-        "Finished initializing the {0} model from module {1}: {2} s".format(
-            model.__class__.__name__,
-            import_model_from,
-            t_f_model - t_i_model))
-
-    logger.info(model)
-    logger.info(optimizer_args)
-
-    if feature_thresholds:
-        sampler_info.bind(feature_thresholds=feature_thresholds)
-    sampler_info.bind(genome=genome_fasta,
-                      query_feature_data=genomic_features,
-                      distinct_features=distinct_features,
-                      intervals_file=coords_only)
-    sampler = instantiate(sampler_info)
-
     batch_size = model_controller_info.keywords["batch_size"]  # Would love to find a better way.
     max_steps = model_controller_info.keywords["max_steps"]
-    report_metrics_every_n_steps = \
-        model_controller_info.keywords["report_metrics_every_n_steps"]
+    report_stats_every_n_steps = \
+        model_controller_info.keywords["report_stats_every_n_steps"]
     n_validation_samples = model_controller_info.keywords["n_validation_samples"]

     model_controller_info.bind(model=model,
                                data_sampler=sampler,
                                loss_criterion=criterion,
                                optimizer_class=optimizer_class,
-                               optimizer_args=optimizer_args,
-                               output_dir=current_run_output_dir)
+                               optimizer_args=optimizer_args)
     if "optional_args" in model_controller_info.keywords:
         optional_args = model_controller_info.pop("optional_args")
         model_controller_info.bind(**optional_args)
     runner = instantiate(model_controller_info)

-    logger.info("Training model: {0} steps, {1} batch size.".format(
+    print("Training model: {0} steps, {1} batch size.".format(
         max_steps, batch_size))
     runner.train_and_validate()

-    t_f = time()
-    logger.info("./train_model.py completed in {0} s.".format(t_f - t_i))
+    if configs["evaluate_on_test"]:
+        runner.evaluate()
+    if configs["save_datasets"]:
+        runner.write_datasets_to_file()
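Note that both end-of-run flags are read unconditionally from the top level of the config, but the example configuration earlier in this diff only sets evaluate_on_test; as written, a config without a save_datasets key would raise a KeyError on the last line. Presumably the full example config also carries something like (value hypothetical):

    save_datasets: True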
Diff: selene/__init__.py:

@@ -1,2 +1,2 @@
-__all__ = ["sequences", "targets", "samplers", "utils"]
+__all__ = ["predict", "sequences", "targets", "samplers", "utils"]
 from .model_train import ModelController
This file was deleted.
Review comment: check that the proper handling for this input is in IntervalsSampler (or OnlineSampler).