
Commit 1c6e856

Merge pull request #24 from evancofer/master
Configs can now use new user content
2 parents cd8770b + 42f4580

File tree

7 files changed (+493, -70 lines)


config_examples/parameters.yml

Lines changed: 21 additions & 21 deletions
@@ -1,25 +1,25 @@
 ---
-sampler:
-    test_holdout:
-        - 8
-        - 9
-    validation_holdout:
-        - 6
-        - 7
-    random_seed: 127
-    sequence_length: 1001
-    center_bin_to_predict: 201
-    default_threshold: 0.5
-    mode: train
-model_controller:
-    batch_size: 64
-    max_steps: 500000
-    report_metrics_every_n_steps: 16000
-    n_validation_samples: 3200
-    optional_args:
-        cpu_n_threads: 32
-        use_cuda: True
+sampler: !obj:selene.samplers.IntervalsSampler {
+    test_holdout: [8, 9],
+    validation_holdout: [6, 7],
+    random_seed: 127,
+    sequence_length: 1001,
+    center_bin_to_predict: 201,
+    default_threshold: 0.5,
+    mode: "train"
+}
+model_controller: !obj:selene.ModelController {
+    batch_size: 64,
+    max_steps: 500000,
+    report_metrics_every_n_steps: 16000,
+    n_validation_samples: 3200,
+    optional_args: {
+        cpu_n_threads: 32,
+        use_cuda: True,
         data_parallel: False
-    checkpoint:
+    },
+    checkpoint: {
         resume: False
+    }
+}
 ...
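The key change here: plain YAML sections become "!obj:" tags naming the class each mapping configures, so the loader can turn a config block directly into a constructor call. Below is a minimal sketch of how such a tag can be handled with PyYAML's multi-constructor hook, assuming a Pylearn2-style convention; selene's actual loader lives in selene/utils/config.py and may resolve and defer construction differently.

# Sketch only: resolving a "!obj:" tag with PyYAML.
import importlib
from functools import partial

import yaml


def _obj_constructor(loader, tag_suffix, node):
    # tag_suffix is the dotted path after "!obj:", e.g.
    # "selene.samplers.IntervalsSampler".
    module_name, _, class_name = tag_suffix.rpartition(".")
    cls = getattr(importlib.import_module(module_name), class_name)
    kwargs = loader.construct_mapping(node, deep=True)
    # Defer construction so the caller can bind more arguments later.
    return partial(cls, **kwargs)


yaml.add_multi_constructor("!obj:", _obj_constructor, Loader=yaml.SafeLoader)

with open("config_examples/parameters.yml") as fh:
    config = yaml.load(fh, Loader=yaml.SafeLoader)
# config["sampler"] now holds a deferred IntervalsSampler constructor.

Returning a partial rather than calling the class immediately mirrors the instantiate=False behavior that selene.py relies on below.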

selene.py

Lines changed: 44 additions & 34 deletions
@@ -33,8 +33,9 @@
 import torch
 
 from selene.model_train import ModelController
-from selene.sampler import IntervalsSampler
+from selene.samplers import IntervalsSampler
 from selene.utils import initialize_logger, read_yaml_file
+from selene.utils import load, load_path, instantiate
 
 if __name__ == "__main__":
     arguments = docopt(
@@ -49,8 +50,9 @@
 
     paths = read_yaml_file(
         arguments["<paths-yml>"])
-    train_model = read_yaml_file(
-        arguments["<train-model-yml>"])
+
+    train_model = load_path(arguments["<train-model-yml>"], instantiate=False)
+
 
     ##################################################
     # PATHS
@@ -93,28 +95,23 @@
 
     t_i = time()
     feature_thresholds = None
-    if "specific_feature_thresholds" in sampler_info:
-        feature_thresholds = sampler_info["specific_feature_thresholds"]
-        del sampler_info["specific_feature_thresholds"]
+    if "specific_feature_thresholds" in sampler_info.keywords:
+        feature_thresholds = sampler_info.pop("specific_feature_thresholds")
     else:
         feature_thresholds = None
-    if "default_threshold" in sampler_info:
+    if "default_threshold" in sampler_info.keywords:
         if feature_thresholds:
-            feature_thresholds["default"] = \
-                sampler_info["default_threshold"]
+            feature_thresholds["default"] = sampler_info.pop("default_threshold")
         else:
-            feature_thresholds = sampler_info["default_threshold"]
-        del sampler_info["default_threshold"]
+            feature_thresholds = sampler_info.pop("default_threshold")
 
     if feature_thresholds:
-        sampler_info["feature_thresholds"] = feature_thresholds
-
-    sampler = IntervalsSampler(
-        genome_fasta,
-        genomic_features,
-        distinct_features,
-        coords_only,
-        **sampler_info)
+        sampler_info.bind(feature_thresholds=feature_thresholds)
+    sampler_info.bind(genome=genome_fasta,
+                      query_feature_data=genomic_features,
+                      distinct_features=distinct_features,
+                      intervals_file=coords_only)
+    sampler = instantiate(sampler_info)
 
     t_i_model = time()
     torch.manual_seed(1337)
@@ -123,16 +120,17 @@
     model = model_class(sampler.sequence_length, sampler.n_features)
     print(model)
 
-    checkpoint_info = model_controller_info["checkpoint"]
-    checkpoint_resume = checkpoint_info["resume"]
+    checkpoint_info = model_controller_info.pop("checkpoint")
+    checkpoint_resume = checkpoint_info.pop("resume")
     checkpoint = None
     if checkpoint_resume:
-        checkpoint_file = checkpoint_info["model_file"]
+        checkpoint_file = checkpoint_info.pop("model_file")
         logger.info("Resuming training from checkpoint {0}.".format(
             checkpoint_file))
         checkpoint = torch.load(checkpoint_file)
         model.load_state_dict(checkpoint["state_dict"])
         model.eval()
+    model_controller_info.bind(checkpoint_resume=checkpoint)
 
     criterion = use_module.criterion()
     optimizer_class, optimizer_args = use_module.get_optimizer(lr)
@@ -147,19 +145,31 @@
     logger.info(model)
     logger.info(optimizer_args)
 
-    batch_size = model_controller_info["batch_size"]
-    max_steps = model_controller_info["max_steps"]
+
+    if feature_thresholds:
+        sampler_info.bind(feature_thresholds=feature_thresholds)
+    sampler_info.bind(genome=genome_fasta,
+                      query_feature_data=genomic_features,
+                      distinct_features=distinct_features,
+                      intervals_file=coords_only)
+    sampler = instantiate(sampler_info)
+
+    batch_size = model_controller_info.keywords["batch_size"]  # Would love to find a better way.
+    max_steps = model_controller_info.keywords["max_steps"]
     report_metrics_every_n_steps = \
-        model_controller_info["report_metrics_every_n_steps"]
-    n_validation_samples = model_controller_info["n_validation_samples"]
-
-    runner = ModelController(
-        model, sampler, criterion, optimizer_class, optimizer_args,
-        batch_size, max_steps, report_metrics_every_n_steps,
-        current_run_output_dir,
-        n_validation_samples,
-        checkpoint_resume=checkpoint,
-        **model_controller_info["optional_args"])
+        model_controller_info.keywords["report_metrics_every_n_steps"]
+    n_validation_samples = model_controller_info.keywords["n_validation_samples"]
+
+    model_controller_info.bind(model=model,
+                               data_sampler=sampler,
+                               loss_criterion=criterion,
+                               optimizer_class=optimizer_class,
+                               optimizer_args=optimizer_args,
+                               output_dir=current_run_output_dir)
+    if "optional_args" in model_controller_info.keywords:
+        optional_args = model_controller_info.pop("optional_args")
+        model_controller_info.bind(**optional_args)
+    runner = instantiate(model_controller_info)
 
     logger.info("Training model: {0} steps, {1} batch size.".format(
         max_steps, batch_size))
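The script no longer indexes plain dicts: load_path(..., instantiate=False) evidently returns a proxy that records a class and its keyword arguments, exposing a .keywords mapping plus bind() and pop() for adjusting those arguments, with instantiate() performing the deferred call. Here is a rough sketch of such a proxy, assuming it wraps a callable and a kwargs dict; the real definition is in selene/utils/config.py and its internals may differ.

# Hypothetical stand-in for the proxy returned by
# load_path(..., instantiate=False).
class BoundConstructor:
    def __init__(self, callable_, **keywords):
        self.callable_ = callable_
        self.keywords = keywords  # constructor kwargs, mutable until instantiation

    def bind(self, **kwargs):
        # Add or overwrite keyword arguments prior to instantiation.
        self.keywords.update(kwargs)

    def pop(self, key, *default):
        # Remove and return a keyword argument (used for values consumed by
        # selene.py itself, e.g. "default_threshold").
        return self.keywords.pop(key, *default)


def instantiate(bound):
    # Perform the deferred call with everything bound so far.
    return bound.callable_(**bound.keywords)

With this shape, sampler_info.pop("default_threshold") keeps the key out of the eventual IntervalsSampler call, and instantiate(sampler_info) is the single point where construction finally happens.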

selene/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1 +1,2 @@
-__all__ = ["sequences", "targets", "samplers", "model_controller"]
+__all__ = ["sequences", "targets", "samplers", "utils"]
+from .model_train import ModelController

selene/samplers/online_sampler.py

Lines changed: 0 additions & 12 deletions
@@ -114,15 +114,3 @@ def get_feature_from_index(self, feature_index):
 
     def get_sequence_from_encoding(self, encoding):
        return self.genome.encoding_to_sequence(encoding)
-
-    # @abstractmethod
-    # def sample(self, batch_size):
-    #     raise NotImplementedError
-    #
-    # @abstractmethod
-    # def get_data_and_targets(self, mode, batch_size, n_samples):
-    #     raise NotImplementedError
-    #
-    # @abstractmethod
-    # def get_validation_set(self, batch_size, n_samples=None):
-    #     raise NotImplementedError

selene/targets/target.py

Lines changed: 1 addition & 1 deletion
@@ -12,6 +12,6 @@ class Target(metaclass=ABCMeta):
     @abstractmethod
     def get_feature_data(self, *args, **kwargs):
         """
-        Gets feature data for some input coordinate.
+        Retrieve the feature data for some coordinate.
         """
         raise NotImplementedError

selene/utils/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -1 +1,3 @@
-from .utils import *
+from .utils import initialize_logger, read_yaml_file
+from .config import load, load_path, instantiate
+
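selene/utils/__init__.py now re-exports the config machinery alongside the existing helpers. Taken together with the selene.py changes above, the intended flow looks roughly like this; the file names and feature values below are made up for illustration, and only the argument names are taken from the diff.

from selene.utils import load_path, instantiate

# Parse the config without constructing anything yet.
train_model = load_path("config_examples/parameters.yml", instantiate=False)

sampler_info = train_model["sampler"]        # deferred IntervalsSampler
sampler_info.bind(genome="hg19.fa",          # hypothetical inputs
                  query_feature_data="features.h5",
                  distinct_features=["CTCF"],
                  intervals_file="intervals.bed")
sampler = instantiate(sampler_info)          # construction happens here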
