Merge pull request #28 from ksuderman/release-1.1.0

ksuderman · web-flow · commit 6d9b86ab3ba6 · 2021-08-04T20:20:21.000-05:00
Release 1.1.0

- New YAML runtime configuration.
- Run multiple workflows from a single file
- Separate reference and input datasets
diff --git a/README.md b/README.md
@@ -67,35 +67,47 @@ When a workflow is run with the `run` command the invocation details will be sav
 
 ## Runtime Configuration
 
-The runtime parameters for a benchmarking run are specified in a YAML file.  This file can be stored anywhere, but several examples are included in the `config` directory. The configuration YAML must include:
+The runtime parameters for benchmarking runs are specified in a YAML configuration file.  The configuration file can contain more than one runtime configuration specified as a YAML list. This file can be stored anywhere, but several examples are included in the `config` directory. 
 
-- **workflow_id**
-  The ID of the workflow to run.
-- **inputs**
-  A list of dictionaries that specify:
-  1. **name** the name of the input as specifed in the the workflow editor.
-  2. **dataset_id**: the ID of the dataset to be used as input.  This dataset can be located in any publicy accessible history.
-- **output_history_name**
-  A new history with this name will be created and all processed datasets will be stored into this history.
-
-#### Example
+The YAML configuration for a single workflow with two runs looks like:
 
 ```
-workflow_id: b94314cb9cb46380
-inputs:
-  - name: FASTQ Dataset
-    dataset_id: e49d4a2f705b9571
-output_history_name: Example Paired DNA Test
+- workflow_id: d6d3c2119c4849e4
+  output_history_base_name: RNA-seq
+  reference_data:
+    - name: Reference Transcript (FASTA)
+      dataset_id: 50a269b7a99356aa
+  runs:
+    - history_name: 1
+      inputs:
+      - name: FASTQ RNA Dataset
+        dataset_id: 28fa757e56346a34
+    - history_name: 2
+      inputs:
+      - name: FASTQ RNA Dataset
+        dataset_id: 1faa2d3b2ed5c436
 ```
 
-
-
-## Obtaining Results
-
-TBD. 
-
-Scrape the results of a workflow invocation and output in a format suitable for importing into a spreadsheet or database. See issue [#3](../../issues/3). 
-
+- **workflow_id** 
+  The ID of the workflow to run.
+  
+- **output_history_base_name**  (optional)
+  Name to use as the basis for histories created.  If the *output_history_base_name* is not specified then the  *workflow_id* is used.
+  
+- **reference_data** (optional)
+  Input data that is the same for all benchmarking runs and only needs to be set once.  See the section on *inputs* below for a description of the fields.
+
+- **runs**
+  Input definitions for a benchmarking run.  Each run definition should contain:
+
+  - **history_name** (optional) 
+    The name of the history created for the output.  The final output history name is generated by joining the *output_history_base_name* from above and the *history_name* with a single space.  If the *history_name* is not specified, `run N` is used in its place, where *N* is the 1-based position of the run in the **runs** list.
+  - **inputs**
+    The one or more input datasets to the workflow.  Each input specification consists of:
+    1. **name** the input name as specified in the workflow editor
+    2. **dataset_id** the History API ID as displayed in the workflow editor or with the `./workflow.py histories` command.
+
+  
 ### Contributing
 
 Fork this repository and then create a working branch for yourself from the `dev` branch. All pull requests should target  `dev` and not the `master` branch.
diff --git a/config/example.yml b/config/example.yml
@@ -0,0 +1,32 @@
+# DO NOT RUN
+# This is a non-working example used for discussion purposes only.
+- workflow_id: b94314cb9cb46380
+  comments: DNA testing
+  output_history_base_name: DNA Testing 10 CPU
+  reference_data:
+    - name: FASTQ Reference
+      dataset_id: badfood
+    - name: GTF
+      dataset_id: more
+  runs:
+    - history_name: run one
+      inputs:
+      - name: input one
+        dataset_id: 1f04e612d8649780
+      - name: input two
+        dataset_id: 1f04e612d8649780
+    - history_name: run two
+      inputs:
+      - name: FASTQ Dataset
+        dataset_id: 1f04e612d8649780
+      - name: GTF index
+        dataset_id: 1f04e612d8649780
+- workflow_id: b94314cb9cb46380
+  comments: RNA testing
+  output_history_base_name: RNA Testing 10 CPU
+  runs:
+    - history_name: FASTQ Dataset
+      inputs:
+      - name: FASTQ Dataset
+        dataset_id: 1f04e612d8649780
+
diff --git a/config/rna-seq.yml b/config/rna-seq.yml
@@ -1,10 +1,29 @@
-workflow_id: 3606d3101a772650
-inputs:
-  - name: Reference FASTA
-    id: '3947ba9ca107312f'
-  - name: GTF
-    id: '048a970701a6dc44'
-  - name: FASTA Dataset
-    id: 'ca5081d2c8f1088a'
-output_history_name: RNA seq test results
+- workflow_id: 8557135ce1bff84d
+  output_history_base_name: PairRNA 16C 58G-MEM
+  reference_data:
+    - name: Reference Transcript (FASTA)
+      dataset_id: d61e7f405474c541
+  runs:
+    - history_name: SRS9276533
+      inputs:
+      - name: FASTQ RNA Dataset
+        dataset_id: 28fa757e56346a34
+    - history_name: SRS9276520
+      inputs:
+      - name: FASTQ RNA Dataset
+        dataset_id: 1faa2d3b2ed5c436
+    - history_name: SRS9276534
+      inputs:
+      - name: FASTQ RNA Dataset
+        dataset_id: ec8c5112d867eb82
+- workflow_id: 69906830c7478863
+  output_history_base_name: RNA 16C 58G-MEM
+  reference_data:
+    - name: Reference Transcript (FASTA)
+      dataset_id: d61e7f405474c541
+  runs:
+    - history_name: SRS9551191
+      inputs:
+      - name: FASTQ RNA Dataset
+        dataset_id: 0aedafdec1eb4aeb
 
diff --git a/workflow.py b/workflow.py
@@ -15,7 +15,7 @@
 
 from pprint import pprint
 
-VERSION='1.0.0'
+VERSION='1.1.0'
 
 BOLD = '\033[1m'
 CLEAR = '\033[0m'
@@ -31,6 +31,16 @@
 # The directory where the workflow invocation data will be saved.
 INVOCATIONS_DIR = 'invocations'
 
+class Keys:
+    NAME = 'name'
+    RUNS = 'runs'
+    INPUTS = 'inputs'
+    REFERENCE_DATA = 'reference_data'
+    WORKFLOW_ID = 'workflow_id'
+    DATASET_ID = 'dataset_id'
+    HISTORY_BASE_NAME = 'output_history_base_name'
+    HISTORY_NAME = 'history_name'
+
 
 def workflows():
     """
@@ -81,33 +91,51 @@ def run(args):
     with open(name, 'r') as stream:
         try:
             config = yaml.safe_load(stream)
+            print(f"Loaded {name}")
         except yaml.YAMLError as exc:
             print(exc)
 
     gi = bioblend.galaxy.GalaxyInstance(url=GALAXY_SERVER, key=API_KEY)
     print(f"Connected to {GALAXY_SERVER}")
 
-    workflow = config['workflow_id']
-    inputs = {}
-    for spec in config['inputs']:
-        input = gi.workflows.get_workflow_inputs(workflow, spec['name'])
-        if input is None or len(input) == 0:
-            print('ERROR: Invalid input specification')
-            sys.exit(1)
-        inputs[input[0]] = {'id': spec['dataset_id'], 'src': 'hda'}
-
-    if 'output_history_name' in config:
-        print(f"Saving output to a history named {config['output_history_name']}")
-        invocation = gi.workflows.invoke_workflow(workflow, inputs=inputs, history_name=config['output_history_name'])
-    else:
-        invocation = gi.workflows.invoke_workflow(workflow, inputs=inputs)
-
-    pprint(invocation)
-
-    output_path = os.path.join(INVOCATIONS_DIR, invocation['id'] + '.json')
-    with open(output_path, 'w') as f:
-        json.dump(invocation, f, indent=4)
-        print(f"Wrote {output_path}")
+    print(f"Found {len(config)} workflow definitions")
+    for workflow in config:
+        wfid = workflow['workflow_id']
+        inputs = {}
+        history_base_name = wfid
+        if Keys.HISTORY_BASE_NAME in workflow:
+            history_base_name = workflow[Keys.HISTORY_BASE_NAME]
+
+        if Keys.REFERENCE_DATA in workflow:
+            for spec in workflow[Keys.REFERENCE_DATA]:
+                input = gi.workflows.get_workflow_inputs(wfid, spec[Keys.NAME])
+                if input is None or len(input) == 0:
+                    print(f'ERROR: Invalid input specification for {spec[Keys.NAME]}')
+                    sys.exit(1)
+                inputs[input[0]] = { 'id': spec[Keys.DATASET_ID], 'src':'hda'}
+
+        count = 0
+        for run in workflow[Keys.RUNS]:
+            count += 1
+            if Keys.HISTORY_NAME in run:
+                output_history_name = f"{history_base_name} {run[Keys.HISTORY_NAME]}"
+            else:
+                output_history_name = f"{history_base_name} run {count}"
+            for spec in run[Keys.INPUTS]:
+                input = gi.workflows.get_workflow_inputs(wfid, spec[Keys.NAME])
+                if input is None or len(input) == 0:
+                    print(f'ERROR: Invalid input specification for {spec[Keys.NAME]}')
+                    sys.exit(1)
+
+                inputs[input[0]] = {'id': spec[Keys.DATASET_ID], 'src' : 'hda' }
+
+            invocation = gi.workflows.invoke_workflow(wfid, inputs=inputs, history_name=output_history_name)
+            pprint(invocation)
+            # output_path = os.path.join(INVOCATIONS_DIR, invocation['id'] + '.json')
+            output_path = os.path.join(INVOCATIONS_DIR, output_history_name.replace(' ', '_') + '.json')
+            with open(output_path, 'w') as f:
+                json.dump(invocation, f, indent=4)
+                print(f"Wrote {output_path}")
 
 
 def histories(args):