reorder snp matrix (#198)

sage-wright · web-flow · commit 42659de53520 · 2023-01-31T09:23:26.000-08:00
* reorder matrix with terminal ends

* fix typo

* fix more typos

* fix silly python indentation issues

* change to breadth-first search

* undo breadth-first

* cast index/header as strings

* remove unnecessary print statement

* reroot tree

* enable phandango coloring

* add data summary option

* rename things for clarity

* syntax changes

* terrafy

* add changes to core_gene_snp

* add changes to mashtree

* debug statements added

* more debug statements

* fix syntax

* syntax + debug changes

* trying other things

* hopefully last syntax bug fix

* more debug

* escapes parentheses in searches

* terrafy

* ensure sample_id is always a string

* terrafy

* fix indentation

* changes requested

* remove unused var

* remove extra white space

* changed data_summary_sample_names to sample_names

* add cluster_name as summarized_data's output prefix

* add _contigs suffix to mashtree outputs

* correct syntax and comment

* fix typo

* remove addition of _contigs

* remove mashtree boolean from workflows

* remove _contigs suffix from terminal ends

* using sed to remove "_contigs" from tree

* remove unused var

* fix typo :(

* more remnants of past mistakes :(

* remove _contigs from the matrix too

* update version

* fix indentations
diff --git a/tasks/phylogenetic_inference/task_snp_dists.wdl b/tasks/phylogenetic_inference/task_snp_dists.wdl
@@ -12,104 +12,85 @@ task snp_dists {
     snp-dists -v | tee VERSION
 
     # create snp-dists matrix file
-    snp-dists ~{alignment} > snp-dists-matrix.tsv 
+    snp-dists ~{alignment} > ~{cluster_name}_snp_distance_matrix.tsv
+  >>>
+  output {
+    String date = read_string("DATE")
+    String version = read_string("VERSION")
+    File snp_matrix = "~{cluster_name}_snp_distance_matrix.tsv"
+  }
+  runtime {
+    docker: "quay.io/staphb/snp-dists:0.8.2"
+    memory: "2 GB"
+    cpu: 2
+    disks: "local-disk " + disk_size + " SSD"
+    disk: disk_size + " GB"
+    maxRetries: 3
+    preemptible: 0
+  }
+}
+
+task reorder_matrix { # reorders a SNP distance matrix to follow the tip order of the midpoint-rooted tree
+  input {
+    File input_tree
+    File matrix
+    String cluster_name
+    Int disk_size = 100
+  }
+  command <<<
+    # strip every occurrence of "_contigs" (not only suffixes) from the tree and matrix
+    sed 's/_contigs//g' ~{input_tree} > temporary_tree.nwk
+    sed 's/_contigs//g' ~{matrix} > temporary_matrix.tsv
 
-    # create oredered snp-dists molten file 
-    snp-dists -m ~{alignment} | awk '{print $NF,$0}' | sort -n | cut -f2- -d' ' > snp-dists-molten-ordered.tsv 
+    python3 <<CODE
+    from Bio import Phylo
+    import pandas as pd
+    import os
 
-    # create list of isolates in order of SNP-dists 
-    cut -f2 snp-dists-molten-ordered.tsv | awk '!seen[$0]++' > ordered-isolates.tsv
+    # read in newick tree
+    tree = Phylo.read("temporary_tree.nwk", "newick")
+    
+    # read in matrix into pandas data frame
+    snps = pd.read_csv("temporary_matrix.tsv", header=0, index_col=0, delimiter="\t")
 
-    python <<CODE
-    # This script is based on Logan Fink's https://github.com/StaPH-B/CDPHE/blob/master/ordered_pairwise_matrix_generator_1.0.py
-    # which takes in an ordered file and then returns a pairwise matrix based on that order.
-    import sys, os
-    cwd = os.getcwd()
+    # ensure all header and index values are strings for proper reindexing:
+    # if sample names consist entirely of digits, pandas reads them as integers,
+    # while the terminal names taken from the tree are strings.
+    # that mismatch leads to failure and an empty ordered SNP matrix
+    snps.columns = snps.columns.astype(str)
+    snps.index = snps.index.astype(str)
 
-    #Create a pairwise dictionary with all values possible, and create a list of the isolates
-    pairwiseDict = {}
-    isolates = []
-    f = open("snp-dists-molten-ordered.tsv", 'r')
-    flines  = f.readlines()
-    for line in flines:
-        print line
-        line = line.strip('\n')
-        linesplit = line.split('\t')
-        pairwiseDict[linesplit[0], linesplit[1]] = linesplit[2]
-        pairwiseDict[linesplit[1], linesplit[0]] = linesplit[2]
-        if not linesplit[0] in isolates:
-            isolates.append(linesplit[0])
-        if not linesplit[1] in isolates:
-            isolates.append(linesplit[1])
-    f.close()
+    # reroot tree with midpoint
+    tree.root_at_midpoint()
 
-    #Take in the ordered list created by the user and create a list object
-    orderedList = []
-    g = open("ordered-isolates.tsv", 'r')
-    glines = g.readlines()
-    for line in glines:
-        line = line.strip('\n')
-        if str(line)[0] == "#":
-            pass
-        else:
-            orderedList.append(line)
-    g.close()
+    # extract ordered terminal ends of rerooted tree
+    term_names = [term.name for term in tree.get_terminals()]
 
-    # Seems inappropriate for snp-dists use case; removing for now
-    #This is to make sure that if the names aren't exactly the same in the ordered file
-    #(eg. doesn't include ".cleaned"), they will still work
-    #newOrderedList = []
-    #for item in orderedList:
-    #    original = 1
-    #    for isolate in isolates:
-    #        if str(item) in str(isolate):
-    #            newOrderedList.append(isolate)
-    #            original = 0
-    #        elif str(isolate) in str(item):
-    #           newOrderedList.append(isolate)
-    #           original = 0
-    #        else:
-    #            continue
-    #    if original==1:
-    #        newOrderedList.append(item)
+    # reorder both the rows and the columns of the matrix to follow the terminal-end order
+    snps = snps.reindex(index=term_names, columns=term_names)
 
-    #Open the output file and write the first line
-    z=open("~{cluster_name}_snp_distance_matrix.tsv", 'w')
-    z.write(".")
-    for item in orderedList:
-        z.write('\t')
-        z.write(str(item))
-    z.write('\n')
+    # add the ":c1" phandango suffix to the column labels to ensure continuous coloring
+    snps_out2 = snps.add_suffix(":c1")
 
-    #Write the matrix using the ordered pairs dictionary
-    for item1 in orderedList:
-        z.write(str(item1))
-        for item2 in orderedList:
-            if item1 == item2:
-                z.write('\t')
-                z.write("-")
-            else:
-                z.write('\t')
-                z.write(str(pairwiseDict[item1, item2]))
-        z.write('\n')
-    z.close()
-    print "Matrix has been created in current directory as '~{cluster_name}_snp_distance_matrix.tsv.'"
+    # write out the reordered matrix of the rerooted tree as a comma-separated file
+    snps_out2.to_csv("~{cluster_name}_snp_matrix.csv", sep=",")
+
+    # write rerooted tree to a file
+    Phylo.write(tree, "~{cluster_name}_tree.nwk", "newick")
 
     CODE
   >>>
-  output {
-    String date = read_string("DATE")
-    String version = read_string("VERSION")
-    File snp_matrix = "${cluster_name}_snp_distance_matrix.tsv"
-    File snp_dists_molten_ordered = "snp-dists-molten-ordered.tsv"
+  output{
+    File ordered_matrix = "~{cluster_name}_snp_matrix.csv"
+    File tree = "~{cluster_name}_tree.nwk"
   }
   runtime {
-    docker: "quay.io/staphb/snp-dists:0.8.2"
+    docker: "staphb/mykrobe:0.12.1" # used because it contains both biopython and pandas
     memory: "2 GB"
     cpu: 2
     disks: "local-disk " + disk_size + " SSD"
     disk: disk_size + " GB"
-    maxRetries: 3
+   # maxRetries: 3
     preemptible: 0
   }
 }
diff --git a/tasks/task_versioning.wdl b/tasks/task_versioning.wdl
@@ -8,7 +8,7 @@ task version_capture {
     volatile: true
   }
   command {
-    PHBG_Version="PHBG v1.1.0"
+    PHBG_Version="PHBG v1.1.1"
     ~{default='' 'export TZ=' + timezone}
     date +"%Y-%m-%d" > TODAY
     echo "$PHBG_Version" > PHBG_VERSION
diff --git a/tasks/utilities/task_summarize_data.wdl b/tasks/utilities/task_summarize_data.wdl
@@ -0,0 +1,116 @@
+version 1.0
+
+task summarize_data {
+  # Exports a Terra data table and summarizes the comma-separated values found in the
+  # requested columns as one true/blank column per value (optionally tagged for Phandango).
+  # Writes <output_prefix>_summarized_data.csv with one row per selected sample.
+  input {
+    Array[String]? sample_names # values matched against the <terra_table>_id column
+    String? terra_project
+    String? terra_workspace
+    String? terra_table
+    String? column_names # string of comma-delimited column names
+    String? output_prefix
+
+    Int disk_size = 100
+    File? input_table
+    Boolean phandango_coloring = true
+  }
+  command <<<
+    # when running on terra, comment out all input_table mentions
+    python3 /scripts/export_large_tsv/export_large_tsv.py --project "~{terra_project}" --workspace "~{terra_workspace}" --entity_type ~{terra_table} --tsv_filename ~{terra_table}-data.tsv
+
+    # when running locally, use the input_table in place of downloading from Terra
+    #cp ~{input_table} ~{terra_table}-data.tsv
+
+    # expose the coloring flag to the python block; the Boolean renders as "true" or "false"
+    export phandango_coloring="~{phandango_coloring}"
+
+    python3 <<CODE
+  import pandas as pd
+  import numpy as np
+  import itertools
+  import os
+  import re
+
+  # read exported Terra table into pandas
+  tablename = "~{terra_table}-data.tsv"
+  table = pd.read_csv(tablename, delimiter='\t', header=0, index_col=False, dtype={"~{terra_table}_id": 'str'}) # ensure sample_id is always a string
+
+  # extract the samples for upload from the entire table
+  # (WDL joins sample_names with "*"; the joined string is split back into a list here)
+  table = table[table["~{terra_table}_id"].isin("~{sep='*' sample_names}".split("*"))]
+
+  # split column list into an array
+  columns = "~{column_names}".split(",")
+
+  # keep only the id column followed by the columns of interest
+  table = table[["~{terra_table}_id"] + columns].copy()
+
+  # create a table to search through containing only columns of interest
+  searchtable = table[columns].copy()
+
+  # collect the comma-separated values of each column of interest, one list per column;
+  # empty cells are dropped first because NaN cannot be suffixed or regex-escaped below
+  genes = []
+  for item in columns:
+    genes.append(table[item].dropna().astype(str).str.split(",").explode().tolist())
+
+  # add phandango coloring tags if indicated
+  coloring = os.environ["phandango_coloring"] == "true"
+  if coloring:
+    i = 1 # initialize a starting group at 1
+    newgenes = [] # create a new temporary list
+    for group in genes: # iterate through gene list by items found within a column
+      # only ":o1" to ":o9" are used so the suffix is always three characters long
+      if (i < 10):
+        # tag every item of this column with its own :o coloring group
+        newgenes.append([item + ":o" + str(i) for item in group])
+        i += 1 # increment the i value so each column gets its own coloring
+      else:
+        print("DEBUG: Some items will be ignored because a maximum of nine columns can be presented at once with phandango coloring.")
+
+    # overwrite genes with newgenes (which now has the phandango coloring suffix)
+    genes = newgenes
+  else:
+    print("NOTE: Phandango coloring was not applied")
+
+  # flattening the list
+  genes = list(itertools.chain.from_iterable(genes))
+
+  # removing duplicates but maintaining order
+  genes = sorted(set(genes), key=lambda x: genes.index(x))
+
+  # add genes as true/false entries into table
+  # note: str.contains matches substrings, so a value also flags cells that contain it
+  # as part of a longer name
+  for item in genes:
+    # search for the bare value: drop the three-character ":oN" suffix when coloring was applied
+    pattern = re.escape(item[:-3] if coloring else item)
+    table[item] = searchtable.apply(lambda row: row.astype(str).str.contains(pattern).any(), axis=1)
+
+  # blank out all "False" cells (NaN is written as an empty field in the CSV)
+  table[table.eq(False)] = np.nan
+
+  # dropping columns of interest so only true/false ones remain
+  table.drop(columns,axis=1,inplace=True)
+
+  table.to_csv("~{output_prefix}_summarized_data.csv", sep=',', index=False)
+
+  CODE
+  >>>
+  output {
+    # presence/absence table written by the python block above
+    File summarized_data = "~{output_prefix}_summarized_data.csv"
+  }
+  runtime {
+    # NOTE(review): presumably the image that ships /scripts/export_large_tsv -- confirm
+    docker: "broadinstitute/terra-tools:tqdm"
+    memory: "8 GB"
+    cpu: 1
+    disks: "local-disk " + disk_size + " SSD"
+    disk: disk_size + " GB"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    maxRetries: 3
+  }
+}
diff --git a/workflows/wf_core_gene_snp.wdl b/workflows/wf_core_gene_snp.wdl
@@ -4,6 +4,8 @@ import "../tasks/phylogenetic_inference/task_pirate.wdl" as pirate
 import "../tasks/phylogenetic_inference/task_iqtree.wdl" as iqtree
 import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists
 import "../tasks/task_versioning.wdl" as versioning
+import "../tasks/utilities/task_summarize_data.wdl" as data_summary
+
 
 workflow core_gene_snp_workflow {
   input {
@@ -16,6 +18,12 @@ workflow core_gene_snp_workflow {
     Boolean core_tree = true
     # use pan_tree = true to produce a phylogenetic tree and snp distance matrix from the pangenome alignment
     Boolean pan_tree = false
+    # data summary input variables
+    Array[String]? sample_names
+    String? data_summary_terra_project
+    String? data_summary_terra_workspace
+    String? data_summary_terra_table
+    String? data_summary_column_names
   }
   call pirate.pirate as pirate {
     input:
@@ -35,6 +43,12 @@ workflow core_gene_snp_workflow {
           alignment = select_first([pirate.pirate_core_alignment_fasta]),
           cluster_name = cluster_name
       }
+      call snp_dists.reorder_matrix as core_reorder_matrix {
+        input:
+          input_tree = core_iqtree.ml_tree,
+          matrix = core_snp_dists.snp_matrix,
+          cluster_name = cluster_name + "_core"
+      }
     }
     if (pan_tree) {
       call iqtree.iqtree as pan_iqtree {
@@ -47,6 +61,23 @@ workflow core_gene_snp_workflow {
           alignment = select_first([pirate.pirate_pangenome_alignment_fasta]),
           cluster_name = cluster_name
       }
+      call snp_dists.reorder_matrix as pan_reorder_matrix {
+        input:
+          input_tree = pan_iqtree.ml_tree,
+          matrix = pan_snp_dists.snp_matrix,
+          cluster_name = cluster_name + "_pan"
+      }
+    }
+  }
+  if (defined(data_summary_column_names)) {
+    call data_summary.summarize_data {
+      input:
+        sample_names = sample_names,
+        terra_project = data_summary_terra_project,
+        terra_workspace = data_summary_terra_workspace,
+        terra_table = data_summary_terra_table,
+        column_names = data_summary_column_names,
+        output_prefix = cluster_name
     }
   }
   call versioning.version_capture{
@@ -67,11 +98,14 @@ workflow core_gene_snp_workflow {
     String pirate_docker_image = pirate.pirate_docker_image
     # snp_dists outputs
     String? pirate_snps_dists_version = select_first([core_snp_dists.version,pan_snp_dists.version,""])
-    File? pirate_core_snp_matrix = core_snp_dists.snp_matrix
-    File? pirate_pan_snp_matrix = pan_snp_dists.snp_matrix
     # iqtree outputs
     String? pirate_iqtree_version = select_first([core_iqtree.version,pan_iqtree.version,""])
-    File? pirate_iqtree_core_tree = core_iqtree.ml_tree
-    File? pirate_iqtree_pan_tree = pan_iqtree.ml_tree
+    # reorder matrix outputs
+    File? pirate_core_snp_matrix = core_reorder_matrix.ordered_matrix
+    File? pirate_iqtree_core_tree = core_reorder_matrix.tree
+    File? pirate_pan_snp_matrix = pan_reorder_matrix.ordered_matrix
+    File? pirate_iqtree_pan_tree = pan_reorder_matrix.tree
+    # Data summary outputs
+    File? pirate_summarized_data = summarize_data.summarized_data
   }
 }
diff --git a/workflows/wf_ksnp3.wdl b/workflows/wf_ksnp3.wdl
diff --git a/workflows/wf_mashtree_fasta.wdl b/workflows/wf_mashtree_fasta.wdl

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@ task version_capture {`
`8`	`8`	`volatile: true`
`9`	`9`	`}`
`10`	`10`	`command {`
`11`		`- PHBG_Version="PHBG v1.1.0"`
	`11`	`+ PHBG_Version="PHBG v1.1.1"`
`12`	`12`	`~{default='' 'export TZ=' + timezone}`
`13`	`13`	`date +"%Y-%m-%d" > TODAY`
`14`	`14`	`echo "$PHBG_Version" > PHBG_VERSION`