Skip to content

Commit 42659de

Browse files
authored
reorder snp matrix (#198)
* reorder matrix with terminal ends * fix typo * fix more typos * fix silly python indentation issues * change to breadth-first search * undo breadth-first * cast index/header as strings * remove unnecessary print statement * reroot tree * enable phandango coloring * add data summary option * rename things for clarity * syntax changes * terrafy * add changes to core_gene_snp * add changes to mashtree * debug statements added * more debug statements * fix syntax * syntax + debug changes * trying other things * hopefully last syntax bug fix * more debug * escapes parentheses in searches * terrafy * ensure sample_id is always a string * terrafy * fix indentation * changes requested * remove unused var * remove extra white space * changed data_summary_sample_names to sample_names * add cluster_name as summarized_data's output prefix * add _contigs suffix to mashtree outputs * correct syntax and comment * fix typo * remove addition of _contigs * remove mashtree boolean from workflows * remove _contigs suffix from terminal ends * using sed to remove "_contigs" from tree * remove unused var * fix typo :( * more remnants of past mistakes :( * remove _contigs from the matrix too * update version * fix indentations
1 parent b7c4324 commit 42659de

File tree

6 files changed

+296
-106
lines changed

6 files changed

+296
-106
lines changed

tasks/phylogenetic_inference/task_snp_dists.wdl

Lines changed: 62 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -12,104 +12,85 @@ task snp_dists {
1212
snp-dists -v | tee VERSION
1313
1414
# create snp-dists matrix file
15-
snp-dists ~{alignment} > snp-dists-matrix.tsv
15+
snp-dists ~{alignment} > ~{cluster_name}_snp_distance_matrix.tsv
16+
>>>
17+
output {
18+
String date = read_string("DATE")
19+
String version = read_string("VERSION")
20+
File snp_matrix = "~{cluster_name}_snp_distance_matrix.tsv"
21+
}
22+
runtime {
23+
docker: "quay.io/staphb/snp-dists:0.8.2"
24+
memory: "2 GB"
25+
cpu: 2
26+
disks: "local-disk " + disk_size + " SSD"
27+
disk: disk_size + " GB"
28+
maxRetries: 3
29+
preemptible: 0
30+
}
31+
}
32+
33+
task reorder_matrix {
34+
input {
35+
File input_tree
36+
File matrix
37+
String cluster_name
38+
Int disk_size = 100
39+
}
40+
command <<<
41+
# removing any "_contigs" suffixes from the tree and matrix
42+
sed 's/_contigs//g' ~{input_tree} > temporary_tree.nwk
43+
sed 's/_contigs//g' ~{matrix} > temporary_matrix.tsv
1644
17-
# create oredered snp-dists molten file
18-
snp-dists -m ~{alignment} | awk '{print $NF,$0}' | sort -n | cut -f2- -d' ' > snp-dists-molten-ordered.tsv
45+
python3 <<CODE
46+
from Bio import Phylo
47+
import pandas as pd
48+
import os
1949
20-
# create list of isolates in order of SNP-dists
21-
cut -f2 snp-dists-molten-ordered.tsv | awk '!seen[$0]++' > ordered-isolates.tsv
50+
# read in newick tree
51+
tree = Phylo.read("temporary_tree.nwk", "newick")
52+
53+
# read in matrix into pandas data frame
54+
snps = pd.read_csv("temporary_matrix.tsv", header=0, index_col=0, delimiter="\t")
2255
23-
python <<CODE
24-
# This script is based on Logan Fink's https://github.com/StaPH-B/CDPHE/blob/master/ordered_pairwise_matrix_generator_1.0.py
25-
# which takes in an ordered file and then returns a pairwise matrix based on that order.
26-
import sys, os
27-
cwd = os.getcwd()
56+
# ensure all header and index values are strings for proper reindexing
57+
# this is because if sample_name is entirely composed of integers, pandas
58+
# auto-casts them as integers; get_terminals() interprets those as strings.
59+
# this incompatibility leads to failure and an empty ordered SNP matrix
60+
snps.columns = snps.columns.astype(str)
61+
snps.index = snps.index.astype(str)
2862
29-
#Create a pairwise dictionary with all values possible, and create a list of the isolates
30-
pairwiseDict = {}
31-
isolates = []
32-
f = open("snp-dists-molten-ordered.tsv", 'r')
33-
flines = f.readlines()
34-
for line in flines:
35-
print line
36-
line = line.strip('\n')
37-
linesplit = line.split('\t')
38-
pairwiseDict[linesplit[0], linesplit[1]] = linesplit[2]
39-
pairwiseDict[linesplit[1], linesplit[0]] = linesplit[2]
40-
if not linesplit[0] in isolates:
41-
isolates.append(linesplit[0])
42-
if not linesplit[1] in isolates:
43-
isolates.append(linesplit[1])
44-
f.close()
63+
# reroot tree with midpoint
64+
tree.root_at_midpoint()
4565
46-
#Take in the ordered list created by the user and create a list object
47-
orderedList = []
48-
g = open("ordered-isolates.tsv", 'r')
49-
glines = g.readlines()
50-
for line in glines:
51-
line = line.strip('\n')
52-
if str(line)[0] == "#":
53-
pass
54-
else:
55-
orderedList.append(line)
56-
g.close()
66+
# extract ordered terminal ends of rerooted tree
67+
term_names = [term.name for term in tree.get_terminals()]
5768
58-
# Seems inappropriate for snp-dists use case; removing for now
59-
#This is to make sure that if the names aren't exactly the same in the ordered file
60-
#(eg. doesn't include ".cleaned"), they will still work
61-
#newOrderedList = []
62-
#for item in orderedList:
63-
# original = 1
64-
# for isolate in isolates:
65-
# if str(item) in str(isolate):
66-
# newOrderedList.append(isolate)
67-
# original = 0
68-
# elif str(isolate) in str(item):
69-
# newOrderedList.append(isolate)
70-
# original = 0
71-
# else:
72-
# continue
73-
# if original==1:
74-
# newOrderedList.append(item)
69+
# reorder matrix with re-ordered terminal ends
70+
snps = snps.reindex(index=term_names, columns=term_names)
7571
76-
#Open the output file and write the first line
77-
z=open("~{cluster_name}_snp_distance_matrix.tsv", 'w')
78-
z.write(".")
79-
for item in orderedList:
80-
z.write('\t')
81-
z.write(str(item))
82-
z.write('\n')
72+
# add phandango suffix to ensure continuous coloring
73+
snps_out2 = snps.add_suffix(":c1")
8374
84-
#Write the matrix using the ordered pairs dictionary
85-
for item1 in orderedList:
86-
z.write(str(item1))
87-
for item2 in orderedList:
88-
if item1 == item2:
89-
z.write('\t')
90-
z.write("-")
91-
else:
92-
z.write('\t')
93-
z.write(str(pairwiseDict[item1, item2]))
94-
z.write('\n')
95-
z.close()
96-
print "Matrix has been created in current directory as '~{cluster_name}_snp_distance_matrix.tsv.'"
75+
# write out reordered matrix of rerooted tree to a file
76+
snps_out2.to_csv("~{cluster_name}_snp_matrix.csv", sep=",")
77+
78+
# write rerooted tree to a file
79+
Phylo.write(tree, "~{cluster_name}_tree.nwk", "newick")
9780
9881
CODE
9982
>>>
100-
output {
101-
String date = read_string("DATE")
102-
String version = read_string("VERSION")
103-
File snp_matrix = "${cluster_name}_snp_distance_matrix.tsv"
104-
File snp_dists_molten_ordered = "snp-dists-molten-ordered.tsv"
83+
output{
84+
File ordered_matrix = "~{cluster_name}_snp_matrix.csv"
85+
File tree = "~{cluster_name}_tree.nwk"
10586
}
10687
runtime {
107-
docker: "quay.io/staphb/snp-dists:0.8.2"
88+
docker: "staphb/mykrobe:0.12.1" # used because it contains both biopython and pandas
10889
memory: "2 GB"
10990
cpu: 2
11091
disks: "local-disk " + disk_size + " SSD"
11192
disk: disk_size + " GB"
112-
maxRetries: 3
93+
# maxRetries: 3
11394
preemptible: 0
11495
}
11596
}

tasks/task_versioning.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ task version_capture {
88
volatile: true
99
}
1010
command {
11-
PHBG_Version="PHBG v1.1.0"
11+
PHBG_Version="PHBG v1.1.1"
1212
~{default='' 'export TZ=' + timezone}
1313
date +"%Y-%m-%d" > TODAY
1414
echo "$PHBG_Version" > PHBG_VERSION
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
version 1.0
2+
3+
task summarize_data {
4+
input {
5+
Array[String]? sample_names
6+
String? terra_project
7+
String? terra_workspace
8+
String? terra_table
9+
String? column_names # string of comma-delimited column names
10+
String? output_prefix
11+
12+
Int disk_size = 100
13+
File? input_table
14+
Boolean phandango_coloring = true
15+
}
16+
command <<<
17+
# when running on terra, comment out all input_table mentions
18+
python3 /scripts/export_large_tsv/export_large_tsv.py --project "~{terra_project}" --workspace "~{terra_workspace}" --entity_type ~{terra_table} --tsv_filename ~{terra_table}-data.tsv
19+
20+
# when running locally, use the input_table in place of downloading from Terra
21+
#cp ~{input_table} ~{terra_table}-data.tsv
22+
23+
if ~{phandango_coloring}; then
24+
export phandango_coloring="true"
25+
else
26+
export phandango_coloring="false"
27+
fi
28+
29+
python3 <<CODE
30+
import pandas as pd
31+
import numpy as np
32+
import itertools
33+
import os
34+
import re
35+
36+
# read exported Terra table into pandas
37+
tablename = "~{terra_table}-data.tsv"
38+
table = pd.read_csv(tablename, delimiter='\t', header=0, index_col=False, dtype={"~{terra_table}_id": 'str'}) # ensure sample_id is always a string
39+
40+
# extract the samples for upload from the entire table
41+
table = table[table["~{terra_table}_id"].isin("~{sep='*' sample_names}".split("*"))]
42+
43+
# split column list into an array
44+
columns = "~{column_names}".split(",")
45+
46+
temporarylist = []
47+
temporarylist.append("~{terra_table}_id")
48+
temporarylist += columns
49+
50+
table = table[temporarylist].copy()
51+
52+
# create a table to search through containing only columns of interest
53+
searchtable = table[columns].copy()
54+
55+
# iterate through the columns of interest and combine into a single list
56+
genes = []
57+
for item in columns:
58+
genes.append(table[item].str.split(",").explode().tolist())
59+
60+
# add phandango coloring tags if indicated
61+
if (os.environ["phandango_coloring"] == "true"):
62+
i = 1 # initialize a starting group at 1
63+
newgenes = [] # create a new temporary list
64+
for group in genes: # iterate through gene list by items found within a columnkk
65+
if (i < 10):
66+
newgroup = [] # create a new temporary sublist
67+
for item in group: # for every item in a column,
68+
newitem = item + ":o" + str(i) # add a unique :o coloring (as indicated by the str(i))
69+
newgroup.append(newitem) # add phandango-suffixed item to the new sublist
70+
newgenes.append(newgroup) # add the new sublist to the new list
71+
i += 1 # increment the i value so each column gets its own coloring
72+
else:
73+
print("DEBUG: Some items will be ignored because a maximum of ten columns can be presented at once with phandango coloring.")
74+
75+
# overwrite genes with newgenes (which now has the phandango coloring suffix)
76+
genes = newgenes
77+
else:
78+
print("NOTE: Phandango coloring was not applied")
79+
80+
# flattening the list
81+
genes = list(itertools.chain.from_iterable(genes))
82+
83+
# removing duplicates but maintaining order
84+
genes = sorted(set(genes), key=lambda x: genes.index(x))
85+
86+
# add genes as true/false entries into table
87+
for item in genes:
88+
if (os.environ["phandango_coloring"] == "true"): # temporarily removes coloring suffix (CAUTION: ASSUMES LESS THAN 10 COLUMN NAMES PROVIDED)
89+
table[item] = searchtable.apply(lambda row: row.astype(str).str.contains(re.escape(item[:len(item)-3])).any(), axis=1)
90+
91+
else:
92+
table[item] = searchtable.apply(lambda row: row.astype(str).str.contains(re.escape(item)).any(), axis=1)
93+
94+
# replace all "False" cells with empty strings
95+
table[table.eq(False)] = np.nan
96+
97+
# dropping columns of interest so only true/false ones remain
98+
table.drop(columns,axis=1,inplace=True)
99+
100+
table.to_csv("~{output_prefix}_summarized_data.csv", sep=',', index=False)
101+
102+
CODE
103+
>>>
104+
output {
105+
File summarized_data = "~{output_prefix}_summarized_data.csv"
106+
}
107+
runtime {
108+
docker: "broadinstitute/terra-tools:tqdm"
109+
memory: "8 GB"
110+
cpu: 1
111+
disks: "local-disk " + disk_size + " SSD"
112+
disk: disk_size + " GB"
113+
dx_instance_type: "mem1_ssd1_v2_x2"
114+
maxRetries: 3
115+
}
116+
}

workflows/wf_core_gene_snp.wdl

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import "../tasks/phylogenetic_inference/task_pirate.wdl" as pirate
44
import "../tasks/phylogenetic_inference/task_iqtree.wdl" as iqtree
55
import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists
66
import "../tasks/task_versioning.wdl" as versioning
7+
import "../tasks/utilities/task_summarize_data.wdl" as data_summary
8+
79

810
workflow core_gene_snp_workflow {
911
input {
@@ -16,6 +18,12 @@ workflow core_gene_snp_workflow {
1618
Boolean core_tree = true
1719
# use pan_tree = true to produce a phylogenetic tree and snp distance matrix from the pangenome alignment
1820
Boolean pan_tree = false
21+
# data summary input variables
22+
Array[String]? sample_names
23+
String? data_summary_terra_project
24+
String? data_summary_terra_workspace
25+
String? data_summary_terra_table
26+
String? data_summary_column_names
1927
}
2028
call pirate.pirate as pirate {
2129
input:
@@ -35,6 +43,12 @@ workflow core_gene_snp_workflow {
3543
alignment = select_first([pirate.pirate_core_alignment_fasta]),
3644
cluster_name = cluster_name
3745
}
46+
call snp_dists.reorder_matrix as core_reorder_matrix {
47+
input:
48+
input_tree = core_iqtree.ml_tree,
49+
matrix = core_snp_dists.snp_matrix,
50+
cluster_name = cluster_name + "_core"
51+
}
3852
}
3953
if (pan_tree) {
4054
call iqtree.iqtree as pan_iqtree {
@@ -47,6 +61,23 @@ workflow core_gene_snp_workflow {
4761
alignment = select_first([pirate.pirate_pangenome_alignment_fasta]),
4862
cluster_name = cluster_name
4963
}
64+
call snp_dists.reorder_matrix as pan_reorder_matrix {
65+
input:
66+
input_tree = pan_iqtree.ml_tree,
67+
matrix = pan_snp_dists.snp_matrix,
68+
cluster_name = cluster_name + "_pan"
69+
}
70+
}
71+
}
72+
if (defined(data_summary_column_names)) {
73+
call data_summary.summarize_data {
74+
input:
75+
sample_names = sample_names,
76+
terra_project = data_summary_terra_project,
77+
terra_workspace = data_summary_terra_workspace,
78+
terra_table = data_summary_terra_table,
79+
column_names = data_summary_column_names,
80+
output_prefix = cluster_name
5081
}
5182
}
5283
call versioning.version_capture{
@@ -67,11 +98,14 @@ workflow core_gene_snp_workflow {
6798
String pirate_docker_image = pirate.pirate_docker_image
6899
# snp_dists outputs
69100
String? pirate_snps_dists_version = select_first([core_snp_dists.version,pan_snp_dists.version,""])
70-
File? pirate_core_snp_matrix = core_snp_dists.snp_matrix
71-
File? pirate_pan_snp_matrix = pan_snp_dists.snp_matrix
72101
# iqtree outputs
73102
String? pirate_iqtree_version = select_first([core_iqtree.version,pan_iqtree.version,""])
74-
File? pirate_iqtree_core_tree = core_iqtree.ml_tree
75-
File? pirate_iqtree_pan_tree = pan_iqtree.ml_tree
103+
# reorder matrix outputs
104+
File? pirate_core_snp_matrix = core_reorder_matrix.ordered_matrix
105+
File? pirate_iqtree_core_tree = core_reorder_matrix.tree
106+
File? pirate_pan_snp_matrix = pan_reorder_matrix.ordered_matrix
107+
File? pirate_iqtree_pan_tree = pan_reorder_matrix.tree
108+
# Data summary outputs
109+
File? pirate_summarized_data = summarize_data.summarized_data
76110
}
77111
}

0 commit comments

Comments
 (0)