Skip to content

Commit

Permalink
Add optional parameter to use config file to specify reference(s) (#61)
Browse files Browse the repository at this point in the history
* Add reference_config file option for inputting reference(s) and reference weight(s)

* refactor ntjoin_test.py

* Add tests for reference config file

* Add files for reference config tests

* Update config file naming

* Update README.md
  • Loading branch information
lcoombe authored Feb 5, 2021
1 parent 4aad6d0 commit 9c9e58d
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 47 deletions.
23 changes: 20 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ m Minimum percentage of increasing/decreasing minimizer positions to orient co
mkt If True, use Mann-Kendall Test to predict contig orientation (computationally-intensive, overrides 'm') [False]
agp If True, output AGP file describing output scaffolds [False]
no_cut If True, will not cut contigs at putative misassemblies [False]
time If True, will log the time for each step [False]"
time If True, will log the time for each step [False]
reference_config Config file with reference assemblies and reference weights as comma-separated values (See README for example)
This is optional, and will override the 'references' and 'reference_weights' variables if specified
Notes:
- Ensure the lists of reference assemblies and weights are in the same order, and that both are space-separated
Expand All @@ -76,8 +78,8 @@ Notes:

Running `ntJoin help` prints the help documentation.

### Example

### Examples
#### Typical ntJoin usage:
* Target assembly to scaffold: my_scaffolds.fa
* Assembly to use as 'reference': assembly_ref1.fa
* Giving the target assembly a weight of '1' and reference assembly a weight of '2'
Expand All @@ -88,6 +90,21 @@ Running `ntJoin help` prints the help documentation.
ntJoin assemble target=my_scaffolds.fa target_weight=1 references='assembly_ref1.fa' reference_weights='2' k=32 w=500
```

#### Using a config file to specify references (optional):
* Alternatively, the reference(s) and reference weight(s) can be specified in a comma-separated config file with one row per reference assembly/weight:
```
reference1.fa,reference1_weight
reference2.fa,reference2_weight
```
* Then, the ntJoin command would use the file specified by `reference_config` for determining the reference(s) and reference weight(s) instead of `references` and `reference_weights`
* If both the `reference_config` and the `references` variables are specified, `reference_config` will override the other variables
* Example config files can be found in the `tests` directory: `test_config_single.csv`, `test_config_multiple.csv`
* **As with the typical ntJoin usage, ensure that all input assembly files are in or have soft-links to the current working directory, and do not use absolute/relative paths in the config file**

```
ntJoin assemble target=my_scaffolds.fa target_weight=1 reference_config=config_file.csv k=32 w=500
```

### Output files

* Scaffolded targeted assembly (`<target assembly>.k<k>.w<w>.n<n>.all.scaffolds.fa`)
Expand Down
43 changes: 31 additions & 12 deletions ntJoin
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,24 @@
# Input files
target=None
references=None
min_ref_targets=$(addsuffix .k$(k).w$(w).tsv, $(references))
fai_ref_targets=$(addsuffix .fai, $(references))
reference_config=None

# List of edge weights
reference_weights=None
target_weight=1

# Prepare targets
# If a reference config file is supplied, it takes precedence: the first
# comma-separated column lists the reference assemblies and the second column
# lists their weights. Otherwise, fall back to the 'references' and
# 'reference_weights' variables.
ifneq ($(reference_config), None)
# Simply-expanded (:=) so each awk runs once when the makefile is parsed,
# instead of being re-run every time the variable is referenced.
references_list:=$(shell awk -F"," '{print $$1}' $(reference_config))
reference_weights_list:=$(shell awk -F"," '{print $$2}' $(reference_config))
else
references_list=$(references)
reference_weights_list=$(reference_weights)
endif

# Per-reference file names derived from the reference list
# (kept recursive so that k and w are resolved when these are used)
min_ref_targets=$(addsuffix .k$(k).w$(w).tsv, $(references_list))
fai_ref_targets=$(addsuffix .fai, $(references_list))

# Path to minimizer assemble code
assemble_path=$(shell dirname $(realpath $(MAKEFILE_LIST)))

Expand Down Expand Up @@ -112,6 +123,8 @@ help:
@echo "agp If True, output AGP file describing output scaffolds [False]"
@echo "no_cut If True, will not cut contigs at putative misassemblies [False]"
@echo "time If True, will log the time for each step [False]"
@echo "reference_config Config file with reference assemblies and reference weights as comma-separated values (See README for example)"
@echo " This is optional, and will override the 'references' and 'reference_weights' variables if specified"
@echo ""
@echo "Notes: "
@echo " - Ensure the lists of reference assemblies and weights are in the same order, and that both are space-separated"
Expand All @@ -129,19 +142,25 @@ assemble: check_params \
$(target).k$(k).w$(w).n$(n).all.scaffolds.fa

analysis: check_params_analysis \
$(addsuffix .bam.bai, $(references)) \
$(addsuffix .bam.bai, $(references_list)) \
$(target).bam.bai \
$(target).k$(k).w$(w).n$(n).all.scaffolds.fa.bam.bai

all: check_params assemble analysis

check_params:
ifeq ($(references), None)
ifeq ($(reference_config), None)
$(error ERROR: Must set references)
endif
endif

ifeq ($(reference_weights), None)
ifeq ($(reference_config), None)
$(error ERROR: Must set reference_weights)
endif
endif

ifeq ($(target), None)
$(error ERROR: Must set target)
endif
Expand Down Expand Up @@ -173,36 +192,36 @@ ifeq ($(mkt), True)
ifeq ($(agp), True)
ifeq ($(no_cut), True)
$(log_time) $(assemble_path)/bin/ntjoin_assemble.py -p $(prefix) -n $(n) -s $< -l $(target_weight) \
-r "$(reference_weights)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) --mkt --agp --no_cut $(min_ref_targets)
-r "$(reference_weights_list)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) --mkt --agp --no_cut $(min_ref_targets)
else
$(log_time) $(assemble_path)/bin/ntjoin_assemble.py -p $(prefix) -n $(n) -s $< -l $(target_weight) \
-r "$(reference_weights)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) --mkt --agp $(min_ref_targets)
-r "$(reference_weights_list)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) --mkt --agp $(min_ref_targets)
endif
else
ifeq ($(no_cut), True)
$(log_time) $(assemble_path)/bin/ntjoin_assemble.py -p $(prefix) -n $(n) -s $< -l $(target_weight) \
-r "$(reference_weights)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) --mkt --no_cut $(min_ref_targets)
-r "$(reference_weights_list)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) --mkt --no_cut $(min_ref_targets)
else
$(log_time) $(assemble_path)/bin/ntjoin_assemble.py -p $(prefix) -n $(n) -s $< -l $(target_weight) \
-r "$(reference_weights)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) --mkt $(min_ref_targets)
-r "$(reference_weights_list)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) --mkt $(min_ref_targets)
endif
endif
else
ifeq ($(agp), True)
ifeq ($(no_cut), True)
$(log_time) $(assemble_path)/bin/ntjoin_assemble.py -p $(prefix) -n $(n) -s $< -l $(target_weight) \
-r "$(reference_weights)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) -m $(m) --agp --no_cut $(min_ref_targets)
-r "$(reference_weights_list)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) -m $(m) --agp --no_cut $(min_ref_targets)
else
$(log_time) $(assemble_path)/bin/ntjoin_assemble.py -p $(prefix) -n $(n) -s $< -l $(target_weight) \
-r "$(reference_weights)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) -m $(m) --agp $(min_ref_targets)
-r "$(reference_weights_list)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) -m $(m) --agp $(min_ref_targets)
endif
else
ifeq ($(no_cut), True)
$(log_time) $(assemble_path)/bin/ntjoin_assemble.py -p $(prefix) -n $(n) -s $< -l $(target_weight) \
-r "$(reference_weights)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) -m $(m) --no_cut $(min_ref_targets)
-r "$(reference_weights_list)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) -m $(m) --no_cut $(min_ref_targets)
else
$(log_time) $(assemble_path)/bin/ntjoin_assemble.py -p $(prefix) -n $(n) -s $< -l $(target_weight) \
-r "$(reference_weights)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) -m $(m) $(min_ref_targets)
-r "$(reference_weights_list)" -k $(k) -g $(g) -G $(G) -t $(assemble_t) -m $(m) $(min_ref_targets)
endif
endif
endif
Expand All @@ -220,7 +239,7 @@ $(target).k$(k).w$(w).n$(n).unassigned.scaffolds.fa: $(target).k$(k).w$(w).n$(n)
samtools index $<

# Run QUAST
quast_$(prefix)/report.tsv: $(references) $(target) $(target).k$(k).w$(w).n$(n).all.scaffolds.fa
quast_$(prefix)/report.tsv: $(references_list) $(target) $(target).k$(k).w$(w).n$(n).all.scaffolds.fa
ifeq ($(large), 1)
quast -t $(t) -o quast_$(prefix) -r $(ref) --fast --scaffold-gap-max-size 100000 --split-scaffolds \
--large $^
Expand Down
88 changes: 56 additions & 32 deletions tests/ntjoin_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,7 @@
import subprocess
import re


def run_ntjoin(file1, file2, prefix, window=1000, n=2):
"Run ntJoin with a pair of files"
cmd = "../ntJoin assemble -B target=%s target_weight=1 references=\'%s\' reference_weights=\'2\' " \
"prefix=%s k=32 w=%s n=%s" % (file2, file1, prefix, window, n)
def launch_ntjoin(cmd, prefix):
cmd_shlex = shlex.split(cmd)
return_code = subprocess.call(cmd_shlex)
assert return_code == 0
Expand All @@ -20,40 +16,31 @@ def run_ntjoin(file1, file2, prefix, window=1000, n=2):
return_paths.append(line.strip())
return return_paths

def run_ntjoin_nocut(file1, file2, prefix, window=1000, n=2):
def run_ntjoin(ref1, target, prefix, window=1000, n=2):
"Run ntJoin with a pair of files"
cmd = "../ntJoin assemble -B target=%s target_weight=1 references=\'%s\' reference_weights=\'2\' " \
"prefix=%s k=32 w=%s n=%s no_cut=True" % (file2, file1, prefix, window, n)
cmd_shlex = shlex.split(cmd)
return_code = subprocess.call(cmd_shlex)
assert return_code == 0
return_paths = []
with open(prefix + ".path", 'r') as paths:
for line in paths:
path_match = re.search(r'^ntJoin', line)
if path_match:
return_paths.append(line.strip())
cmd = "../ntJoin assemble -B target={target} target_weight=1 references=\'{ref}\' reference_weights=\'2\' " \
"prefix={prefix} k=32 w={w} n={n}".format(target=target, ref=ref1, prefix=prefix, w=window, n=n)
return_paths = launch_ntjoin(cmd, prefix)
return return_paths

def run_ntjoin_multiple(file1, file2, file3, prefix, window=1000, n=2):
def run_ntjoin_nocut(ref1, target, prefix, window=1000, n=2):
"Run ntJoin with a pair of files"
cmd = "../ntJoin assemble -B target={target} target_weight=1 references=\'{ref}\' reference_weights=\'2\' " \
"prefix={prefix} k=32 w={w} n={n} no_cut=True".format(target=target, ref=ref1, prefix=prefix, w=window, n=n)
return_paths = launch_ntjoin(cmd, prefix)
return return_paths

def run_ntjoin_multiple(ref1, ref2, target, prefix, window=1000, n=2):
"Run ntJoin with a target and 2 references"
cmd = "../ntJoin assemble -B target=%s target_weight=1 references=\'%s %s\' reference_weights=\'2 2\' " \
"prefix=%s k=32 w=%s n=%s" % (file3, file1, file2, prefix, window, n)
cmd_shlex = shlex.split(cmd)
return_code = subprocess.call(cmd_shlex)
assert return_code == 0
return_paths = []
with open(prefix + ".path", 'r') as paths:
for line in paths:
path_match = re.search(r'^ntJoin', line)
if path_match:
return_paths.append(line.strip())
cmd = "../ntJoin assemble -B target={target} target_weight=1 references=\'{ref1} {ref2}\' reference_weights=\'2 2\' " \
"prefix={prefix} k=32 w={w} n={n}".format(target=target, ref1=ref1, ref2=ref2, prefix=prefix, w=window, n=n)
return_paths = launch_ntjoin(cmd, prefix)
return return_paths

def run_ntjoin_agp(file1, file2, prefix, window=1000, n=2):
def run_ntjoin_agp(ref1, target, prefix, window=1000, n=2):
"Run ntJoin with a pair of files"
cmd = "../ntJoin assemble -B target=%s target_weight=1 references=\'%s\' reference_weights=\'2\' " \
"prefix=%s k=32 w=%s n=%s agp=True" % (file2, file1, prefix, window, n)
cmd = "../ntJoin assemble -B target={target} target_weight=1 references=\'{ref}\' reference_weights=\'2\' " \
"prefix={prefix} k=32 w={w} n={n} agp=True".format(target=target, ref=ref1, prefix=prefix, w=window, n=n)
cmd_shlex = shlex.split(cmd)
return_code = subprocess.call(cmd_shlex)
assert return_code == 0
Expand All @@ -63,6 +50,22 @@ def run_ntjoin_agp(file1, file2, prefix, window=1000, n=2):
return_agp.append(line.strip())
return return_agp

def run_ntjoin_config(config_file, target, prefix, window=1000, n=2):
    """Run '../ntJoin assemble' with the references supplied through a config file.

    The command sets target_weight=1 and k=32, passes config_file as
    reference_config, window as w and n as n. The command string and prefix are
    handed to launch_ntjoin, and its result is returned unchanged.
    """
    cmd_parts = [
        "../ntJoin assemble -B",
        "target={}".format(target),
        "target_weight=1",
        "reference_config={}".format(config_file),
        "prefix={}".format(prefix),
        "k=32",
        "w={}".format(window),
        "n={}".format(n),
    ]
    return launch_ntjoin(" ".join(cmd_parts), prefix)

def run_ntjoin_config_extra(config_file, target, prefix, window=1000, n=2):
    """Run ntJoin with a reference config file while also setting the
    'references' and 'reference_weights' variables on the command line.

    The placeholder value 'na' is given for both variables so that the run can
    only produce the expected paths if reference_config takes precedence over
    them. The command string and prefix are handed to launch_ntjoin, and its
    result is returned unchanged.
    """
    # The makefile variable is named 'references' (plural); the singular
    # 'reference' would set an unrelated variable and leave the override untested.
    cmd = "../ntJoin assemble -B target={target} target_weight=1 references=na reference_weights=na " \
          "reference_config={config} prefix={prefix} k=32 w={w} n={n}".format(target=target, config=config_file,
                                                                              prefix=prefix, w=window, n=n)
    return_paths = launch_ntjoin(cmd, prefix)
    return return_paths

# Following 4 tests to check for the expected PATHs given 2 pieces that should be merged
# together based on the reference in different orientations
# - Pieces are the reference piece split, with ~20bp in between
Expand Down Expand Up @@ -138,12 +141,33 @@ def test_regions_fr_rf():
assert paths.pop().split("\t")[1] in expected_paths
assert paths.pop().split("\t")[1] in expected_paths

def test_regions_fr_rf_config():
    """Misassembly correction with fwd-rev and rev-fwd joins, references given via config file."""
    allowed = (
        "2_1n-1_2n-:0-2176 212N 1_1p-2_2p+:2017-4489",
        "1_1p-2_2p+:0-1617 198N 2_1n-1_2n-:2675-4379",
    )
    observed = run_ntjoin_config("test_config_single.csv", "scaf.misassembled.f-r.r-f.fa",
                                 "regions-fr-rf_test", 500, n=2)
    # Exactly two distinct path lines are expected
    assert len(observed) == 2
    first, second = observed
    assert first != second
    # The second tab-separated field of each line must be one of the allowed paths
    for line in (second, first):
        assert line.split("\t")[1] in allowed

def test_regions_3():
    """ntJoin with one target and two references given on the command line."""
    observed = run_ntjoin_multiple("ref.fa", "scaf.f-f.copy.fa", "scaf.f-f.fa", "f-f-f_test", n=1)
    # A single path line is expected
    assert len(observed) == 1
    only_path = observed[0]
    assert only_path == "ntJoin0\t1_f+:0-1981 20N 2_f+:0-2329"

def test_regions_3_config():
    """ntJoin with one target and two references, the references listed in a config file."""
    observed = run_ntjoin_config("test_config_multiple.csv", "scaf.f-f.fa", "f-f-f_test", n=1)
    # A single path line is expected
    assert len(observed) == 1
    only_path = observed[0]
    assert only_path == "ntJoin0\t1_f+:0-1981 20N 2_f+:0-2329"

def test_regions_3_config_extra():
    """ntJoin with one target and two references from a config file, with extra parameters on the command."""
    observed = run_ntjoin_config_extra("test_config_multiple.csv", "scaf.f-f.fa", "f-f-f_test", n=1)
    # A single path line is expected
    assert len(observed) == 1
    only_path = observed[0]
    assert only_path == "ntJoin0\t1_f+:0-1981 20N 2_f+:0-2329"

# Testing AGP output
def test_mx_r_f():
"Testing ntJoin with assembly + reference, rev-fwd orientation - AGP output"
Expand Down
2 changes: 2 additions & 0 deletions tests/test_config_multiple.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ref.fa,2
scaf.f-f.copy.fa,2
1 change: 1 addition & 0 deletions tests/test_config_single.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ref.multiple.fa,2

0 comments on commit 9c9e58d

Please sign in to comment.