Skip to content

Commit abc220f

Browse files
committed
Always filter :largecontigs by length
Previously, datasets initiated from an assembly bypassed this filter. Now all loaded assemblies are processed to remove contigs shorter than 1 kbp.
1 parent 939adb2 commit abc220f

File tree

5 files changed

+36
-16
lines changed

5 files changed

+36
-16
lines changed

lib/miga/common/format.rb

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ def tabulate(header, values, tabular = false)
2323
end
2424

2525
##
26-
# Cleans a FastA file in place.
27-
def clean_fasta_file(file)
26+
# Cleans a FastA file in place, removing all sequences shorter than
27+
# +min_len+
28+
def clean_fasta_file(file, min_len = 1)
2829
tmp_fh = nil
2930
tmp_path = nil
3031
begin
@@ -39,19 +40,24 @@ def clean_fasta_file(file)
3940
tmp_path = tmp_fh.path
4041
fh = File.open(file, 'r')
4142
end
42-
buffer = ''.dup
43+
next_seq = ['', '']
4344
fh.each_line do |ln|
4445
ln.chomp!
4546
if ln =~ /^>\s*(\S+)(.*)/
4647
id, df = $1, $2
47-
tmp_fh.print buffer.wrap_width(80)
48-
buffer = ''.dup
49-
tmp_fh.puts ">#{id.gsub(/[^A-Za-z0-9_\|\.]/, '_')}#{df}"
48+
if next_seq[1].length >= min_len
49+
tmp_fh.puts next_seq[0]
50+
tmp_fh.print next_seq[1].wrap_width(80)
51+
end
52+
next_seq = [">#{id.gsub(/[^A-Za-z0-9_\|\.]/, '_')}#{df}", '']
5053
else
51-
buffer << ln.gsub(/[^A-Za-z\.\-]/, '')
54+
next_seq[1] += ln.gsub(/[^A-Za-z\.\-]/, '')
5255
end
5356
end
54-
tmp_fh.print buffer.wrap_width(80)
57+
if next_seq[1].length >= min_len
58+
tmp_fh.puts next_seq[0]
59+
tmp_fh.print next_seq[1].wrap_width(80)
60+
end
5561
tmp_fh.close
5662
fh.close
5763
FileUtils.mv(tmp_path, file)

lib/miga/dataset/result/add.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def add_result_assembly(base, opts)
8484
opts[:is_clean] ||= false
8585
r.clean! if opts[:is_clean]
8686
unless r.clean?
87-
MiGA::MiGA.clean_fasta_file(r.file_path(:largecontigs))
87+
MiGA::MiGA.clean_fasta_file(r.file_path(:largecontigs), 1000)
8888
r.clean!
8989
end
9090
r

lib/miga/version.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,15 @@ module MiGA
1212
# - String indicating release status:
1313
# - rc* release candidate, not released as gem
1414
# - [0-9]+ stable release, released as gem
15-
VERSION = [1.3, 18, 0].freeze
15+
VERSION = [1.3, 19, 0].freeze
1616

1717
##
1818
# Nickname for the current major.minor version.
1919
VERSION_NAME = 'mezzotint'
2020

2121
##
2222
# Date of the current gem release.
23-
VERSION_DATE = Date.new(2024, 7, 12)
23+
VERSION_DATE = Date.new(2024, 7, 17)
2424

2525
##
2626
# References of MiGA

scripts/assembly.bash

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ if [[ -s "$DATASET/scaffold.fa" ]] ; then
5858
else
5959
ln -s "$DATASET/contig.fa" "$DATASET.AllContigs.fna"
6060
fi
61-
FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2>=1000{print $1}' \
61+
FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2 >= 1000 { print $1 }' \
6262
| FastA.filter.pl /dev/stdin "$DATASET.AllContigs.fna" \
6363
> "$DATASET.LargeContigs.fna"
6464

test/result_stats_test.rb

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,23 +83,37 @@ def test_assembly
8383
# Prepare result
8484
dir = 'data/05.assembly'
8585
fa = file_path(dir, '.LargeContigs.fna')
86-
File.open(fa, 'w') { |fh| fh.puts '>1', 'ACTAC' }
86+
File.open(fa, 'w') { |fh| fh.puts('>1', 'ACTAC' * 500) }
8787
touch_done(dir)
8888
r = dataset.add_result(:assembly)
8989

9090
# Test assertions
9191
assert_equal({}, r[:stats])
9292
r.compute_stats
9393
assert_equal(1, r[:stats][:contigs])
94-
assert_equal([5, 'bp'], r[:stats][:total_length])
94+
assert_equal([2500, 'bp'], r[:stats][:total_length])
9595
assert_equal([40.0, '%'], r[:stats][:g_c_content])
9696
end
9797

98+
def test_large_contigs
99+
# Prepare result
100+
dir = 'data/05.assembly'
101+
fa = file_path(dir, '.LargeContigs.fna')
102+
File.open(fa, 'w') { |fh| fh.puts('>1', 'ACTAC' * 50) }
103+
touch_done(dir)
104+
r = dataset.add_result(:assembly)
105+
106+
# Test assertions
107+
assert_equal({}, r[:stats])
108+
r.compute_stats
109+
assert_equal(0, r[:stats][:contigs])
110+
end
111+
98112
def test_cds
99113
# Prepare result
100114
dir = 'data/06.cds'
101115
fa = file_path(dir, '.faa')
102-
File.open(fa, 'w') { |fh| fh.puts '>1', 'M' }
116+
File.open(fa, 'w') { |fh| fh.puts('>1', 'M' * 500) }
103117
gff = file_path(dir, '.gff3.gz')
104118
Zlib::GzipWriter.open(gff) do |fh|
105119
fh.puts '# Model Data: a=b;transl_table=11;'
@@ -111,7 +125,7 @@ def test_cds
111125
assert_equal({}, r[:stats])
112126
r.compute_stats
113127
assert_equal(1, r[:stats][:predicted_proteins])
114-
assert_equal([1.0, 'aa'], r[:stats][:average_length])
128+
assert_equal([500.0, 'aa'], r[:stats][:average_length])
115129
assert_nil(r[:stats][:coding_density])
116130
test_assembly
117131
r.compute_stats

0 commit comments

Comments
 (0)