Skip to content

Commit a9c90f5

Browse files
authored
Merge pull request #172 from bio-miga/plasmids
Plasmids and Remote Overhaul
2 parents 3240538 + 8228f92 commit a9c90f5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+1211
-334
lines changed

lib/miga/cli/action/add_result.rb

Lines changed: 22 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,17 @@
55

66
class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
77
##
# Parse the command-line options of the add_result action.
#
# Sets the defaults (+force+ and +stdin_versions+ both off), registers the
# project / dataset / result selectors through +cli.opt_object+, and defines:
# - <tt>-f</tt>, <tt>--force</tt>: stored in <tt>cli[:force]</tt>
# - <tt>--stdin-versions</tt>: stored in <tt>cli[:stdin_versions]</tt>
def parse_cli
  cli.defaults = { force: false, stdin_versions: false }
  cli.parse do |opt|
    cli.opt_object(opt, [:project, :dataset_opt, :result])
    opt.on(
      '-f', '--force',
      'Force re-indexing of the result even if it\'s already registered'
    ) { |v| cli[:force] = v }
    opt.on(
      '--stdin-versions',
      'Read Software versions from STDIN'
    ) { |v| cli[:stdin_versions] = v }
  end
end
1721

@@ -21,5 +25,22 @@ def perform
2125
cli.say "Registering result: #{cli[:result]}"
2226
r = obj.add_result(cli[:result], true, force: cli[:force])
2327
raise 'Cannot add result, incomplete expected files' if r.nil?
28+
29+
# Add Software version data
30+
if cli[:stdin_versions]
31+
versions = {}
32+
sw = nil
33+
$stdin.each do |ln|
34+
ln = ln.chomp.strip
35+
if ln =~ /^=> (.*)/
36+
sw = $1
37+
versions[sw] = ''
38+
else
39+
versions[sw] += ln
40+
end
41+
end
42+
r.add_versions(versions)
43+
r.save
44+
end
2445
end
2546
end

lib/miga/cli/action/download/gtdb.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ def sanitize_cli
3131

3232
def remote_list
3333
cli.say 'Downloading genome list'
34-
extra = ['sp_reps_only=' + cli[:reference].to_s]
34+
extra = { sp_reps_only: cli[:reference].to_s }
3535
json = MiGA::RemoteDataset.download(
3636
:gtdb, :taxon, cli[:taxon], :genomes, nil, extra
3737
)

lib/miga/cli/action/download/ncbi.rb

Lines changed: 43 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -34,11 +34,8 @@ def cli_name_modifiers(opt)
3434
'Do not add sequence version to the dataset name',
3535
'Only affects --complete and --chromosome'
3636
) { |v| cli[:add_version] = v }
37-
cli.opt_flag(
38-
opt, 'legacy-name',
39-
'Use dataset names based on chromosome entries instead of assembly',
40-
:legacy_name
41-
)
37+
# For backwards compatibility
38+
cli.opt_flag(opt, 'legacy-name', '::HIDE::', :legacy_name)
4239
end
4340

4441
def sanitize_cli
@@ -52,89 +49,67 @@ def sanitize_cli
5249
end
5350

5451
##
# Retrieve the remote list of genomes, following pagination.
#
# Builds the query with +remote_list_query+, requests one page at a time
# through <tt>MiGA::RemoteDataset.download(:ncbi_datasets, :genome, ...)</tt>,
# and merges the output of +parse_reports_as_datasets+ for every page into a
# single Hash, which is returned.
#
# Iteration stops when a page is empty, carries no +:reports+, or carries no
# usable +:next_page_token+.
def remote_list
  list = {}
  query = remote_list_query
  loop do
    # Query the remote collection
    page = MiGA::Json.parse(
      MiGA::RemoteDataset.download(:ncbi_datasets, :genome, query, :json),
      contents: true
    )
    break unless page&.any? && page[:reports]&.any?

    # Process reports in this page
    list.merge!(parse_reports_as_datasets(page[:reports]))

    # Next page. An empty String is truthy in Ruby, so it is checked
    # explicitly: sending an empty token back would request the first page
    # again and never terminate.
    token = page[:next_page_token]
    break if !token || token.to_s.empty?

    query[:page_token] = token
  end
  list
end
6871

69-
def parse_csv_as_datasets(doc)
72+
##
# Convert an Array of assembly +reports+ (Hashes with Symbol keys) into a
# Hash of datasets to download, keyed by the name from +remote_report_name+.
#
# Reports whose +:accession+ is nil, empty, or '-' are skipped. Each value is
# a Hash with +:ids+, +:db+, +:universe+, and +:md+ (metadata). The metadata
# always carries +:type+, +:ncbi_asm+, +:strain+ (possibly nil), and the raw
# report under +:ncbi_dataset+. +:release_date+ is only set when the report
# includes one.
def parse_reports_as_datasets(reports)
  reports.each_with_object({}) do |report, ds|
    asm = report[:accession]
    next if asm.nil? || asm.empty? || asm == '-'

    # Register for download
    name = remote_report_name(report, asm)
    md = {
      type: :genome,
      ncbi_asm: asm,
      strain: report.dig(:organism, :infraspecific_names, :strain)
    }
    date = report.dig(:assembly_info, :release_date)
    md[:release_date] = Time.parse(date).to_s if date
    md[:ncbi_dataset] = report

    ds[name] = { ids: [asm], db: :assembly, universe: :ncbi, md: md }
  end
end
9292

93-
def remote_row_replicons(r)
94-
return if r['replicons'].nil?
95-
96-
r['replicons']
97-
.split('; ')
98-
.map { |i| i.gsub(/.*:/, '') }
99-
.map { |i| i.gsub(%r{/.*}, '') }
100-
end
101-
102-
def remote_row_name(r, rep, asm)
103-
return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
104-
105-
if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
106-
acc = rep.nil? ? '' : rep.first
107-
else
108-
acc = asm
109-
end
93+
##
# Dataset name for the assembly report +r+ with accession +asm+.
#
# The trailing sequence version (e.g. ".2") is removed from the accession
# unless <tt>cli[:add_version]</tt> is set. If the report has an organism
# name, it is prepended as "<organism>_<accession>". The result is passed
# through +miga_name+.
def remote_report_name(r, asm)
  # Non-mutating +sub+ keeps the caller's +asm+ untouched; the pattern is
  # anchored at the end, so at most one match is possible
  acc = cli[:add_version] ? asm.to_s : asm.to_s.sub(/\.\d+\Z/, '')
  org = r.dig(:organism, :organism_name)
  acc = "#{org}_#{acc}" if org
  acc.miga_name
end
113100

114-
def remote_list_url
115-
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
116-
url_param = {
117-
q: '[display()].' \
118-
'from(GenomeAssemblies).' \
119-
'usingschema(/schema/GenomeAssemblies).' \
120-
'matching(tab==["Prokaryotes"] and q=="' \
121-
"#{cli[:taxon]&.tr('"', "'")}\"",
122-
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
123-
'level|level,release_date|release_date,strain|strain',
124-
nolimit: 'on'
125-
}
101+
##
# Build the query Hash used by +remote_list+ to list genomes of
# <tt>cli[:taxon]</tt>.
#
# With <tt>cli[:reference]</tt>, only reference genomes are requested.
# Otherwise the assembly levels enabled in the CLI (+:contig+, +:scaffold+,
# +:chromosome+, +:complete+) are requested.
#
# Both restrictions are placed under +:filters+, where the NCBI Datasets v2
# request schema documents +reference_only+ and +assembly_level+. Level
# values are sent as bare enum names: the Hash is serialized by the download
# layer, so wrapping them in literal double quotes (as the former Solr URL
# builder needed) would corrupt the values.
def remote_list_query
  q = { taxons: [cli[:taxon]], filters: {} }
  if cli[:reference]
    q[:filters][:reference_only] = true
  else
    levels = {
      contig: 'contig',
      scaffold: 'scaffold',
      chromosome: 'chromosome',
      complete: 'complete_genome'
    }
    q[:filters][:assembly_level] = levels.select { |flag, _| cli[flag] }.values
  end
  q
end
140115
end

lib/miga/cli/action/download/seqcode.rb

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,8 +29,7 @@ def remote_list
2929

3030
while current_page <= total_pages
3131
json = MiGA::RemoteDataset.download(
32-
:seqcode, :'type-genomes', nil, :json, nil,
33-
["page=#{current_page}"]
32+
:seqcode, :'type-genomes', nil, :json, nil, page: current_page
3433
)
3534
doc = MiGA::Json.parse(json, contents: true)
3635
current_page = doc[:current_page] + 1

lib/miga/cli/action/ncbi_get.rb

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
88

99
def parse_cli
1010
cli.defaults = {
11-
query: false, unlink: false,
12-
reference: false, legacy_name: false,
11+
query: false, unlink: false, reference: false,
1312
complete: false, chromosome: false,
1413
scaffold: false, contig: false, add_version: true, dry: false,
1514
get_md: false, only_md: false, save_every: 1
@@ -29,12 +28,6 @@ def parse_cli
2928
'--api-key STRING',
3029
'::HIDE::' # For backwards compatibility
3130
) { |v| ENV['NCBI_API_KEY'] = v }
32-
opt.on(
33-
'--ncbi-table-file STRING',
34-
'::HIDE::' # Only meant for debugging
35-
# It can take the table returned by NCBI and parse it from a file
36-
# instead of downloading it directly
37-
) { |v| cli[:ncbi_table_file] = v }
3831
opt.on(
3932
'--ncbi-api-key STRING',
4033
'NCBI API key'

lib/miga/cli/action/wf.rb

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,8 @@ def default_opts_for_wf
88
cli.expect_files = true
99
cli.defaults = {
1010
clean: false, project_type: :genomes, dataset_type: :popgenome,
11-
ncbi_draft: true, min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
11+
ncbi_draft: true, ncbi_ref: false,
12+
min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
1213
prepare_and_exit: false
1314
}
1415
end
@@ -39,14 +40,21 @@ def opts_for_wf(opt, files_desc, params = {})
3940
'-T', '--ncbi-taxon STRING',
4041
'Download all the genomes in NCBI classified as this taxon'
4142
) { |v| cli[:ncbi_taxon] = v }
43+
opt.on(
44+
'--no-draft', '::HIDE::' # Deprecated
45+
) { |v| cli[:ncbi_draft] = v }
46+
opt.on(
47+
'--ncbi-complete',
48+
'Only download complete genomes, not drafts (requires -T)'
49+
) { |v| cli[:ncbi_draft] = !v }
50+
opt.on(
51+
'--ncbi-ref',
52+
'Only download RefSeq reference genomes (requires -T)'
53+
) { |v| cli[:ncbi_ref] = v }
4254
opt.on(
4355
'-G', '--gtdb-taxon STRING',
4456
'Download all the genomes in GTDB classified as this taxon'
4557
) { |v| cli[:gtdb_taxon] = v }
46-
opt.on(
47-
'--no-draft',
48-
'Only download complete genomes, not drafts (requires -T)'
49-
) { |v| cli[:ncbi_draft] = v }
5058
opt.on(
5159
'--gtdb-ref',
5260
'Only download reference anchor genomes in GTDB (requires -G)'
@@ -170,7 +178,8 @@ def initialize_empty_project(metadata)
170178
def download_datasets
171179
# Download datasets from NCBI
172180
unless cli[:ncbi_taxon].nil?
173-
what = cli[:ncbi_draft] ? '--all' : '--complete'
181+
what = cli[:ncbi_ref] ? '--reference' :
182+
cli[:ncbi_draft] ? '--all' : '--complete'
174183
cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
175184
cmd += ['--max', cli[:max_download]] if cli[:max_download]
176185
call_cli(cmd)

lib/miga/cli/objects_helper.rb

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,9 @@ def load_and_filter_datasets(silent = false)
6060
o &&= (d.ref? == self[:ref]) unless self[:ref].nil?
6161
o &&= (d.active? == self[:active]) unless self[:active].nil?
6262
o &&= (self[:multi] ? d.multi? : d.nonmulti?) unless self[:multi].nil?
63+
unless self[:markers].nil?
64+
o &&= (self[:markers] ? d.markers? : !d.markers?)
65+
end
6366
unless self[:taxonomy].nil?
6467
o &&= !d.metadata[:tax].nil? && d.metadata[:tax].in?(self[:taxonomy])
6568
end

lib/miga/cli/opt_helper.rb

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ def opt_common(opt)
4343
'-h', '--help',
4444
'Display this screen'
4545
) do
46-
puts opt.to_s.gsub(/^.*\s+::HIDE::\s*$/, '')
46+
puts opt.to_a.select { |i| i !~ /\s::HIDE::\s/ }
4747
exit
4848
end
4949
opt.separator ''
@@ -120,10 +120,11 @@ def opt_object(opt, what = %i[project dataset])
120120
# as determined by +what+ an Array with any combination of:
121121
# - :ref To filter by reference (--ref) or query (--no-ref)
122122
# - :multi To filter by multiple (--multi) or single (--no-multi) species
123+
# - :markers To filter by with (--markers) or without markers (--no-markers)
123124
# - :active To filter by active (--active) or inactive (--no-active)
124125
# - :taxonomy To filter by taxonomy (--taxonomy)
125126
# The "k-th" filter (--dataset-k) is always included
126-
def opt_filter_datasets(opt, what = %i[ref multi active taxonomy])
127+
def opt_filter_datasets(opt, what = %i[ref multi markers active taxonomy])
127128
what.each do |w|
128129
case w
129130
when :ref
@@ -136,6 +137,11 @@ def opt_filter_datasets(opt, what = %i[ref multi active taxonomy])
136137
'--[no-]multi',
137138
'Use only multi-species (or only single-species) datasets'
138139
) { |v| self[:multi] = v }
140+
when :markers
141+
opt.on(
142+
'--[no-]markers',
143+
'Use only datasets with (or without) markers'
144+
) { |v| self[:markers] = v }
139145
when :active
140146
opt.on(
141147
'--[no-]active',

0 commit comments

Comments
 (0)