forked from BV-BRC/bvbrc_metacats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
App-MetaCATS.pl
executable file
·236 lines (220 loc) · 7.59 KB
/
App-MetaCATS.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#The MSA application with variance analysis.
use Bio::KBase::AppService::AppScript;
use Bio::KBase::AppService::AppConfig;
use strict;
use P3DataAPI;
use Data::Dumper;
use File::Basename;
use File::Slurp;
use LWP::UserAgent;
use JSON::XS;
use JSON;
use IPC::Run qw(run);
use Cwd;
use Clone;
use URI::Escape;
my $script = Bio::KBase::AppService::AppScript->new(\&process_metacats);
my $data_api = Bio::KBase::AppService::AppConfig->data_api_url;
my $rc = $script->run(\@ARGV);
exit $rc;
sub process_metacats
{
my($app, $app_def, $raw_params, $params) = @_;
print "Proc Metacats ", Dumper($app_def, $raw_params, $params);
my $token = $app->token();
my $data_api_module = P3DataAPI->new($data_api, $token);
my $output_folder = $app->result_folder();
#
# Create an output directory under the current dir. App service is meant to invoke
# the app script in a working directory; we create a folder here to encapsulate
# the job output.
#
# We also create a staging directory for the input files from the workspace.
#
my $cwd = getcwd();
my $work_dir = "$cwd/work";
my $stage_dir = "$cwd/stage";
-d $work_dir or mkdir $work_dir or die "Cannot mkdir $work_dir: $!";
-d $stage_dir or mkdir $stage_dir or die "Cannot mkdir $stage_dir: $!";
my $data_api = Bio::KBase::AppService::AppConfig->data_api_url;
my $dat = { data_api => $data_api };
my $sstring = encode_json($dat);
#
# Read parameters and discover input files that need to be staged.
#
# Make a clone so we can maintain a list of refs to the paths to be
# rewritten.
#
my %in_files;
my $params_to_app = Clone::clone($params);
# Get values common to all inputs.
my $prefix = $params_to_app->{output_file};
my $p_value = $params_to_app->{p_value};
my $alphabet = $params_to_app->{alphabet};
my $input_type = $params_to_app->{input_type};
my $check_header = 0;
my $seqFile = "";
my $metaDataFile = "";
my $staged = {};
my $ofile = "$stage_dir/downloaded_sequences.fasta";
# Get values based on input type.
if ($input_type eq "files") {
# Get sequence file and metadata file in the staging directory.
my @to_stage;
$check_header = 1;
push(@to_stage, $params_to_app->{alignment_file});
push(@to_stage, $params_to_app->{group_file});
if (@to_stage)
{
warn Dumper(\@to_stage);
$staged = $app->stage_in(\@to_stage, $stage_dir, 1);
my %new_hash = %{ $staged };
$seqFile = $new_hash{$params_to_app->{alignment_file}};
$metaDataFile = $new_hash{$params_to_app->{group_file}};
}
} elsif ($input_type eq "groups") {
# Get the sequences and create a metadata file for the groups.
open(F, ">$ofile") or die "Could not open $ofile";
$metaDataFile = "$work_dir/metadata.tsv";
open(G, ">$metaDataFile") or die "Could not open $metaDataFile";
print G "Seq_ID\t$prefix\n";
for my $feature_name (@{$params_to_app->{groups}}) {
print STDOUT "Getting features in $feature_name\n";
my $ids = $data_api_module->retrieve_patricids_from_feature_group($feature_name);
my $seq = "";
if ($alphabet eq "na") {
$seq = $data_api_module->retrieve_nucleotide_feature_sequence($ids);
} else {
$seq = $data_api_module->retrieve_protein_feature_sequence($ids);
}
for my $id (@$ids) {
my $out = ">$id\n" . uc($seq->{$id}) . "\n";
print F $out;
print G "$id\t" . "$feature_name" . "\n";
}
}
close F;
close G;
} elsif ($input_type eq "auto") {
# Put the sequences in the ofile from the patric ids, and get the metadata file from the JSON object.
open(F, ">$ofile") or die "Could not open $ofile";
$metaDataFile = "$work_dir/metadata.tsv";
open(G, ">$metaDataFile") or die "Could not open $metaDataFile";
print G "Seq_ID\t$prefix\n";
my @ids = ();
for (@{$params->{auto_groups}}) {
push(@ids, $_->{id});
print G $_->{id} . "\t" . $_->{grp} . "\n";
}
my $seq = "";
if ($alphabet eq "na") {
$seq = $data_api_module->retrieve_nucleotide_feature_sequence(\@ids);
} else {
$seq = $data_api_module->retrieve_protein_feature_sequence(\@ids);
}
for my $id (@ids) {
print F ">$id\n" . uc($seq->{$id}) . "\n";
}
} else {
die("Unrecognized input type.");
}
if (($input_type eq "groups") or ($input_type eq "auto")) {
# Align the sequences.
my @mafft_cmd = ("mafft", "--auto", "--preservecase", $ofile);
my $string_cmd = join(" ", @mafft_cmd);
print STDOUT "Running mafft.\n";
print STDOUT "$string_cmd\n";
$seqFile = "$work_dir/output.afa";
my $ok = run(\@mafft_cmd, "1>", $seqFile, "2>", "$work_dir/$prefix.mafft.log");
if (!$ok) {
die "Mafft command failed.\n";
}
print STDOUT "Finished mafft.\n";
}
# Replace leading and trailing gaps with the '#' symbol in the sequence file.
open my $fh, '<', "$seqFile" or die "Cannot open $seqFile: $!";
my $adjusted_seqFile = "$work_dir/adjusted_seqs.fasta";
open(OUT, '>', "$adjusted_seqFile") or die "Cannot open $adjusted_seqFile: $!";
my $seq_string = "";
my $header = "";
while ( my $line = <$fh> ) {
chomp $line;
if (substr($line, 0, 1) eq ">") {
if ($header) {
print OUT "$header\n";
}
$header = $line;
if ($seq_string) {
$seq_string = replace_gaps($seq_string);
print OUT "$seq_string\n";
}
$seq_string = "";
} else {
$seq_string = $seq_string.$line;
}
}
if ($header) {
print OUT "$header\n";
}
if ($seq_string) {
$seq_string = replace_gaps($seq_string);
print OUT "$seq_string\n";
}
close OUT;
# Run the analysis.
my @cmd = ("metadata_parser", $adjusted_seqFile, $metaDataFile, $alphabet, $p_value, $check_header, "$work_dir/");
run_cmd(\@cmd);
my @output_suffixes = (
[qr/Table\.tsv$/, "tsv"],
[qr/\.log$/, "txt"],
[qr/\.afa$/, "aligned_protein_fasta"]
);
opendir(D, $work_dir) or die "Cannot opendir $work_dir: $!";
my @files = sort { $a cmp $b } grep { -f "$work_dir/$_" } readdir(D);
my $output=1;
for my $file (@files)
{
for my $suf (@output_suffixes)
{
if ($file =~ $suf->[0])
{
$output=0;
my $path = "$output_folder/$file";
my $type = $suf->[1];
$app->workspace->save_file_to_file("$work_dir/$file", {}, "$output_folder/$file", $type, 1,
(-s "$work_dir/$file" > 10_000 ? 1 : 0), # use shock for larger files
$token);
}
}
}
#
# Clean up staged input files.
#
while (my($orig, $staged_file) = each %$staged)
{
unlink($staged_file) or warn "Unable to unlink $staged_file: $!";
}
return $output;
}
sub replace_gaps {
my($line) = @_;
my $start_len = 0;
my $end_len = 0;
if ($line =~ /^(-+)/) {
$start_len = length($1);
}
if ($line =~ /(-+)$/) {
$end_len = length($1);
}
substr($line, 0, $start_len) = '#' x $start_len;
substr($line, length($line) - $end_len, $end_len) = '#' x $end_len;
return $line;
}
sub run_cmd() {
my $cmd = $_[0];
my $ok = run(@$cmd);
if (!$ok)
{
die "Command failed: @$cmd\n";
}
}