-
Notifications
You must be signed in to change notification settings - Fork 1
/
reindexMissingFeatures.pl
executable file
·121 lines (86 loc) · 2.97 KB
/
reindexMissingFeatures.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env perl
###########################################################
#
# Script to reindex features that were dropped/deleted from
# Solr during annotation updates or for other reasons. The
# missing features are reindexed using the original index
# files generated by the annotation service or those prepared
# from Solr backup files.
#
###########################################################
use strict;
use warnings;
use FindBin qw($Bin);
use Getopt::Long::Descriptive;
use JSON;
use Data::Dumper;
use lib "$Bin";
use SolrAPI;
# Base URL of the Solr server (from the PATRIC_SOLR environment variable);
# read by getFeatureIDs() when it builds its query URL.
my $solrServer = $ENV{PATRIC_SOLR};

# Query-string suffix asking Solr for CSV output with a tab (%09) as the
# field separator and ';' between the values of multi-valued fields.
my $solrFormat = "&wt=csv&csv.separator=%09&csv.mv.separator=;";

# NOTE(review): $solrh is not used by any code visible in this file; it is
# kept in case the SolrAPI constructor has side effects -- confirm.
my $solrh = SolrAPI->new($ENV{PATRIC_DATA_API}, $ENV{PATRIC_REFERENCE_DATA});

my $json = JSON->new->allow_nonref;

my ($opt, $usage) = describe_options(
    "%c %o",
    [],
    ["genome_list=s", "File containing list of annotation files"],
    ["commit=s", "Commit updates to Solr, true|false", { default => "false"}],
    [],
    ["help|h", "Print usage message and exit"]
);

print($usage->text), exit 0 if $opt->help;
die($usage->text) unless $opt->genome_list;

my $genome_list = $opt->genome_list;

# Three-argument open with a lexical handle: the list name can no longer be
# interpreted as an open mode or a pipe.
open my $list_fh, '<', $genome_list
    or die "Can't open genome_list file: $genome_list!!\n";

# Each non-empty line of the list names one JSON file holding the features
# of a single genome.
FILE:
while (my $file_name = <$list_fh>) {
    chomp $file_name;
    next FILE unless length $file_name;

    # An unreadable file is reported and skipped; the rest of the list is
    # still processed.
    my $genome_fh;
    unless (open $genome_fh, '<', $file_name) {
        warn "Can't open $file_name: $!\n";
        next FILE;
    }
    my $content = do { local $/; <$genome_fh> };
    close $genome_fh;

    # A malformed file is reported and skipped instead of aborting the run.
    my $features = eval { $json->decode(defined $content ? $content : '') };
    if ($@) {
        warn "Can't parse $file_name: $@";
        next FILE;
    }

    # The genome id is read from the first feature, so the file must decode
    # to a non-empty array of hashes.
    unless (ref($features) eq 'ARRAY'
        && @{$features}
        && ref($features->[0]) eq 'HASH'
        && defined $features->[0]->{genome_id}) {
        warn "No features with a genome_id in $file_name, skipping\n";
        next FILE;
    }

    my $genome_id = $features->[0]->{genome_id};
    print "Processing $genome_id\t$file_name\n";

    # feature_id => patric_id for every PATRIC feature Solr already holds
    # for this genome.
    my $fids = getFeatureIDs($genome_id);

    # Collect the PATRIC-annotated features of the file that Solr lacks.
    my @update_features;
    FEATURE:
    foreach my $feature (@{$features}) {
        next FEATURE unless ref($feature) eq 'HASH';

        # only PATRIC annotations are candidates for reindexing
        my $annotation = $feature->{annotation};
        next FEATURE unless defined $annotation && $annotation =~ /PATRIC/;

        # skip if the feature is already present in the database
        my $feature_id = $feature->{feature_id};
        next FEATURE if defined $feature_id && $fids->{$feature_id};

        # else, missing feature, push it for reindexing
        push @update_features, $feature;
    }

    my $update_file = "$genome_id.update.json";
    print "\tPrepare $update_file\n";
    my $feature_json = $json->pretty->encode(\@update_features);

    # Never post a file that could not be written completely.
    my $update_fh;
    unless (open $update_fh, '>', $update_file) {
        warn "Can't write $update_file: $!\n";
        next FILE;
    }
    print {$update_fh} $feature_json;
    unless (close $update_fh) {
        warn "Can't finish writing $update_file: $!\n";
        next FILE;
    }

    if ($opt->commit =~ /true|yes/i) {
        print "\tPost $update_file\n";
        # List-form system() bypasses the shell, so characters in the
        # data-derived file name cannot be interpreted as shell syntax.
        # Unlike the former backticks, the helper's stdout is not captured.
        system('post.update.sh', 'genome_feature', $update_file) == 0
            or warn "post.update.sh failed for $update_file (status $?)\n";
    }
}

close $list_fh;

# Look up the PATRIC-annotated features that Solr currently holds for one
# genome.
#
# Arguments:
#   $genome_id - genome identifier, interpolated into the Solr query.
#
# Returns:
#   Hash reference mapping feature_id => patric_id, one entry per
#   tab-separated result line. The hash is empty when the query yields no
#   rows; a failing wget is reported with a warning and whatever was parsed
#   is still returned.
#
# Reads the file-level variables $solrServer and $solrFormat.
sub getFeatureIDs {
    my ($genome_id) = @_;

    my $core   = "/genome_feature";
    my $query  = "/select?q=annotation:PATRIC AND genome_id:$genome_id";
    my $fields = "&fl=patric_id,feature_id";
    my $rows   = "&rows=1000000";
    my $sort   = "";
    my $solrQuery = $solrServer.$core.$query.$fields.$rows.$sort.$solrFormat;

    my %FID = ();

    # List-form pipe open runs wget without a shell, so nothing in the
    # query (including $genome_id) can be interpreted as shell syntax.
    my $wget_fh;
    unless (open $wget_fh, '-|', 'wget', '-q', '-O', '-', $solrQuery) {
        warn "Can't run wget for genome $genome_id: $!\n";
        return \%FID;
    }

    LINE:
    while (my $line = <$wget_fh>) {
        chomp $line;

        # Drop the CSV header row (same filter as the former
        # `grep -v patric_id`).
        next LINE if index($line, 'patric_id') >= 0;

        # Requested field order is patric_id, then feature_id, separated
        # by a tab; lines that do not fit are ignored rather than stored
        # under an undefined key.
        my ($patric_id, $feature_id) = $line =~ /(.*)\t(.*)/;
        next LINE unless defined $feature_id && length $feature_id;

        $FID{$feature_id} = $patric_id;
    }

    # close() on a pipe returns false when the command exits non-zero.
    close $wget_fh
        or warn "wget query for genome $genome_id failed (status $?)\n";

    return \%FID;
}