forked from weizhongli/cdhit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clstr_quality_eval.pl
executable file
·137 lines (115 loc) · 3.39 KB
/
clstr_quality_eval.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env perl
## calculate the sensitivity and specificity of clusters
## if the input fasta file has pre-defined classification term
## such as COG, pfam, TAXID etc.
## these terms can be used as benchmark to calcuate
## the sensitivity and specificity of clustering
## sensitivity is the ratio of number of correct pairs in cd-hit to number of pairs in benchmark
## specificity is the ratio of number of correct pairs in cd-hit to number of all cd-hit pairs
## here a pair is a pair of seqs that in same cluster
## the fasta file defline should be ">id||term"
## for example: ">id1||COG00001";
my %pair_status = ();
my %bench_clusters = ();
my %cdhit_clusters = ();
my %seqs = ();
my $seq_idx = 0;
my @clstr = ();
my $readin = 0;
my $i = 0;
while($ll=<>) {
if ($ll =~ /^>/) {
if ($readin) {
if (@clstr > 1) { #skip singleton
$cdhit_clusters{$i} = [@clstr];
$i++;
}
}
@clstr = ();
}
else {
$readin=1;
chop($ll);
if ($ll =~ /(\d+)(aa|nt), >(.+)\.\.\./) {
$id = $3;
my ($seq_id, $ben_id) = split(/\|\|/, $id);
if ($ben_id) {
if (not defined($bench_clusters{$ben_id})) {
$bench_clusters{$ben_id} = [];
}
push(@{$bench_clusters{$ben_id}},$seq_idx);
}
push(@clstr, $seq_idx);
push(@seqs,$seq_id); $seq_idx++;
}
}
}
if ($readin) {
if (@clstr > 1) { #skip singleton
$cdhit_clusters{$i} = [@clstr];
$i++;
}
}
#process pairs in cdhit
my $total_cdhit_pairs=0;
foreach my $family (keys %cdhit_clusters) {
my @seqs = @{$cdhit_clusters{$family}};
@seqs = sort {$a <=> $b} @seqs;
my $N = $#seqs+1;
for ($i=0; $i<$N-1; $i++) {
my $idi = $seqs[$i];
for ($j=$i+1; $j<$N; $j++){
my $idj = $seqs[$j];
$pair_status{"$idi|$idj"} = "C"; #cdhit pairs
$total_cdhit_pairs++;
}
}
}
#process pairs in benchmark
my $total_bench_pairs = 0;
my $correct_pairs=0;
foreach my $family (keys %bench_clusters) {
my @seqs = @{$bench_clusters{$family}};
next unless (@seqs > 1); #skip singleton
@seqs = sort {$a <=> $b} @seqs;
my $N = $#seqs+1;
for ($i=0; $i<$N-1; $i++) {
my $idi = $seqs[$i];
for ($j=$i+1; $j<$N; $j++){
my $idj = $seqs[$j];
if (defined( $pair_status{"$idi|$idj"} )) {
$correct_pairs++;
$pair_status{"$idi|$idj"} = "T"; #True pair
}
else {
$pair_status{"$idi|$idj"} = "B"; #bench pairs
}
$total_bench_pairs++;
}
}
}
my $sen = $correct_pairs / $total_bench_pairs;
my $spe = $correct_pairs / $total_cdhit_pairs;
print "Total benchmark pairs\t$total_bench_pairs\n";
print "Total cd-hit pairs\t$total_cdhit_pairs\n";
print "Total correct pairs\t$correct_pairs\n";
print "Sensitivity\t$sen\n";
print "Specificity\t $spe\n";
print "\n\nPairs in benchmark but not in cd-hit\n";
foreach $pair (keys %pair_status) {
next unless ($pair_status{$pair} eq "B");
my ($idx1, $idx2) = split(/\|/,$pair);
print "$seqs[$idx1]\t$seqs[$idx2]\n";
}
print "\n\nPairs in cd-hit but not in benchmark\n";
foreach $pair (keys %pair_status) {
next unless ($pair_status{$pair} eq "C");
my ($idx1, $idx2) = split(/\|/,$pair);
print "$seqs[$idx1]\t$seqs[$idx2]\n";
}
print "\n\nPairs in both cd-hit and benchmark\n";
foreach $pair (keys %pair_status) {
next unless ($pair_status{$pair} eq "T");
my ($idx1, $idx2) = split(/\|/,$pair);
print "$seqs[$idx1]\t$seqs[$idx2]\n";
}