1
+ import sys
2
+ import argparse
3
+ import logging
4
+ import time
5
+
6
+ def load_high_conf_bed (path ):
7
+ confbed = dict ()
8
+ file = open (path ,'r' )
9
+ for line in file :
10
+ seq = line .strip ('\n ' ).split ('\t ' )
11
+ chr = seq [0 ]
12
+ svtype = seq [1 ]
13
+ istart = int (seq [2 ])
14
+ iend = int (seq [3 ])
15
+
16
+ if svtype not in confbed :
17
+ confbed [svtype ] = dict ()
18
+ if chr not in confbed [svtype ]:
19
+ confbed [svtype ][chr ] = list ()
20
+ confbed [svtype ][chr ].append ([istart , iend , 0 , 0 , 0 ])
21
+ file .close ()
22
+ return confbed
23
+
24
+ def pase_info (seq ):
25
+ info = {'SVLEN' : 0 , 'END' : 0 , "SVTYPE" : '' , "RE" : 0 , "CHR2" : '' }
26
+ for i in seq .split (';' ):
27
+ if i .split ('=' )[0 ] in ["SVLEN" , "END" , "RE" ]:
28
+ try :
29
+ info [i .split ('=' )[0 ]] = abs (int (float (i .split ('=' )[1 ])))
30
+ except :
31
+ pass
32
+ if i .split ('=' )[0 ] in ["SVTYPE" , "CHR2" ]:
33
+ info [i .split ('=' )[0 ]] = i .split ('=' )[1 ]
34
+ return info
35
+
36
+ def load_callset_cuteSV (path , filter ):
37
+ callset = dict ()
38
+ file = open (path , 'r' )
39
+ for line in file :
40
+ seq = line .strip ('\n ' ).split ('\t ' )
41
+ if seq [0 ][0 ] == '#' :
42
+ continue
43
+
44
+ chr = seq [0 ]
45
+ pos = int (seq [1 ])
46
+ ALT = seq [2 ][7 :10 ]
47
+ # if ALT == "DUP":
48
+ # ALT = "INS"
49
+ if ALT not in ["INS" , "INV" , "DEL" , "DUP" , "BND" ]:
50
+ continue
51
+
52
+ info = pase_info (seq [7 ])
53
+
54
+ if ALT not in callset :
55
+ callset [ALT ] = dict ()
56
+ if chr not in callset [ALT ]:
57
+ if ALT == 'BND' :
58
+ callset [ALT ][chr ] = dict ()
59
+ else :
60
+ callset [ALT ][chr ] = list ()
61
+
62
+ if ALT == "BND" :
63
+ if info ["CHR2" ] not in callset [ALT ][chr ]:
64
+ callset [ALT ][chr ][info ["CHR2" ]] = list ()
65
+ # if info["RE"] >= filter:
66
+ callset [ALT ][chr ][info ["CHR2" ]].append ([pos , info ["END" ], 0 ])
67
+ else :
68
+ # if info["SVLEN"] >= 50:
69
+ if info ["SVLEN" ] >= 50 and info ["SVLEN" ] <= 100000 :
70
+ callset [ALT ][chr ].append ([pos , info ["SVLEN" ], info ["END" ], 0 ])
71
+ file .close ()
72
+ return callset
73
+
74
+ def eva_record (call_A , call_B , bias , offect ):
75
+ for svtype in call_A :
76
+ if svtype not in call_B :
77
+ continue
78
+ else :
79
+ if svtype == "BND" :
80
+ for chr1 in call_A [svtype ]:
81
+ if chr1 not in call_B [svtype ]:
82
+ continue
83
+ else :
84
+ for chr2 in call_A [svtype ][chr1 ]:
85
+ if chr2 not in call_B [svtype ][chr1 ]:
86
+ continue
87
+ else :
88
+ for i in call_A [svtype ][chr1 ][chr2 ]:
89
+ for j in call_B [svtype ][chr1 ][chr2 ]:
90
+ if abs (i [0 ] - j [0 ]) <= offect and abs (i [1 ] - j [1 ]) <= offect :
91
+ i [2 ] += 1
92
+ j [2 ] += 1
93
+ else :
94
+ pass
95
+ else :
96
+ for chr in call_A [svtype ]:
97
+ if chr not in call_B [svtype ]:
98
+ continue
99
+ else :
100
+ for i in call_A [svtype ][chr ]:
101
+ for j in call_B [svtype ][chr ]:
102
+ if i [0 ] - offect <= j [0 ] <= i [2 ] + offect or i [0 ] - offect <= j [2 ] <= i [2 ] + offect or j [0 ] - offect <= i [0 ] <= j [2 ] + offect :
103
+ if min (i [1 ], j [1 ])* 1.0 / max (i [1 ], j [1 ]) >= bias :
104
+ i [3 ] += 1
105
+ j [3 ] += 1
106
+ else :
107
+ pass
108
+
109
+ def statistics_true_possitive (callset , SVTYPE = "ALL" ):
110
+ record = 0
111
+ non_record = 0
112
+ if SVTYPE == "ALL" :
113
+ for svtype in callset :
114
+ if svtype == "BND" :
115
+ for chr1 in callset [svtype ]:
116
+ for chr2 in callset [svtype ][chr1 ]:
117
+ for res in callset [svtype ][chr1 ][chr2 ]:
118
+ record += 1
119
+ if res [2 ] == 0 :
120
+ non_record += 1
121
+ else :
122
+ for chr in callset [svtype ]:
123
+ for res in callset [svtype ][chr ]:
124
+ record += 1
125
+ if res [3 ] == 0 :
126
+ non_record += 1
127
+ else :
128
+ if SVTYPE == "BND" :
129
+ for chr1 in callset [SVTYPE ]:
130
+ for chr2 in callset [SVTYPE ][chr1 ]:
131
+ for res in callset [SVTYPE ][chr1 ][chr2 ]:
132
+ record += 1
133
+ if res [2 ] == 0 :
134
+ non_record += 1
135
+ else :
136
+ for chr in callset [SVTYPE ]:
137
+ for res in callset [SVTYPE ][chr ]:
138
+ record += 1
139
+ if res [3 ] == 0 :
140
+ non_record += 1
141
+ return record , non_record
142
+
143
+ def eva_hc (pac , ont , hc , bias , offect ):
144
+ for chr in hc ['INS' ]:
145
+ for ele in hc ["INS" ][chr ]:
146
+ if chr not in pac ["INS" ]:
147
+ pass
148
+ else :
149
+ for i in pac ["INS" ][chr ]:
150
+ if ele [0 ] - offect <= i [0 ] <= ele [1 ] + offect :
151
+ ele [2 ] = 1
152
+ else :
153
+ pass
154
+ if chr in ont ["INS" ]:
155
+ for j in ont ["INS" ][chr ]:
156
+ if ele [0 ] - offect <= j [0 ] <= ele [1 ] + offect :
157
+ ele [3 ] = 1
158
+ else :
159
+ pass
160
+
161
+ for chr in hc ['DEL' ]:
162
+ for ele in hc ['DEL' ][chr ]:
163
+ if chr in pac ['DEL' ]:
164
+ for i in pac ['DEL' ][chr ]:
165
+ if ele [0 ] - offect <= i [0 ] <= ele [1 ] + offect or ele [0 ] - offect <= i [2 ] <= ele [1 ] + offect or i [0 ] - offect <= ele [0 ] <= i [2 ] + offect :
166
+ if min (ele [1 ]- ele [0 ], i [1 ])* 1.0 / max (ele [1 ]- ele [0 ], i [1 ]) >= bias :
167
+ ele [2 ] = 1
168
+ if chr in ont ['DEL' ]:
169
+ for i in ont ['DEL' ][chr ]:
170
+ if ele [0 ] - offect <= i [0 ] <= ele [1 ] + offect or ele [0 ] - offect <= i [2 ] <= ele [1 ] + offect or i [0 ] - offect <= ele [0 ] <= i [2 ] + offect :
171
+ if min (ele [1 ]- ele [0 ], i [1 ])* 1.0 / max (ele [1 ]- ele [0 ], i [1 ]) >= bias :
172
+ ele [3 ] = 1
173
+
174
+ for SVTYPE in ["INS" , 'DEL' ]:
175
+ hc_pac = 0
176
+ hc_ont = 0
177
+ hc_pac_ont = 0
178
+ hc_num = 0
179
+ for chr in hc [SVTYPE ]:
180
+ for i in hc [SVTYPE ][chr ]:
181
+ hc_num += 1
182
+ if i [2 ] == 1 :
183
+ hc_pac += 1
184
+ if i [3 ] == 1 :
185
+ hc_ont += 1
186
+ if i [2 ]+ i [3 ] == 2 :
187
+ i [4 ] = 1
188
+ hc_pac_ont += 1
189
+ logging .info ("%s\t hc:%d\t PacBio_hc:%d\t ONT_hc:%d\t PacBio_ONT_hc:%d." % (SVTYPE , hc_num , hc_pac , hc_ont , hc_pac_ont ))
190
+
191
+ def main_ctrl (args ):
192
+ pac = load_callset_cuteSV (args .C1 , 10 )
193
+ ont = load_callset_cuteSV (args .C2 , 5 )
194
+ hc = load_high_conf_bed (args .C3 )
195
+ # for SVTYPE in hc:
196
+ # for chr in hc[SVTYPE]:
197
+ # for i in hc[SVTYPE][chr]:
198
+ # print(SVTYPE, chr, i)
199
+ logging .info ("Calculate overlaps." )
200
+ eva_record (pac , ont , args .bias , args .offect )
201
+ eva_hc (pac , ont , hc , args .bias , args .offect )
202
+ svtype = ["DEL" , "DUP" , "INS" , "INV" , "BND" , "ALL" ]
203
+ for i in svtype :
204
+ pac_r , pac_nol = statistics_true_possitive (pac , i )
205
+ pac_ol = pac_r - pac_nol
206
+ ont_r , ont_nol = statistics_true_possitive (ont , i )
207
+ ont_ol = ont_r - ont_nol
208
+ logging .info ("%s\t PacBio:%d %d %.4f\t ONT:%d %d %.4f." % (i , pac_r , pac_ol , 1.0 * pac_ol / pac_r , ont_r , ont_ol , 1.0 * ont_ol / ont_r ))
209
+
210
+
211
+ def main (argv ):
212
+ args = parseArgs (argv )
213
+ setupLogging (False )
214
+ # print args
215
+ starttime = time .time ()
216
+ main_ctrl (args )
217
+ logging .info ("Finished in %0.2f seconds." % (time .time () - starttime ))
218
+
219
+ USAGE = """\
220
+ Evaluate SV callset generated by cuteSV
221
+ """
222
+
223
+ def parseArgs (argv ):
224
+ parser = argparse .ArgumentParser (prog = "Trio_eval" , description = USAGE , formatter_class = argparse .RawDescriptionHelpFormatter )
225
+ parser .add_argument ("C1" , type = str , help = "PacBio callset" )
226
+ parser .add_argument ('C2' , type = str , help = "ONT callset" )
227
+ parser .add_argument ('C3' , type = str , help = "High confidence callset" )
228
+ parser .add_argument ('-b' , '--bias' , help = "Bias of overlaping.[%(default)s]" , default = 0.7 , type = float )
229
+ parser .add_argument ('-o' , '--offect' , help = "Offect of translocation overlaping.[%(default)s]" , default = 1000 , type = int )
230
+ args = parser .parse_args (argv )
231
+ return args
232
+
233
+ def setupLogging (debug = False ):
234
+ logLevel = logging .DEBUG if debug else logging .INFO
235
+ logFormat = "%(asctime)s [%(levelname)s] %(message)s"
236
+ logging .basicConfig ( stream = sys .stderr , level = logLevel , format = logFormat )
237
+ logging .info ("Running %s" % " " .join (sys .argv ))
238
+
239
+ if __name__ == '__main__' :
240
+ main (sys .argv [1 :])
0 commit comments