Skip to content

Commit de77478

Browse files
committed
enabled to install with pip
1 parent ca07723 commit de77478

16 files changed

+976
-9
lines changed

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2018 JiangTao
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

cuteSV.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

setup.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# coding=utf-8
2+
3+
from setuptools import setup, find_packages
4+
5+
LONG_DESCRIPTION = '''Long-read sequencing technologies enable to comprehensively \
6+
discover structural variations (SVs). However, it is still non-trivial for \
7+
state-of-the-art approaches to detect SVs with high sensitivity or high performance \
8+
or both. Herein, we propose cuteSV, a sensitive, fast and lightweight SV detection \
9+
approach. cuteSV uses tailored methods to comprehensively collect various types of \
10+
SV signatures, and a clustering-and-refinement method to implement a stepwise SV \
11+
detection, which enables to achieve high sensitivity without loss of accuracy. \
12+
Benchmark results demonstrate that cuteSV has better yields on real datasets. \
13+
Further, its speed and scalability are outstanding and promising to large-scale data analysis.'''
14+
15+
setup(
16+
name = "cuteSV",
17+
version = "1.0.1",
18+
description = "Long read based human genomic structural variation detection with cuteSV",
19+
author = "Jiang Tao",
20+
author_email = "[email protected]",
21+
url = "https://github.com/tjiangHIT/cuteSV",
22+
license = "MIT",
23+
packages = find_packages("src"),
24+
package_dir = {"": "src"},
25+
data_files = [("", ["LICENSE"])],
26+
scripts=['src/cuteSV/cuteSV'],
27+
long_description = LONG_DESCRIPTION,
28+
zip_safe = False,
29+
install_requires = ['pysam', 'Biopython', 'Cigar', 'numpy']
30+
)

src/benchmarks/__init__.py

Whitespace-only changes.

src/benchmarks/cmp_NA12878.py

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
import sys
2+
import argparse
3+
import logging
4+
import time
5+
6+
def load_high_conf_bed(path):
7+
confbed = dict()
8+
file = open(path,'r')
9+
for line in file:
10+
seq = line.strip('\n').split('\t')
11+
chr = seq[0]
12+
svtype = seq[1]
13+
istart = int(seq[2])
14+
iend = int(seq[3])
15+
16+
if svtype not in confbed:
17+
confbed[svtype] = dict()
18+
if chr not in confbed[svtype]:
19+
confbed[svtype][chr] = list()
20+
confbed[svtype][chr].append([istart, iend, 0, 0, 0])
21+
file.close()
22+
return confbed
23+
24+
def pase_info(seq):
25+
info = {'SVLEN': 0, 'END': 0, "SVTYPE": '', "RE": 0, "CHR2": ''}
26+
for i in seq.split(';'):
27+
if i.split('=')[0] in ["SVLEN", "END", "RE"]:
28+
try:
29+
info[i.split('=')[0]] = abs(int(float(i.split('=')[1])))
30+
except:
31+
pass
32+
if i.split('=')[0] in ["SVTYPE", "CHR2"]:
33+
info[i.split('=')[0]] = i.split('=')[1]
34+
return info
35+
36+
def load_callset_cuteSV(path, filter):
37+
callset = dict()
38+
file = open(path, 'r')
39+
for line in file:
40+
seq = line.strip('\n').split('\t')
41+
if seq[0][0] == '#':
42+
continue
43+
44+
chr = seq[0]
45+
pos = int(seq[1])
46+
ALT = seq[2][7:10]
47+
# if ALT == "DUP":
48+
# ALT = "INS"
49+
if ALT not in ["INS", "INV", "DEL", "DUP", "BND"]:
50+
continue
51+
52+
info = pase_info(seq[7])
53+
54+
if ALT not in callset:
55+
callset[ALT] = dict()
56+
if chr not in callset[ALT]:
57+
if ALT == 'BND':
58+
callset[ALT][chr] = dict()
59+
else:
60+
callset[ALT][chr] = list()
61+
62+
if ALT == "BND":
63+
if info["CHR2"] not in callset[ALT][chr]:
64+
callset[ALT][chr][info["CHR2"]] = list()
65+
# if info["RE"] >= filter:
66+
callset[ALT][chr][info["CHR2"]].append([pos, info["END"], 0])
67+
else:
68+
# if info["SVLEN"] >= 50:
69+
if info["SVLEN"] >= 50 and info["SVLEN"] <= 100000:
70+
callset[ALT][chr].append([pos, info["SVLEN"], info["END"], 0])
71+
file.close()
72+
return callset
73+
74+
def eva_record(call_A, call_B, bias, offect):
75+
for svtype in call_A:
76+
if svtype not in call_B:
77+
continue
78+
else:
79+
if svtype == "BND":
80+
for chr1 in call_A[svtype]:
81+
if chr1 not in call_B[svtype]:
82+
continue
83+
else:
84+
for chr2 in call_A[svtype][chr1]:
85+
if chr2 not in call_B[svtype][chr1]:
86+
continue
87+
else:
88+
for i in call_A[svtype][chr1][chr2]:
89+
for j in call_B[svtype][chr1][chr2]:
90+
if abs(i[0] - j[0]) <= offect and abs(i[1] - j[1]) <= offect:
91+
i[2] += 1
92+
j[2] += 1
93+
else:
94+
pass
95+
else:
96+
for chr in call_A[svtype]:
97+
if chr not in call_B[svtype]:
98+
continue
99+
else:
100+
for i in call_A[svtype][chr]:
101+
for j in call_B[svtype][chr]:
102+
if i[0] - offect <= j[0] <= i[2] + offect or i[0] - offect <= j[2] <= i[2] + offect or j[0] - offect <= i[0] <= j[2] + offect:
103+
if min(i[1], j[1])*1.0/max(i[1], j[1]) >= bias:
104+
i[3] += 1
105+
j[3] += 1
106+
else:
107+
pass
108+
109+
def statistics_true_possitive(callset, SVTYPE="ALL"):
110+
record = 0
111+
non_record = 0
112+
if SVTYPE == "ALL":
113+
for svtype in callset:
114+
if svtype == "BND":
115+
for chr1 in callset[svtype]:
116+
for chr2 in callset[svtype][chr1]:
117+
for res in callset[svtype][chr1][chr2]:
118+
record += 1
119+
if res[2] == 0:
120+
non_record += 1
121+
else:
122+
for chr in callset[svtype]:
123+
for res in callset[svtype][chr]:
124+
record += 1
125+
if res[3] == 0:
126+
non_record += 1
127+
else:
128+
if SVTYPE == "BND":
129+
for chr1 in callset[SVTYPE]:
130+
for chr2 in callset[SVTYPE][chr1]:
131+
for res in callset[SVTYPE][chr1][chr2]:
132+
record += 1
133+
if res[2] == 0:
134+
non_record += 1
135+
else:
136+
for chr in callset[SVTYPE]:
137+
for res in callset[SVTYPE][chr]:
138+
record += 1
139+
if res[3] == 0:
140+
non_record += 1
141+
return record, non_record
142+
143+
def eva_hc(pac, ont, hc, bias, offect):
144+
for chr in hc['INS']:
145+
for ele in hc["INS"][chr]:
146+
if chr not in pac["INS"]:
147+
pass
148+
else:
149+
for i in pac["INS"][chr]:
150+
if ele[0] - offect <= i[0] <= ele[1] + offect:
151+
ele[2] = 1
152+
else:
153+
pass
154+
if chr in ont["INS"]:
155+
for j in ont["INS"][chr]:
156+
if ele[0] - offect <= j[0] <= ele[1] + offect:
157+
ele[3] = 1
158+
else:
159+
pass
160+
161+
for chr in hc['DEL']:
162+
for ele in hc['DEL'][chr]:
163+
if chr in pac['DEL']:
164+
for i in pac['DEL'][chr]:
165+
if ele[0] - offect <= i[0] <= ele[1] + offect or ele[0] - offect <= i[2] <= ele[1] + offect or i[0] - offect <= ele[0] <= i[2] + offect:
166+
if min(ele[1]-ele[0], i[1])*1.0/max(ele[1]-ele[0], i[1]) >= bias:
167+
ele[2] = 1
168+
if chr in ont['DEL']:
169+
for i in ont['DEL'][chr]:
170+
if ele[0] - offect <= i[0] <= ele[1] + offect or ele[0] - offect <= i[2] <= ele[1] + offect or i[0] - offect <= ele[0] <= i[2] + offect:
171+
if min(ele[1]-ele[0], i[1])*1.0/max(ele[1]-ele[0], i[1]) >= bias:
172+
ele[3] = 1
173+
174+
for SVTYPE in ["INS", 'DEL']:
175+
hc_pac = 0
176+
hc_ont = 0
177+
hc_pac_ont = 0
178+
hc_num = 0
179+
for chr in hc[SVTYPE]:
180+
for i in hc[SVTYPE][chr]:
181+
hc_num += 1
182+
if i[2] == 1:
183+
hc_pac += 1
184+
if i[3] == 1:
185+
hc_ont += 1
186+
if i[2]+i[3] == 2:
187+
i[4] = 1
188+
hc_pac_ont += 1
189+
logging.info("%s\thc:%d\tPacBio_hc:%d\tONT_hc:%d\tPacBio_ONT_hc:%d."%(SVTYPE, hc_num, hc_pac, hc_ont, hc_pac_ont))
190+
191+
def main_ctrl(args):
192+
pac = load_callset_cuteSV(args.C1, 10)
193+
ont = load_callset_cuteSV(args.C2, 5)
194+
hc = load_high_conf_bed(args.C3)
195+
# for SVTYPE in hc:
196+
# for chr in hc[SVTYPE]:
197+
# for i in hc[SVTYPE][chr]:
198+
# print(SVTYPE, chr, i)
199+
logging.info("Calculate overlaps.")
200+
eva_record(pac, ont, args.bias, args.offect)
201+
eva_hc(pac, ont, hc, args.bias, args.offect)
202+
svtype = ["DEL", "DUP", "INS", "INV", "BND", "ALL"]
203+
for i in svtype:
204+
pac_r, pac_nol = statistics_true_possitive(pac, i)
205+
pac_ol = pac_r - pac_nol
206+
ont_r, ont_nol = statistics_true_possitive(ont, i)
207+
ont_ol = ont_r - ont_nol
208+
logging.info("%s\tPacBio:%d %d %.4f\tONT:%d %d %.4f."%(i, pac_r, pac_ol, 1.0*pac_ol/pac_r, ont_r, ont_ol, 1.0*ont_ol/ont_r))
209+
210+
211+
def main(argv):
212+
args = parseArgs(argv)
213+
setupLogging(False)
214+
# print args
215+
starttime = time.time()
216+
main_ctrl(args)
217+
logging.info("Finished in %0.2f seconds."%(time.time() - starttime))
218+
219+
USAGE="""\
220+
Evaluate SV callset generated by cuteSV
221+
"""
222+
223+
def parseArgs(argv):
224+
parser = argparse.ArgumentParser(prog="Trio_eval", description=USAGE, formatter_class=argparse.RawDescriptionHelpFormatter)
225+
parser.add_argument("C1", type=str, help="PacBio callset")
226+
parser.add_argument('C2', type=str, help = "ONT callset")
227+
parser.add_argument('C3', type=str, help = "High confidence callset")
228+
parser.add_argument('-b', '--bias', help = "Bias of overlaping.[%(default)s]", default = 0.7, type = float)
229+
parser.add_argument('-o', '--offect', help = "Offect of translocation overlaping.[%(default)s]", default = 1000, type = int)
230+
args = parser.parse_args(argv)
231+
return args
232+
233+
def setupLogging(debug=False):
234+
logLevel = logging.DEBUG if debug else logging.INFO
235+
logFormat = "%(asctime)s [%(levelname)s] %(message)s"
236+
logging.basicConfig( stream=sys.stderr, level=logLevel, format=logFormat )
237+
logging.info("Running %s" % " ".join(sys.argv))
238+
239+
if __name__ == '__main__':
240+
main(sys.argv[1:])

0 commit comments

Comments
 (0)