-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathscaffold_synteny.py
executable file
·480 lines (433 loc) · 24.6 KB
/
scaffold_synteny.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
#!/usr/bin/env python
#
# scaffold_synteny.py created 2019-03-27
# v1.2 add print all option and to remove gene features 2022-10-27
# v1.3 print used options in the scaffold table 2023-01-08
# v1.4 add some auto processing for GenBank format genomes 2023-04-17
# v1.41 bugfix for output 2023-08-15
'''
scaffold_synteny.py v1.4 last modified 2023-08-15
makes a table of gene matches between two genomes, to detect synteny
these can be converted into a dotplot of gene matches
options are:
-b : tabular blast output
-f and -F : fasta files of scaffolds for query and subject genomes
-q and -d : GFF files of genes for query and subject genomes
-l and -L : total length to keep for each genome, in MB
i.e. to only take long scaffolds up to length -l
scaffold_synteny.py -b monbr1_vs_srosetta_blastp.tab -q Monbr1_augustus_v1_no_comment.gff -d Srosetta_mrna_only_ID_renamed.gff -f Monbr1_scaffolds.fasta -F Salpingoeca_rosetta.dna.toplevel.fa.gz -l 40 -L 50 > monbr1_vs_srosetta_scaffold2d_points.tab
output consists of 8 columns, which differ by data, but are all
in the same output file
first type is scaffold information, for both genomes, as:
genome (either 1 or 2) scaffold-name number length
fraction of total assembly cumulative sum cumulative fraction mode
s1 contig_001_length_2062630 1 2062630 0.023656 2062630 0.023656 -
second type is positions of matches, relative to total length:
symbol indicating gene gene contig of gene match to other genome
scaffold of match position in genome 1 position in genome 2 bitscore
g braker1_g09939 contig_176_length_141897 g5612.t1 scaffold_5 72253822 38899018 104.0
generate tabular blast data with blastp:
blastp -query Monbr1_augustus_v1.prot_no_rename.fasta -db Salpingoeca_rosetta.pep.all.fa -outfmt 6 -num_threads 6 -evalue 1e-3 -max_target_seqs 100 > monbr1_vs_srosetta_blastp.tab
generate plot using synteny_2d_plot.R:
Rscript synteny_2d_plot.R monbr1_vs_srosetta_scaffold2d_points.tab Monosiga-brevicollis Salpingoeca-rosetta
'''
import sys
import argparse
import time
import re
import gzip
import random
from collections import defaultdict
from Bio import SeqIO
def make_exclude_dict(excludefile):
'''read file of list of contigs, and return a dict where keys are contig names to exclude'''
sys.stderr.write("# Reading exclusion list {} {}\n".format(excludefile, time.asctime() ) )
exclusion_dict = {}
for term in open(excludefile,'r'):
term = term.strip()
if term[0] == ">":
term = term[1:]
exclusion_dict[term] = True
sys.stderr.write("# Found {} contigs to exclude {}\n".format( len(exclusion_dict), time.asctime() ) )
return exclusion_dict
def make_seq_length_dict(contigsfile, maxlength, exclusiondict, option_mode, wayout, isref=False):
'''read fasta file, and return dict where key is scaffold name and value is length, also print the length of each scaffold to stdout'''
lengthdict = {}
if contigsfile.rsplit('.',1)[-1]=="gz": # autodetect gzip format
opentype = gzip.open
sys.stderr.write("# Parsing genomic contigs {} as gzipped {}\n".format(contigsfile, time.asctime() ) )
else: # otherwise assume normal open for fasta format
opentype = open
sys.stderr.write("# Parsing genomic contigs {} {}\n".format(contigsfile, time.asctime() ) )
for seqrec in SeqIO.parse(opentype(contigsfile,'rt'), "fasta"):
lengthdict[seqrec.id] = len(seqrec.seq)
sys.stderr.write("# Found {} contigs {}\n".format(len(lengthdict), time.asctime() ) )
# make scaffold key as s1 for query and s2 for reference db
scafkey = "s2" if isref else "s1"
totalgenomesize = sum( list(lengthdict.values() ) )
sorteddict = {} # key is scaffold, value is offset relative to previous scaffolds
lengthsum = 0 # cumulative sum of lengths of large scaffolds
short_skip_mode = False
if maxlength is None: # set max length to 99Gb, and activate short_skip_mode
maxlength = 99999
short_skip_mode = True
maxlength_MB = 1000000*maxlength
sys.stderr.write("# Sorting contigs by length, keeping up to {}Mbp\n".format(maxlength) )
# keep the first N scaffolds, where total length is approximately maxlength
scaffoldcounter = 0
for k,v in sorted(lengthdict.items(), key=lambda x: x[1], reverse=True): # take longest first
if k in exclusiondict:
continue
if short_skip_mode and v < 10000: # quit if contigs are 10kb or shorter
break
scaffoldcounter += 1
sorteddict[k] = lengthsum
lengthsum += v
wayout.write("{}\t{}\t{}\t{}\t{:.6f}\t{}\t{:.6f}\t{}\n".format(scafkey, k, scaffoldcounter, v, v*1.0/totalgenomesize, lengthsum, lengthsum*1.0/totalgenomesize, option_mode ) )
if lengthsum >= maxlength_MB: # keep adding scaffolds until length limit is hit or exceeded
break
sys.stderr.write("# Kept {} contigs, for {} bases, last contig was {}bp long {}\n".format( len(sorteddict), lengthsum, v, time.asctime() ) )
return sorteddict
def parse_gtf(gtffile, excludedict, delimiter, do_ignore_gene, isref=False, is_verbose=False):
'''from a gtf, return a dict of dicts where keys are scaffold names, then gene names, and values are gene info as a tuple of start end and strand direction'''
prot_to_gene_dict = {} # stores protein to gene IDs, in case of unique numbering e.g. GenBank IDs
if isref:
genesbyscaffold = {} # key is reference gene ID, value is a 2-item list of scaffold and gene midpoint position
else:
genesbyscaffold = defaultdict(dict) # scaffolds as key, then gene name, then gene position integer
if gtffile.rsplit('.',1)[-1]=="gz": # autodetect gzip format
opentype = gzip.open
sys.stderr.write("# Parsing loci from {} as gzipped {}\n".format(gtffile, time.asctime() ) )
else: # otherwise assume normal open for fasta format
opentype = open
sys.stderr.write("# Parsing loci from {} {}\n".format(gtffile, time.asctime() ) )
for line in opentype(gtffile,'rt'):
line = line.strip()
if line and not line[0]=="#": # ignore empty lines and comments
lsplits = line.split("\t")
scaffold = lsplits[0]
if excludedict and excludedict.get(scaffold, False):
continue # skip anything that hits to excludable scaffolds
feature = lsplits[2]
attributes = lsplits[8]
attrd = dict([(field.strip().split("=",1)) for field in attributes.split(";") if field.count("=")])
if feature=="transcript" or feature=="mRNA" or (feature=="gene" and do_ignore_gene is False):
# regex search expects gff format
# should allow a-z , A-Z _ . | + -
raw_geneid = attrd.get("ID",None)
#raw_geneid = re.search('ID=([\w\.\|\+\-]+)', attributes).group(1)
if raw_geneid is None:
print( "ERROR: cannot extract ID= for {}\n{}".format(attributes, line) , file=sys.stderr )
# if a delimiter is given for either query or db, then split
if delimiter:
geneid = geneid.rsplit(delimiter,1)[0]
else:
geneid = raw_geneid
# generate midpoint of each gene as average of start and end positions
genemidpoint = (int(lsplits[3]) + int(lsplits[4])) / 2
if isref:
genesbyscaffold[geneid] = [scaffold,genemidpoint]
else:
genesbyscaffold[scaffold][geneid] = genemidpoint
elif feature=="CDS": # for GenBank, Parent=rna-XM_029783594.1 protein_id=XP_029639454.1
raw_mrna_id = attrd.get("Parent",None)
raw_prot_id = attrd.get("protein_id",None)
if isref: # refs are indexed in reverse, from blast hit to gene ID
prot_to_gene_dict[raw_prot_id] = raw_mrna_id
else: # indexed by GFF to find protein of the query
if raw_prot_id is None: # may be in format where prots and transcripts have the same ID
prot_to_gene_dict[raw_mrna_id] = raw_mrna_id
else: # meaning normal
prot_to_gene_dict[raw_mrna_id] = raw_prot_id
if is_verbose:
print("## key {} value {} last tx {}".format(raw_mrna_id, raw_prot_id, raw_geneid), file=sys.stderr)
if len(genesbyscaffold) > 0:
genetotal = sum( list( map( len, genesbyscaffold.values() ) ) )
sys.stderr.write("# Found {} genes {}\n".format( genetotal, time.asctime() ) )
sys.stderr.write("# GFF names parsed as {} from {}\n".format( geneid, raw_geneid ) )
if prot_to_gene_dict:
sys.stderr.write("# {} RNA to protein IDs parsed as {} from {}\n".format( len(prot_to_gene_dict), raw_mrna_id, raw_prot_id ) )
return genesbyscaffold, prot_to_gene_dict
else:
sys.stderr.write("# WARNING: NO GENES FOUND\n")
def parse_tabular_blast(blasttabfile, evaluecutoff, querydelimiter, refdelimiter, maxhits, group_removal_max):
'''read tabular blast file, return a dict where key is query ID and value is dict of subject ID and bitscore'''
if blasttabfile.rsplit('.',1)[-1]=="gz": # autodetect gzip format
opentype = gzip.open
sys.stderr.write("# Parsing tabular blast output {} as gzipped {}\n".format(blasttabfile, time.asctime() ) )
else: # otherwise assume normal open for fasta format
opentype = open
sys.stderr.write("# Parsing tabular blast output {} {}\n".format(blasttabfile, time.asctime() ) )
query_to_sub_dict = defaultdict( lambda: defaultdict(int) )
evalueRemovals = 0
subjectcounter = defaultdict(int)
for line in opentype(blasttabfile, 'rt'):
line = line.strip()
lsplits = line.split("\t")
# qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore
queryseq = lsplits[0].rsplit(querydelimiter,1)[0]
subjectid = lsplits[1].rsplit(refdelimiter,1)[0]
# filter by evalue
if float(lsplits[10]) > evaluecutoff:
evalueRemovals += 1
continue
# otherwise add the entry to first dict
bitscore = float(lsplits[11])
query_to_sub_dict[queryseq][subjectid] += bitscore
subjectcounter[subjectid] += 1
sys.stderr.write("# Found blast hits for {} query sequences, removed {} hits by evalue {}\n".format( len(query_to_sub_dict), evalueRemovals, time.asctime() ) )
# filter by number of hits
total_kept = 0
large_group_removals_qu = {} # to prevent multiple counting, store keys
large_group_removals_sb = {} # or possibly to later check what was removed
filtered_hit_dict = defaultdict( lambda: defaultdict(int) )
for queryseq, subdict in query_to_sub_dict.items():
num_hits = len(subdict)
if num_hits >= group_removal_max: # remove proteins with many hits, as large protein families likely lead to spurious synteny
large_group_removals_qu[queryseq] = True
continue
hit_counter = 0 # reset for each query, to take no more than maxhits
for subseq, bits in sorted(subdict.items(), key=lambda x: x[1], reverse=True): # sort by bitscore
if subjectcounter.get(subseq, 0) >= group_removal_max: # flag for removal of group
large_group_removals_sb[subseq] = True
continue
if hit_counter >= maxhits: # meaning already took enough matches
continue
filtered_hit_dict[queryseq][subseq] = bits
hit_counter += 1 # should never get above maxhits
total_kept += hit_counter
sys.stderr.write("# Removed {} queries and {} subjects with {} or more hits\n".format( len(large_group_removals_qu), len(large_group_removals_sb), group_removal_max ) )
sys.stderr.write("# Blast names parsed as {} from {}, and {} from {}\n".format( queryseq,lsplits[0], subjectid,lsplits[1] ))
sys.stderr.write("# Kept {} blast hits\n".format( total_kept ) )
return filtered_hit_dict
def make_self_blast_dict(gene_positions):
'''for haplotype comparison mode, return dict of dicts, where key, key are the same seq ID, and value is 100'''
artificial_hit_dict = defaultdict( lambda: defaultdict(int) )
gene_counter = 0
sys.stderr.write("# Converting gene list to self hits {}\n".format( time.asctime() ) )
for scaf,posdict in gene_positions.items():
for gene,pos in posdict.items():
artificial_hit_dict[gene][gene] = 100
sys.stderr.write("# Using {} genes for positions\n".format( len(artificial_hit_dict ) ) )
return artificial_hit_dict
def generate_synteny_points(queryScafOffset, dbScafOffset, queryPos, dbPos, blastdict, prot_to_gene_dict, give_local_positions, do_print_all, is_genbank, wayout):
'''combine all datasets and for each gene on the query scaffolds, print tab delimited data to stdout'''
is_verbose = False
printed_queries = {} # key is gene ID, value is True
printed_targets = {} # key is gene ID, value is True
scaffoldtotals = defaultdict(int) # counts of total genes for each scaffold
sys.stderr.write("# Determining match positions {}\n".format( time.asctime() ) )
# force sorted order based on the scaffold order from the fasta file
for scaffold, queryoffset in sorted(queryScafOffset.items(), key=lambda x: x[1]):
genedict = queryPos.get(scaffold,{}) # if no genes, then skip scaffold at next for loop
scaffoldcounts = defaultdict(int) # counts of hits to each reference scaffold
for gene, localposition in genedict.items():
scaffoldtotals[scaffold] += 1
if give_local_positions: # if using local positions, ignore all ofsets and use only localposition
overallposition = localposition
else: # using global positions
overallposition = localposition + queryoffset
# if using GenBank GFFs and proteins directly, then convert from mRNA position ID to blasted protein
# this only works if the IDs are unique accession numbers
# i.e. if both are AUGUSTUS IDs like g1.t1 then the two genomes will interfere with each other
if is_genbank:
blast_qgene = prot_to_gene_dict.get(gene, None)
else:
blast_qgene = gene
if is_verbose:
print("## gene {} blast {} scaffold {}".format(gene, blast_qgene, scaffold), file=sys.stderr)
# then get matches for that gene
# blast_qgene should be the fasta protein header, or protein_id from a GFF
blasthits = blastdict.get(blast_qgene, None)
if blasthits is None:
if do_print_all: # make fake entry for non-matches
blasthits = { "NA": 0 }
else: # skip query genes with no matches
continue
# iterate through dict of matches
for matchgene, bitscore in sorted(blasthits.items(), key=lambda x: x[1], reverse=True):
if is_genbank: # if using 2 GenBank genomes, process the same for the match
blast_matchgene = prot_to_gene_dict.get(matchgene, None)
else:
blast_matchgene = matchgene
matchscaf, matchposition = dbPos.get(blast_matchgene, [None, None])
if matchscaf is None: # would mean blast hit is not in the GFF
if do_print_all: # fix to NA and 0 for printing anyway
matchscaf = "NA"
matchposition = 0
else:
continue
# check offset for db scaffold
matchoffset = dbScafOffset.get(matchscaf, None)
if matchoffset is None: # blast hit is not on a kept db scaffolds
if do_print_all: # fix to 0 for printing
matchoffset = 0
else:
continue
if give_local_positions:
overallmatchpos = matchposition
else: # using global positions
overallmatchpos = matchposition + matchoffset
scaffoldcounts[matchscaf] += 1
# write each match to file
printed_queries[gene] = True
printed_targets[matchgene] = True
wayout.write("g\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(blast_qgene, scaffold, matchgene, matchscaf, overallposition, overallmatchpos, bitscore) )
if printed_queries:
sys.stderr.write("# Wrote match positions for {} genes\n".format( len(printed_queries) ) )
else:
sys.stderr.write("# WARNING: NO MATCHES FOUND, CHECK -Q AND -D\n")
if do_print_all:
for db_gene in dbPos.keys():
if db_gene not in printed_targets:
matchscaf, matchposition = dbPos.get(db_gene, [None, None])
matchoffset = dbScafOffset.get(matchscaf, 0)
overallmatchpos = matchposition + matchoffset
printed_targets[db_gene] = True
wayout.write("g\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format("NA", "NA", matchgene, matchscaf, 0, overallmatchpos, 0) )
if printed_targets:
sys.stderr.write("# Wrote target positions for {} genes\n".format( len(printed_targets) ) )
def randomize_genes_globally(refdict):
'''take the query gtf dict and randomize the gene names for all genes on all scaffolds, return a similar dict of dicts'''
genepositions = {} # store gene positions
randomgenelist = []
sys.stderr.write("# Globally randomizing query gene positions {}\n".format( time.asctime() ) )
for scaffold, genedict in refdict.items(): # iterate first to get list of all genes
for genename in genedict.keys():
randomgenelist.append(genename)
genepositions[genename] = genedict[genename]
# randomize the list
random.shuffle(randomgenelist)
# reiterate in same order, but store random gene names at the same position
genecounter = 0
randomgenesbyscaf = defaultdict(dict) # scaffolds as key, then gene name, then genemapping tuple
for scaffold, genedict in refdict.items(): # iterate again to reassign genes to each scaffold
for genename, bounds in genedict.items():
randomgenesbyscaf[scaffold][randomgenelist[genecounter]] = genepositions[genename]
genecounter += 1
sys.stderr.write("# Randomized {} genes {}\n".format(genecounter, time.asctime() ) )
return randomgenesbyscaf
def randomize_genes_locally(refdict):
'''take the query gtf dict and randomize the gene names for genes within each scaffold, return a similar dict of dicts'''
genepositions = {} # store gene positions
scaffoldcount = 0
total_genes = 0
randomgenesbyscaf = defaultdict(dict) # scaffolds as key, then gene name, then genemapping tuple
sys.stderr.write("# Randomizing query gene positions by scaffold {}\n".format( time.asctime() ) )
for scaffold, genedict in refdict.items(): # iterate first to get list of all genes
scaffoldcount += 1
randomgenelist = [] # generate new list to randomize for each scaffold
for genename in genedict.keys():
randomgenelist.append(genename)
genepositions[genename] = genedict[genename]
# randomize the list
random.shuffle(randomgenelist)
# reiterate in same order, but store random gene names at the same position
genecounter = 0
for genename, bounds in genedict.items(): # iterate again to reassign genes to each scaffold
randomgenesbyscaf[scaffold][randomgenelist[genecounter]] = genepositions[genename]
genecounter += 1
total_genes += 1
sys.stderr.write("# Randomized {} genes on {} scaffolds {}\n".format(total_genes, scaffoldcount, time.asctime() ) )
return randomgenesbyscaf
def randomize_db_locally(refdict):
'''take the ref gtf dict and randomize the gene positions for genes within each scaffold, return a similar dict of lists'''
scaffoldcount = 0
total_genes = 0
randomgenesbyscaf = {} # key is reference gene ID, value is list of scaffold and gene midpoint position
sys.stderr.write("# Randomizing reference gene positions by scaffold {}\n".format( time.asctime() ) )
for scaffold, genedict in refdict.items(): # iterate first to get list of all genes
scaffoldcount += 1
randomposlist = [] # generate new list of the gene positions to randomize for each scaffold
for genename in genedict.keys():
randomposlist.append( genedict[genename] )
# randomize the list
random.shuffle(randomposlist)
# reiterate in same order, but store random positions for each gene
genecounter = 0
for genename, bounds in genedict.items(): # iterate again to reassign genes to each scaffold
randomgenesbyscaf[genename] = [scaffold, randomposlist[genecounter]]
genecounter += 1
total_genes += 1
sys.stderr.write("# Randomized {} genes on {} scaffolds {}\n".format(total_genes, scaffoldcount, time.asctime() ) )
return randomgenesbyscaf
def main(argv, wayout):
if not len(argv):
argv.append("-h")
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-b','--blast', help="tabular blast or diamond output format 6, can be .gz")
parser.add_argument('-f','--query-fasta', help="fasta file of query scaffolds, can be .gz", required=True)
parser.add_argument('-F','--db-fasta', help="fasta file of reference scaffolds, can be .gz", required=True)
parser.add_argument('-q','--query-gff', help="GFF file of query genes", required=True)
parser.add_argument('-d','--db-gff', help="GFF file of reference genes", required=True)
parser.add_argument('-Q','--query-delimiter', help="gene transcript separator for query [.]")
parser.add_argument('-D','--db-delimiter', help="gene transcript separator for db [.]")
parser.add_argument('--blast-query-delimiter', help="gene transcript separator for blast query [|]", default='|')
parser.add_argument('--blast-db-delimiter', help="gene transcript separator for blast ref [|]", default='|')
parser.add_argument('-E','--exclude', help="file of list of bad contigs, from either genome")
parser.add_argument('-e','--evalue', type=float, default=1e-4, help="evalue cutoff for post blast filtering [1e-4]")
parser.add_argument('-c','--coverage', default=0.8, type=int, help="minimum alignment coverage [0.8]")
parser.add_argument('-l','--query-genome-len', type=int, help="total length of query scaffolds to keep, ie. chromosome sized, otherwise only takes 10kb or longer")
parser.add_argument('-L','--db-genome-len', type=int, help="total length of reference scaffolds")
parser.add_argument('-M','--maximum-hits', metavar="N", type=int, default=1, help="keep maximum of N hits per query [1]")
parser.add_argument('-G','--group-size-maximum', metavar="N", type=int, default=250, help="remove queries with more than N hits, e.g. transposons [250]")
parser.add_argument('-R','--global-randomize', help="globally randomize gene positions of query GFF, cannot use with -S", action="store_true")
parser.add_argument('-S','--scaffold-randomize', help="randomize gene positions of query GFF within each scaffold, cannot use with -R", action="store_true")
parser.add_argument('--double-randomize', help="randomize gene positions of db, use with -S", action="store_true")
parser.add_argument('--local-positions', help="output points as local positions on each scaffold, not global position", action="store_true")
parser.add_argument('--compare-haplotypes', help="compare gene placement between two haplotypes, blast -b is ignored", action="store_true")
parser.add_argument('--ignore-gene-features', help="skip gene features, but will still use mRNA and transcript", action="store_true")
parser.add_argument('--genbank-gff', help="use presets when proteins and GFF files are from GenBank", action="store_true")
parser.add_argument('--print-no-match', help="print lines for all queries, including those without blast matches", action="store_true")
args = parser.parse_args(argv)
# check for required tabular blast output file
if args.blast is None:
if args.compare_haplotypes:
print( "# comparing haplotypes, ignoring -b, will assume that all transcripts blast only to self {}".format(time.asctime()), file=sys.stderr )
else:
raise OSError("ERROR: blast output file -b is required, or use --compare-haplotypes mode if you are comparing haplotype assemblies of the same organism")
exclusiondict = make_exclude_dict(args.exclude) if args.exclude else {}
# get option mode string
option_mode = "" # should be in ["r", "s", "sd", "l", "n", "ln", "-"]
if args.global_randomize:
option_mode += "r"
elif args.scaffold_randomize:
option_mode += "s"
if args.double_randomize:
option_mode += "d"
if args.local_positions:
option_mode += "l"
if args.print_no_match:
option_mode += "n"
if not option_mode: # if empty string, set to hyphen
option_mode = "-"
# read both sets of scaffolds
query_scaf_lengths = make_seq_length_dict(args.query_fasta, args.query_genome_len, exclusiondict, option_mode, wayout, False)
db_scaf_lengths = make_seq_length_dict(args.db_fasta, args.db_genome_len, exclusiondict, option_mode, wayout, True)
prot_to_tx_dict = {}
# read query as normal
# and assign to query_gene_pos, as a dict of dicts
query_gene_pos, q_p2tx_d_raw = parse_gtf(args.query_gff, exclusiondict, args.query_delimiter, args.ignore_gene_features, False)
prot_to_tx_dict.update(q_p2tx_d_raw)
### IF DOING RANDOMIZATION ###
if args.global_randomize:
query_gene_pos = randomize_genes_globally(query_gene_pos)
elif args.scaffold_randomize:
query_gene_pos = randomize_genes_locally(query_gene_pos)
### IF DOING DOUBLE RANDOMIZE ###
if args.double_randomize: # read db first as query format, randomize and generate the dict in the ref format
db_gene_pos, d_p2tx_d_raw = parse_gtf(args.db_gff, exclusiondict, args.db_delimiter, args.ignore_gene_features, False)
db_gene_pos = randomize_db_locally(db_gene_pos)
# if NOT RANDOMIZING REFERENCE #
else: # otherwise read as normal into the ref format
db_gene_pos, d_p2tx_d_raw = parse_gtf(args.db_gff, exclusiondict, args.db_delimiter, args.ignore_gene_features, True)
prot_to_tx_dict.update(d_p2tx_d_raw)
# read blast hits
if args.compare_haplotypes:
blastdict = make_self_blast_dict(query_gene_pos)
else:
blastdict = parse_tabular_blast(args.blast, args.evalue, args.blast_query_delimiter, args.blast_db_delimiter, args.maximum_hits, args.group_size_maximum)
# write output
generate_synteny_points( query_scaf_lengths, db_scaf_lengths, query_gene_pos, db_gene_pos, blastdict, prot_to_tx_dict, args.local_positions, args.print_no_match, args.genbank_gff, wayout)
if __name__ == "__main__":
main(sys.argv[1:],sys.stdout)