Skip to content

Commit

Permalink
minor updates
Browse files Browse the repository at this point in the history
  • Loading branch information
jiadong324 committed Apr 7, 2024
1 parent a12edc5 commit bb310b0
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 30 deletions.
14 changes: 7 additions & 7 deletions src/collection/cluster_signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
from src.collection.classes import Cluster, read_fai_file


def partition_and_cluster(signatures, fai_file, genome, chr, this_sample_path, options):
def partition_and_cluster(signatures, chr, this_sample_path, options):

partitions = signature_partition(signatures, options.patition_max_distance)
partitions = signature_partition(signatures, options)

# print(f'Number of partitions: {len(partitions)}')

Expand All @@ -20,7 +20,7 @@ def partition_and_cluster(signatures, fai_file, genome, chr, this_sample_path, o
# print(len(partition))

# clusters = []
clusters = cluster_partitions(partitions, chr, fai_file, genome, this_sample_path, options)
clusters = cluster_partitions(partitions, chr, this_sample_path, options)

# print(f'Number of clusters: {len(clusters)}')

Expand Down Expand Up @@ -48,24 +48,24 @@ def cluster_positions_simple(positions, max_delta):
def distance_positions(position1, position2):
return float("inf") if position1[0] != position2[0] else abs(position1[1] - position2[1])

def signature_partition(signatures, max_distance):
def signature_partition(signatures, options):

sorted_signatures = sorted(signatures, key=lambda evi: evi.get_key())
partitions = []
current_partition = []
for signature in sorted_signatures:
if len(current_partition) > 0 and current_partition[-1].position_distance_to(signature) > max_distance:
if len(current_partition) > options.min_support and current_partition[-1].position_distance_to(signature) > options.patition_max_distance:
partitions.append(current_partition[:])
current_partition = []

current_partition.append(signature)

if len(current_partition) > 0:
if len(current_partition) > options.min_support:
partitions.append(current_partition[:])

return partitions

def cluster_partitions(partitions, chrom, fai_file, genome, this_sample_path, options):
def cluster_partitions(partitions, chrom, this_sample_path, options):
'''
Using hierarchical clustering to cluster signatures in each partition
:param partitions:
Expand Down
23 changes: 5 additions & 18 deletions src/collection/collect_signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def analyze_alignments(aligns, bam, options, part_num):

min_mapq = 0 if options.contig is True else options.min_mapq

all_possible_chrs = pysam.FastaFile(options.genome).references
# all_possible_chrs = pysam.FastaFile(options.genome).references

# # collect reads's pm and sa
reads_dict = {}
Expand All @@ -139,11 +139,10 @@ def analyze_alignments(aligns, bam, options, part_num):
continue

# # align to a ref that not in genome reference
align_chr = align.reference_name

if align_chr not in all_possible_chrs:
logging.warning("{0} not in reference index file, skip this read".format(align_chr))
continue
# align_chr = align.reference_name
# if align_chr not in all_possible_chrs:
# logging.warning("{0} not in reference index file, skip this read".format(align_chr))
# continue

align_ref_id = bam.get_tid(align.reference_name)

Expand All @@ -157,9 +156,6 @@ def analyze_alignments(aligns, bam, options, part_num):

# print(f'Number of high-quality reads in window: {len(reads_dict)}')

black_regions = {'chr1': [(143184831, 143275233)], 'chr16': [(46377798, 46418333)]}
mapper = bam.header.get('PG')[0]['ID']

# traverse reads
read_num = 0
seg_signatures = []
Expand All @@ -169,15 +165,6 @@ def analyze_alignments(aligns, bam, options, part_num):
# print(align.reference_start, align.reference_end, align.query_alignment_start, align.query_alignment_end, align.is_supplementary, align.is_reverse)

this_qname_aligns = reads_dict[qname]
this_qname_chr = bam.get_reference_name(this_qname_aligns[0].reference_id)
filtered_query = False

## Add V1.3.6, Handeling reads in two black regions for minimap2, enriched of SDs
if mapper == 'minimap2' and not options.contig and this_qname_chr in black_regions:
regions = black_regions[this_qname_chr]
filtered_query = filter_alignment_in(this_qname_aligns, regions)
if filtered_query:
continue

## End Add

Expand Down
2 changes: 1 addition & 1 deletion src/collection/run_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def run_detect(options, sample_path, chrom, part_num, start, end):


# # partition and clusters signatures to get cluster info
clusters = partition_and_cluster(sv_signatures, fai_file, genome_file, chrom, sample_path, options)
clusters = partition_and_cluster(sv_signatures, chrom, sample_path, options)
# # write cluster info to file
writer_cluster_to_file(clusters, chrom, part_num, options)

Expand Down
6 changes: 3 additions & 3 deletions src/network/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def cluster_original_callset(callset_path, out_path, sample_path, cluster_out_fi
if len(item_list) == 1:
filter_type = 'Uncovered'
else:
filter_type = 'Clustered'
filter_type = 'PASS'

# convert to vcf format
line = convert_to_vcf_format(new_cluster, sample_path, filter_type)
Expand Down Expand Up @@ -284,7 +284,7 @@ def merge_split_vcfs(in_dir, merged_vcf_path, max_score, min_score, spec_chroms,
print("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">", file=merged_vcf)
print("##INFO=<ID=BKPS,Number=.,Type=String,Description=\"All breakpoints (length-start-end) in this region, where CSV might contain multiple breakpoints.\">", file=merged_vcf)
print("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"CNN predicted SV type, containing INS, DEL, DUP, tDUP (tandem duplication) and INV\">", file=merged_vcf)
print("##INFO=<ID=SUPPORT,Number=1,Type=String,Description=\"SV support number in this region\">", file=merged_vcf)
print("##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"SV support number in this region\">", file=merged_vcf)
# print("##INFO=<ID=VAF,Number=1,Type=String,Description=\"SV allele frequency in this region\">", file=merged_vcf)
print("##INFO=<ID=READS,Number=.,Type=String,Description=\"SV support read names in this region\">", file=merged_vcf)

Expand Down Expand Up @@ -527,7 +527,7 @@ def write_results_to_vcf(vcf_out, score_out, region_potential_svtypes, region, r
if 'sigUncovered' in sig_type_stat.keys() and sig_type_stat['sigUncovered'] >= 0.75 * len(sig_types):
filter_type = 'Uncovered'
else:
filter_type = 'Covered'
filter_type = 'PASS'


for i in range(len(all_sv_types)):
Expand Down
2 changes: 1 addition & 1 deletion src/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.3.8"
__version__ = "1.4"

0 comments on commit bb310b0

Please sign in to comment.