Skip to content
This repository was archived by the owner on Nov 13, 2024. It is now read-only.

Commit b5191b2

Browse files
committed
Initial versions of contam and region tools
1 parent 3a4544e commit b5191b2

File tree

16 files changed

+1374
-0
lines changed

16 files changed

+1374
-0
lines changed

src/DNAcopy_1.48.0.tar.gz

327 KB
Binary file not shown.

tools/contam_filter/.shed.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
name: contam_filter
2+
owner: drevoz
3+
description: Filter contaminant reads
4+
long_description: Python script to remove contaminant reads of known origin based on alignment to two genomes - target and contaminant.
5+
homepage_url:
6+
remote_repository_url:
7+
categories:
8+
- SAM

tools/contam_filter/contam_filter.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/usr/bin/env python
2+
3+
import sys
4+
import tempfile
5+
import os
6+
import pysam
7+
8+
9+
# minimum MAPQ of the read retained after contamination filtering
10+
min_qual = int(sys.argv[1])
11+
12+
# inputs: bams vs target and contam genomes
13+
t_filename = sys.argv[2]
14+
c_filename = sys.argv[3]
15+
16+
# output: filtered bam alignment
17+
filter_filename = sys.argv[4]
18+
#contam_filename = sys.argv[5]
19+
#unmap_filename = sys.argv[6]
20+
21+
# sort inputs by read name in temp
22+
t = tempfile.NamedTemporaryFile(suffix = '_t.bam', delete=False)
23+
c = tempfile.NamedTemporaryFile(suffix = '_c.bam', delete=False)
24+
tname = t.name
25+
cname = c.name
26+
t.close()
27+
c.close()
28+
#pysam.sort('-n', '-o', tname, t_filename) # samtools 0.1.19
29+
#pysam.sort('-n', '-o', cname, c_filename) # samtools 0.1.19
30+
pysam.sort('-n', t_filename, tname[:-4])
31+
pysam.sort('-n', c_filename, cname[:-4])
32+
33+
# create intermediate filtered output
34+
f = tempfile.NamedTemporaryFile(suffix = '_f.bam', delete=False)
35+
fname = f.name
36+
f.close()
37+
38+
with pysam.AlignmentFile(tname) as tfile, pysam.AlignmentFile(cname) as cfile:
39+
filter_file = pysam.AlignmentFile(fname,'wb', template=tfile)
40+
#contam_file = pysam.AlignmentFile(contam_filename,'wb', template=cfile)
41+
#unmap_file = pysam.AlignmentFile(unmap_filename,'wb', template=tfile)
42+
i = 0
43+
for tread in tfile:
44+
cread = cfile.next()
45+
assert tread.query_name == cread.query_name
46+
if tread.mapping_quality >= min_qual and tread.mapping_quality >= cread.mapping_quality:
47+
i += 1
48+
filter_file.write(tread)
49+
#if tread.mapping_quality < min_qual: # unmapped
50+
# unmap_file.write(tread) # output mapping for target genome
51+
#elif tread.mapping_quality < cread.mapping_quality: # contamination
52+
# contam_file.write(cread) # output mapping for contamination genome
53+
#else: #target
54+
# filter_file.write(tread)
55+
filter_file.close()
56+
#contam_file.close()
57+
#unmap_file.close()
58+
59+
# sort
60+
pysam.sort(fname, fname[:-4]+'_srt')
61+
os.rename(fname[:-4]+'_srt.bam', filter_filename)
62+
sys.stdout.write('Writing %d alignments to filtered output file %s\n'%(i, filter_filename))
63+
#sys.stdout.write('3 output files: filtered %s, contamination %s, unmapped %s\n'%(filter_filename, contam_filename, unmap_filename))
64+
65+
#clean-up
66+
os.unlink(tname)
67+
os.unlink(cname)
68+
os.unlink(fname)

tools/contam_filter/contam_filter.xml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
<tool id="contam_filter" name="Filter contaminant reads" version="0.1.0">
2+
<requirements>
3+
</requirements>
4+
<stdio>
5+
<exit_code range="1:" />
6+
</stdio>
7+
<command interpreter="python"><![CDATA[
8+
contam_filter.py $minq "$input1" "$input2" "$output1"
9+
]]></command>
10+
<inputs>
11+
<param type="integer" name="minq" value="20" min="0" label="Minimum mapping quality in filtered output" help="By decreasing this value, you will include repetitive and low-quality aligments into downstream analysis."/>
12+
<param type="data" name="input1" format="bam" />
13+
<param type="data" name="input2" format="bam" />
14+
</inputs>
15+
<outputs>
16+
<data name="output1" format="bam" />
17+
</outputs>
18+
<tests>
19+
<test>
20+
<param name="input1" value="test.mm10.ns.bam"/>
21+
<param name="input2" value="test.hg19.ns.bam"/>
22+
<output name="output1" file="test.mm10.filter.bam"/>
23+
</test>
24+
</tests>
25+
<help><![CDATA[
26+
This tool filters contaminant reads based on BAM alignments of same reads to target and contaminant genomes.
27+
]]></help>
28+
</tool>

tools/contam_filter/test-data/1.bam

2.24 KB
Binary file not shown.
2.36 KB
Binary file not shown.
Binary file not shown.
2.24 KB
Binary file not shown.

tools/region_dnacopy/.shed.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
name: region_dnacopy
2+
owner: drevoz
3+
description: Finds regions present on chromosomes
4+
long_description: R script to find regions present on sequenced chromosomes. Underlying DNAcopy package included.
5+
homepage_url:
6+
remote_repository_url:
7+
categories:
8+
- Genomic Interval Operations
9+
type: unrestricted

tools/region_dnacopy/region_dnacopy.R

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
#!/usr/bin/Rscript
2+
3+
# From positions bed file generate bed-like file with regions and their statistics and pdf plot for all chromosomes listed in .sizes file.
4+
# Steps:
5+
# Calculate pairwise distances between positions
6+
# Call regions based on distances to the position on the left
7+
# Plot chromosomes to '.reg.pdf' file
8+
# Correct regions - shift one position to the left if seg.mean.left>seg.mean.right
9+
# Calculate statistics of distances and coverage for each region
10+
# Write resulting table to '.reg.tsv' file
11+
12+
# Argument1 - '.pos.bed' file with postions
13+
# Argument2 - '.sizes' file with sizes of chromosomes for which regions are called
14+
# Argument3 - '.pdf' with regions
15+
# Argument4 - '.tsv' with regions
16+
17+
# Package installation:
18+
if(require("DNAcopy")){
19+
print("DNAcopy is loaded correctly")
20+
} else {
21+
print("trying to install DNAcopy")
22+
source("http://bioconductor.org/biocLite.R")
23+
install.packages("DNAcopy")
24+
if(require("DNAcopy")){
25+
print("DNAcopy installed and loaded")
26+
} else {
27+
stop("could not install DNAcopy")
28+
}
29+
}
30+
31+
# Parameter section
32+
args <- commandArgs(trailingOnly = TRUE)
33+
pos_file <- args[1] # bed file with positions, additional column with coverage
34+
size_file <- args[2] # file with chromosome sizes
35+
reg_pdf_file <- args[3] # output chromosome plot
36+
reg_tsv_file <- args[4] # output table
37+
sample <- args[5] # sample name
38+
39+
pdf_width <- 20
40+
pdf_height <- 20
41+
plot.type <- 's' # 'w' for whole genome in one picture
42+
left_dist <- TRUE # Calculate distance to left position. Correction for right dist not implemented.
43+
draw_plot <- TRUE # FALSE to skip plotting
44+
max_chr_plot <- 42 # maximum number of chromosomes to do plotting - 7*6 plot 20*20 inch
45+
log_dist <- TRUE # calculate regions based on log10(pairvise distances)
46+
smooth <- TRUE # remove outlier values
47+
48+
# Inputs
49+
pos_df <- read.table(pos_file)
50+
size_df <- read.table(size_file)
51+
52+
53+
# Calculate distances between positions
54+
# Leaves only positions on chromosomes listed in .sizes file
55+
flt_pos_list <- by(size_df, 1:nrow(size_df), function(chr_data) { # iterate over rows in size_df
56+
chr_pos_df <- subset(pos_df, V1 == as.character(chr_data$V1)) # chr_data$V1 subset positions in chromosome
57+
if (left_dist) {
58+
# dist to position on the left
59+
r_start <- chr_pos_df$V2 # start coordinates of all positions
60+
l_end <- c(0, chr_pos_df$V3[1:nrow(chr_pos_df)-1]) # end coordinates of prev position, 0 for first
61+
} else {
62+
# dist to position on the right. Not supported in downstream analysis.
63+
r_start <- c(chr_pos_df$V2[2:nrow(chr_pos_df)],chr_data$V2) # start coordinates of next position, end of chromosome for last
64+
l_end <- chr_pos_df$V3 # end coordinates of all positions
65+
}
66+
chr_pos_df$V5 <- r_start-l_end # add distance to df
67+
chr_pos_df
68+
})
69+
flt_pos_df <- do.call('rbind', flt_pos_list) # convert list of df's to df
70+
flt_pos_df$V1 <- factor(flt_pos_df$V1) # remove unused chromosomes from factor levels
71+
flt_pos_df$V6 <- log10(flt_pos_df$V5)
72+
flt_pos_df$V7 <- mapply(gsub, 'chr', '', flt_pos_df$V1)
73+
colnames(flt_pos_df) <- c('chrom','start','end','cov','l.dist','log10.l.dist','num.chrom')
74+
75+
# Plot to pdf: chromosome names with stripped chr
76+
# *correct chromosome sort order, sample name, maploc: need to modify DNAcopy.plot function
77+
if (nrow(size_df) > max_chr_plot) {
78+
draw_plot <- FALSE
79+
cat('Number of chromosomes in plot exceeds ', max_chr_plot, '. Plotting will be disabled.', sep = "")
80+
}
81+
if (draw_plot) {
82+
CNA.object <- CNA(flt_pos_df$log10.l.dist,flt_pos_df$num.chrom,flt_pos_df$start,
83+
data.type = 'logratio',sampleid = sample)
84+
if (smooth) {CNA.object <- smooth.CNA(CNA.object)}
85+
segment.CNA.object <- segment(CNA.object, verbose=1)
86+
pdf(reg_pdf_file, width = pdf_width, height = pdf_height)
87+
plot(segment.CNA.object, plot.type=plot.type, xmaploc=T,
88+
ylim=c(0,max(CNA.object[,3]))) # 'w' type for all chromosomes
89+
dev.off()
90+
}
91+
92+
# Call regions: chromosome names as initially
93+
CNA.object <- CNA(flt_pos_df$log10.l.dist,flt_pos_df$chrom,flt_pos_df$start,
94+
data.type = 'logratio',sampleid = sample)
95+
if (smooth) {CNA.object <- smooth.CNA(CNA.object)}
96+
segment.CNA.object <- segment(CNA.object, verbose=1)
97+
out_regions <- segment.CNA.object$output
98+
99+
# Clean-up
100+
# rename cols
101+
names(out_regions)[3] <- "reg.start"
102+
names(out_regions)[4] <- "reg.end"
103+
#names(out_regions)[5] <- "positions"
104+
names(out_regions)[6] <- "log10.mean"
105+
# remove unneeded cols: ID, num.mark, seg.mean
106+
corr_out_regions <- out_regions[,-c(1,5)]
107+
108+
# reg.end - change values from beginning to end of position
109+
corr_out_regions$reg.end <- apply(corr_out_regions[,c('chrom','reg.end')], 1, function(y) { # ugly look-up
110+
flt_pos_df[flt_pos_df$chrom==y['chrom'] & flt_pos_df$start == as.numeric(y['reg.end']),]$end
111+
})
112+
113+
# Correct for l distance: shift one position to the left, if seg.mean.left>seg.mean.right
114+
# *check for skipped positions between regions
115+
corr_start_end <- c()
116+
for (i in 1:nrow(corr_out_regions)){
117+
# start of first position in segment
118+
reg_chrom = as.character(corr_out_regions[i,'chrom'])
119+
reg_start = as.numeric(corr_out_regions[i,'reg.start'])
120+
reg_end = as.numeric(corr_out_regions[i,'reg.end'])
121+
reg_log10_mean = corr_out_regions[i,'log10.mean'] # assumed numeric
122+
start_index <- which(flt_pos_df$chrom == reg_chrom & flt_pos_df$start == reg_start)
123+
# end of last position in segment
124+
end_index <- which(flt_pos_df$chrom == reg_chrom & flt_pos_df$end == reg_end)
125+
# correction
126+
corr_start <- ifelse(i == 1 || corr_out_regions[i-1,'chrom']!=reg_chrom, # start of chr
127+
reg_start,
128+
ifelse(corr_out_regions[i-1,'log10.mean']>reg_log10_mean, # from lower to higher density
129+
flt_pos_df[start_index-1,'start'], # only case to correct
130+
reg_start))
131+
corr_end <- ifelse(i == nrow(corr_out_regions) || reg_chrom!=corr_out_regions[i+1,'chrom'], # end of chr
132+
reg_end,
133+
ifelse(reg_log10_mean>corr_out_regions[i+1,'log10.mean'], # from lower to higher density?
134+
flt_pos_df[end_index-1,'end'], # only case to correct
135+
reg_end))
136+
corr_start_end <- c(corr_start_end, corr_start, corr_end)
137+
}
138+
corr_se_matrix <- matrix(corr_start_end, ncol=2, byrow=T)
139+
corr_out_regions$reg.start <- corr_se_matrix[,1]
140+
corr_out_regions$reg.end <- corr_se_matrix[,2]
141+
corr_out_regions$reg.size <- corr_out_regions$reg.end - corr_out_regions$reg.start
142+
143+
# Calculate additional statistics for each region: number of markers, pd and read coverage mean and sd, pos sizes
144+
add_stats <- apply(corr_out_regions[,c('chrom','reg.start','reg.end')], 1, function(y) {
145+
# Subset of positions for each region
146+
flt_pos_sub <- subset(flt_pos_df, chrom==y['chrom']
147+
& start>=as.numeric(y['reg.start'])
148+
& end<=as.numeric(y['reg.end']) )
149+
# distances between positions within region - all but first
150+
in_dist <- flt_pos_sub$l.dist[2:nrow(flt_pos_sub)]
151+
# chrom size from .sizes file
152+
chr_size <- as.numeric(subset(size_df,V1==y['chrom']))[2]
153+
return(c(nrow(flt_pos_sub), # 1 number of positions
154+
mean(in_dist),sd(in_dist), # 2 mean and 3 sd of pairwise distances between positions inside region
155+
mean(flt_pos_sub$cov),sd(flt_pos_sub$cov), # 4 mean and 5 sd coverage (actually, number of reads within positions)
156+
chr_size, # 6 size of chromosomes
157+
mean(flt_pos_sub$end-flt_pos_sub$start),sum(flt_pos_sub$end-flt_pos_sub$start) # 7 mean and 8 total size of positions within the region
158+
))
159+
})
160+
# add to output
161+
corr_out_regions$positions <- add_stats[1,]
162+
corr_out_regions$pd.mean <- round(add_stats[2,], digits=0)
163+
corr_out_regions$pd.sd <- round(add_stats[3,], digits=0)
164+
corr_out_regions$cov.mean <- round(add_stats[4,], digits=2)
165+
corr_out_regions$cov.sd <- round(add_stats[5,], digits=2)
166+
corr_out_regions$chrom.size <- add_stats[6,]
167+
corr_out_regions$pos.bp.mean <- round(add_stats[7,], digits=0)
168+
corr_out_regions$pos.bp.total <- add_stats[8,]
169+
corr_out_regions$pos.cov <- corr_out_regions$pos.bp.total/(corr_out_regions$reg.end-corr_out_regions$reg.start)
170+
corr_out_regions <- corr_out_regions[c('chrom','reg.start','reg.end','reg.size','positions','pd.mean','pd.sd',
171+
'cov.mean','cov.sd','pos.bp.mean','pos.bp.total','pos.cov','chrom.size','log10.mean')]
172+
173+
# Write tsv file
174+
write.table(corr_out_regions,file=reg_tsv_file,quote=F,sep='\t',
175+
row.names=F,col.names=T)

0 commit comments

Comments
 (0)