amakunin
diff --git a/‎src/DNAcopy_1.48.0.tar.gz
327 KB b/‎src/DNAcopy_1.48.0.tar.gz
327 KB
diff --git a/‎tools/contam_filter/.shed.yml
Lines changed: 8 additions & 0 deletions b/‎tools/contam_filter/.shed.yml
Lines changed: 8 additions & 0 deletions
diff --git a/‎tools/contam_filter/contam_filter.py
Lines changed: 68 additions & 0 deletions b/‎tools/contam_filter/contam_filter.py
Lines changed: 68 additions & 0 deletions
diff --git a/‎tools/contam_filter/contam_filter.xml
Lines changed: 28 additions & 0 deletions b/‎tools/contam_filter/contam_filter.xml
Lines changed: 28 additions & 0 deletions
diff --git a/‎tools/contam_filter/test-data/1.bam
2.24 KB b/‎tools/contam_filter/test-data/1.bam
2.24 KB
diff --git a/‎tools/contam_filter/test-data/test.hg19.ns.bam
2.36 KB b/‎tools/contam_filter/test-data/test.hg19.ns.bam
2.36 KB
diff --git a/‎tools/contam_filter/test-data/test.mm10.filter.bam
1.72 KB b/‎tools/contam_filter/test-data/test.mm10.filter.bam
1.72 KB
diff --git a/‎tools/contam_filter/test-data/test.mm10.ns.bam
2.24 KB b/‎tools/contam_filter/test-data/test.mm10.ns.bam
2.24 KB
diff --git a/‎tools/region_dnacopy/.shed.yml
Lines changed: 9 additions & 0 deletions b/‎tools/region_dnacopy/.shed.yml
Lines changed: 9 additions & 0 deletions
diff --git a/‎tools/region_dnacopy/region_dnacopy.R
Lines changed: 175 additions & 0 deletions b/‎tools/region_dnacopy/region_dnacopy.R
Lines changed: 175 additions & 0 deletions
@@ -0,0 +1,8 @@
+name: contam_filter
+owner: drevoz
+description: Filter contaminant reads
+long_description: Python script to remove contaminant reads of known origin based on alignment to two genomes - target and contaminant.
+homepage_url: 
+remote_repository_url: 
+categories:
+- SAM
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+import sys
+import tempfile
+import os
+import pysam
+
+
+# argv[1]: minimum MAPQ a read needs in the target alignment to be kept after contamination filtering
+min_qual = int(sys.argv[1])
+
+# argv[2], argv[3]: input BAMs, alignments against the target genome and the contaminant genome
+t_filename = sys.argv[2]
+c_filename = sys.argv[3]
+
+# argv[4]: output path of the filtered BAM (the two additional outputs below are disabled)
+filter_filename = sys.argv[4]
+#contam_filename = sys.argv[5]
+#unmap_filename = sys.argv[6]
+
+# Name-sort both inputs into temporary BAM files.
+# NamedTemporaryFile only reserves a unique path here; each handle is closed as soon as its name is taken.
+with tempfile.NamedTemporaryFile(suffix = '_t.bam', delete=False) as t:
+    tname = t.name
+with tempfile.NamedTemporaryFile(suffix = '_c.bam', delete=False) as c:
+    cname = c.name
+#pysam.sort('-n', '-o', tname, t_filename) # samtools 0.1.19
+#pysam.sort('-n', '-o', cname, c_filename) # samtools 0.1.19
+# The last four characters ('.bam') are cut off each temp path before it is handed to pysam.sort.
+for unsorted_bam, sorted_bam in ((t_filename, tname), (c_filename, cname)):
+    pysam.sort('-n', unsorted_bam, sorted_bam[:-4])
+
+# create intermediate filtered output (the temp file only reserves a path for pysam to write to)
+f = tempfile.NamedTemporaryFile(suffix = '_f.bam', delete=False)
+fname = f.name
+f.close()
+# Both inputs were name-sorted above, so they are read in lockstep; the writer is closed even on error.
+with pysam.AlignmentFile(tname) as tfile, pysam.AlignmentFile(cname) as cfile, \
+        pysam.AlignmentFile(fname,'wb', template=tfile) as filter_file:
+    #contam_file = pysam.AlignmentFile(contam_filename,'wb', template=cfile)
+    #unmap_file = pysam.AlignmentFile(unmap_filename,'wb', template=tfile)
+    i = 0
+    for tread in tfile:
+        cread = next(cfile)  # builtin next() works on Python 2 and 3; the .next() method is Python 2 only
+        if tread.query_name != cread.query_name:  # explicit check: a bare assert is skipped under python -O
+            sys.exit('Read name mismatch between inputs: %s vs %s'%(tread.query_name, cread.query_name))
+        if tread.mapping_quality >= min_qual and tread.mapping_quality >= cread.mapping_quality:
+            i += 1
+            filter_file.write(tread)
+        #if tread.mapping_quality < min_qual: # unmapped
+        #    unmap_file.write(tread) # output mapping for target genome
+        #elif tread.mapping_quality < cread.mapping_quality: # contamination
+        #    contam_file.write(cread) # output mapping for contamination genome
+        #else: #target
+        #    filter_file.write(tread)
+    #contam_file.close()
+    #unmap_file.close()
+
+# sort by coordinate, writing next to the output path: os.rename can fail when source and target are on different filesystems
+pysam.sort(fname, filter_filename+'_srt')
+os.rename(filter_filename+'_srt.bam', filter_filename)
+sys.stdout.write('Writing %d alignments to filtered output file %s\n'%(i, filter_filename))
+#sys.stdout.write('3 output files: filtered %s, contamination %s, unmapped %s\n'%(filter_filename, contam_filename, unmap_filename))
+
+#clean-up of the two name-sorted temporary inputs and the unsorted intermediate output
+os.unlink(tname)
+os.unlink(cname)
+os.unlink(fname)
@@ -0,0 +1,28 @@
+<tool id="contam_filter" name="Filter contaminant reads" version="0.1.0">
+    <requirements> <!-- NOTE(review): contam_filter.py imports pysam, but no requirement is declared for it -->
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <command interpreter="python"><![CDATA[
+        contam_filter.py $minq "$input1" "$input2" "$output1"
+    ]]></command>
+    <inputs>
+        <param type="integer" name="minq" value="20" min="0" label="Minimum mapping quality in filtered output" help="By decreasing this value, you will include repetitive and low-quality alignments into downstream analysis."/>
+        <param type="data" name="input1" format="bam" label="Alignment to target genome" help="BAM of the reads aligned to the genome of interest."/>
+        <param type="data" name="input2" format="bam" label="Alignment to contaminant genome" help="BAM of the same reads aligned to the contaminant genome."/>
+    </inputs>
+    <outputs>
+        <data name="output1" format="bam" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="test.mm10.ns.bam"/>
+            <param name="input2" value="test.hg19.ns.bam"/>
+            <output name="output1" file="test.mm10.filter.bam"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+        This tool filters contaminant reads based on BAM alignments of the same reads to target and contaminant genomes. A read is kept when its mapping quality in the target alignment is at least the minimum and not lower than in the contaminant alignment.
+    ]]></help>
+</tool>
@@ -0,0 +1,9 @@
+name: region_dnacopy
+owner: drevoz
+description: Finds regions present on chromosomes
+long_description: R script to find regions present on sequenced chromosomes. Underlying DNAcopy package included.
+homepage_url: 
+remote_repository_url: 
+categories:
+- Genomic Interval Operations
+type: unrestricted
@@ -0,0 +1,175 @@
+#!/usr/bin/Rscript
+
+# From positions bed file generate bed-like file with regions and their statistics and pdf plot for all chromosomes listed in .sizes file. 
+# Steps:
+# Calculate pairwise distances between positions
+# Call regions based on distances to the position on the left
+# Plot chromosomes to '.reg.pdf' file
+# Correct regions - shift one position to the left if seg.mean.left>seg.mean.right
+# Calculate statistics of distances and coverage for each region
+# Write resulting table to '.reg.tsv' file
+
+# Argument1 - '.pos.bed' file with positions
+# Argument2 - '.sizes' file with sizes of chromosomes for which regions are called
+# Argument3 - '.pdf' with regions
+# Argument4 - '.tsv' with regions; Argument5 - sample name
+
+# Package installation: load DNAcopy, trying a one-off Bioconductor install if it is missing
+if(require("DNAcopy")){
+  print("DNAcopy is loaded correctly")
+} else {
+  print("trying to install DNAcopy")
+  source("http://bioconductor.org/biocLite.R")
+  biocLite("DNAcopy") # DNAcopy is a Bioconductor package; install.packages() only looks in the default (CRAN) repos
+  if(require("DNAcopy")){
+    print("DNAcopy installed and loaded")
+  } else {
+    stop("could not install DNAcopy")
+  }
+}
+
+# Parameter section: five positional command-line arguments, then fixed settings
+args <- commandArgs(trailingOnly = TRUE)
+pos_file <- args[1] # bed file with positions, additional (4th) column with coverage
+size_file <- args[2] # file with chromosome sizes (column 1 = name, column 2 = size)
+reg_pdf_file <- args[3] # output chromosome plot
+reg_tsv_file <- args[4] # output table
+sample <- args[5] # sample name, passed to CNA() as sampleid
+
+pdf_width <- 20
+pdf_height <- 20
+plot.type <- 's' # 'w' for whole genome in one picture
+left_dist <- TRUE # Calculate distance to left position. Correction for right dist not implemented.
+draw_plot <- TRUE # FALSE to skip plotting
+max_chr_plot <- 42 # maximum number of chromosomes to plot - 7*6 panels on the 20*20 inch page
+log_dist <- TRUE # regions are based on log10(pairwise distances); NOTE(review): this flag is never read below
+smooth <- TRUE # remove outlier values (runs smooth.CNA before segmentation)
+
+# Inputs: both files are read without a header, so columns are addressed as V1, V2, ...
+pos_df <- read.table(pos_file)
+size_df <- read.table(size_file)
+
+
+# Calculate distances between neighbouring positions (input presumably sorted by start within each
+# chromosome - TODO confirm). Leaves only positions on chromosomes listed in .sizes file
+flt_pos_list <- by(size_df, seq_len(nrow(size_df)), function(chr_data) { # iterate over rows in size_df
+  chr_pos_df <- subset(pos_df, V1 == as.character(chr_data$V1)) # positions on this chromosome
+  if (left_dist) {
+    # dist to position on the left
+    r_start <- chr_pos_df$V2 # start coordinates of all positions
+    l_end <- c(0, head(chr_pos_df$V3, -1)) # end of prev position, 0 for first; head() also copes with 0 or 1 rows
+  } else {
+    # dist to position on the right. Not supported in downstream analysis.
+    r_start <- c(chr_pos_df$V2[-1], chr_data$V2) # start coordinates of next position, end of chromosome for last
+    l_end <- chr_pos_df$V3 # end coordinates of all positions
+  }
+  chr_pos_df$V5 <- r_start - l_end # add distance to df
+  chr_pos_df
+})
+flt_pos_df <- do.call('rbind', flt_pos_list) # convert list of df's to df
+flt_pos_df$V1 <- factor(flt_pos_df$V1) # remove unused chromosomes from factor levels
+flt_pos_df$V6 <- log10(flt_pos_df$V5) # NOTE(review): -Inf/NaN when a distance is <= 0 (start at 0 or overlapping positions)
+flt_pos_df$V7 <- gsub('chr', '', flt_pos_df$V1) # gsub is vectorised over its input, no mapply needed
+colnames(flt_pos_df) <- c('chrom','start','end','cov','l.dist','log10.l.dist','num.chrom')
+
+# Plot to pdf: chromosome names with stripped chr
+# *correct chromosome sort order, sample name, maploc: need to modify DNAcopy.plot function
+if (nrow(size_df) > max_chr_plot) {
+  draw_plot <- FALSE
+  cat('Number of chromosomes in plot exceeds ', max_chr_plot, '. Plotting will be disabled.\n', sep = "")
+}
+if (draw_plot) {
+  CNA.object <- CNA(flt_pos_df$log10.l.dist,flt_pos_df$num.chrom,flt_pos_df$start,
+                    data.type = 'logratio',sampleid = sample)
+  if (smooth) {CNA.object <- smooth.CNA(CNA.object)}
+  segment.CNA.object <- segment(CNA.object, verbose=1)
+  pdf(reg_pdf_file, width = pdf_width, height = pdf_height)
+  plot(segment.CNA.object, plot.type=plot.type, xmaploc=TRUE, # TRUE rather than T, which can be reassigned
+       ylim=c(0,max(CNA.object[,3], na.rm=TRUE))) # 'w' type for all chromosomes; na.rm keeps the limit defined if values are missing
+  dev.off()
+}
+
+# Call regions: chromosome names as initially (the plotting pass above used the 'chr'-stripped names)
+CNA.object <- CNA(flt_pos_df$log10.l.dist,flt_pos_df$chrom,flt_pos_df$start,
+                  data.type = 'logratio',sampleid = sample)
+if (smooth) {CNA.object <- smooth.CNA(CNA.object)}
+segment.CNA.object <- segment(CNA.object, verbose=1)
+out_regions <- segment.CNA.object$output
+
+# Clean-up: rename cols
+names(out_regions)[3] <- "reg.start"
+names(out_regions)[4] <- "reg.end"
+#names(out_regions)[5] <- "positions"
+names(out_regions)[6] <- "log10.mean"
+# remove unneeded cols 1 and 5 (ID, num.mark); column 6 (seg.mean) is kept under its new name log10.mean
+corr_out_regions <- out_regions[,-c(1,5)]
+
+# reg.end - change values from beginning to end of position. Looked up column-wise, because
+# apply() on a data.frame would first convert the coordinates to formatted character strings.
+corr_out_regions$reg.end <- mapply(function(chr, pos_start) {
+  flt_pos_df$end[flt_pos_df$chrom == chr & flt_pos_df$start == pos_start]
+}, as.character(corr_out_regions$chrom), corr_out_regions$reg.end, USE.NAMES = FALSE)
+
+# Correct for l distance: shift one position to the left, if seg.mean.left>seg.mean.right
+# *check for skipped positions between regions
+n_reg <- nrow(corr_out_regions)
+corr_se_matrix <- matrix(NA_real_, nrow = n_reg, ncol = 2) # preallocated; column 1 = start, column 2 = end
+for (i in seq_len(n_reg)) { # seq_len() runs zero times for an empty table, unlike 1:nrow()
+  # start of first position in segment
+  reg_chrom <- as.character(corr_out_regions[i,'chrom'])
+  reg_start <- as.numeric(corr_out_regions[i,'reg.start'])
+  reg_end <- as.numeric(corr_out_regions[i,'reg.end'])
+  reg_log10_mean <- corr_out_regions[i,'log10.mean'] # assumed numeric
+  start_index <- which(flt_pos_df$chrom == reg_chrom & flt_pos_df$start == reg_start)
+  # end of last position in segment
+  end_index <- which(flt_pos_df$chrom == reg_chrom & flt_pos_df$end == reg_end)
+  # correction (scalar ifelse() is kept on purpose: it always yields exactly one value per boundary)
+  corr_start <- ifelse(i == 1 || corr_out_regions[i-1,'chrom']!=reg_chrom, # start of chr
+                       reg_start,
+                       ifelse(corr_out_regions[i-1,'log10.mean']>reg_log10_mean, # from lower to higher density
+                              flt_pos_df[start_index-1,'start'], # only case to correct
+                              reg_start))
+  corr_end <- ifelse(i == n_reg || reg_chrom!=corr_out_regions[i+1,'chrom'], # end of chr
+                     reg_end,
+                     ifelse(reg_log10_mean>corr_out_regions[i+1,'log10.mean'], # from lower to higher density?
+                            flt_pos_df[end_index-1,'end'], # only case to correct
+                            reg_end))
+  corr_se_matrix[i, ] <- c(corr_start, corr_end)
+}
+corr_out_regions$reg.start <- corr_se_matrix[,1]
+corr_out_regions$reg.end <- corr_se_matrix[,2]
+corr_out_regions$reg.size <- corr_out_regions$reg.end - corr_out_regions$reg.start
+
+# Calculate additional statistics for each region: number of markers, pd and read coverage mean and sd, pos sizes
+add_stats <- vapply(seq_len(nrow(corr_out_regions)), function(k) { # by row index: coordinates stay numeric
+  # Subset of positions for each region
+  flt_pos_sub <- subset(flt_pos_df, chrom==as.character(corr_out_regions$chrom[k])
+                        & start>=corr_out_regions$reg.start[k]
+                        & end<=corr_out_regions$reg.end[k])
+  # distances between positions within region - all but first; NA when the region has fewer than 2 positions
+  in_dist <- if (nrow(flt_pos_sub) > 1) flt_pos_sub$l.dist[-1] else NA_real_
+  # chrom size from .sizes file (first matching row; NA if the chromosome is not listed)
+  chr_size <- size_df$V2[as.character(size_df$V1)==as.character(corr_out_regions$chrom[k])][1]
+  c(nrow(flt_pos_sub), # 1 number of positions
+    mean(in_dist),sd(in_dist), # 2 mean and 3 sd of pairwise distances between positions inside region
+    mean(flt_pos_sub$cov),sd(flt_pos_sub$cov), # 4 mean and 5 sd coverage (actually, number of reads within positions)
+    chr_size, # 6 size of chromosomes
+    mean(flt_pos_sub$end-flt_pos_sub$start),sum(flt_pos_sub$end-flt_pos_sub$start) # 7 mean and 8 total size of positions within the region
+    )
+}, numeric(8))
+# add to output: one column per row of add_stats, in row order;
+# the value is the number of digits to round to, NA meaning "store unrounded"
+stat_digits <- c(positions = NA, pd.mean = 0, pd.sd = 0, cov.mean = 2, cov.sd = 2,
+                 chrom.size = NA, pos.bp.mean = 0, pos.bp.total = NA)
+for (s in seq_along(stat_digits)) {
+  stat_row <- add_stats[s,]
+  if (!is.na(stat_digits[[s]])) stat_row <- round(stat_row, digits=stat_digits[[s]])
+  corr_out_regions[[names(stat_digits)[s]]] <- stat_row
+}
+corr_out_regions$pos.cov <- corr_out_regions$pos.bp.total/(corr_out_regions$reg.end-corr_out_regions$reg.start)
+corr_out_regions <- corr_out_regions[c('chrom','reg.start','reg.end','reg.size','positions','pd.mean','pd.sd',
+                                       'cov.mean','cov.sd','pos.bp.mean','pos.bp.total','pos.cov','chrom.size','log10.mean')]
+
+# Write tsv file: tab-separated, header row, no row names, no quoting (TRUE/FALSE spelled out; T/F can be reassigned)
+write.table(corr_out_regions,file=reg_tsv_file,quote=FALSE,sep='\t',
+            row.names=FALSE,col.names=TRUE)