1
+ # !/usr/bin/Rscript
2
+
3
+ # From positions bed file generate bed-like file with regions and their statistics and pdf plot for all chromosomes listed in .sizes file.
4
+ # Steps:
5
+ # Calculate pairwise distances between positions
6
+ # Call regions based on distances to the position on the left
7
+ # Plot chromosomes to '.reg.pdf' file
8
+ # Correct regions - shift one position to the left if seg.mean.left>seg.mean.right
9
+ # Calculate statistics of distances and coverage for each region
10
+ # Write resulting table to '.reg.tsv' file
11
+
12
+ # Argument1 - '.pos.bed' file with postions
13
+ # Argument2 - '.sizes' file with sizes of chromosomes for which regions are called
14
+ # Argument3 - '.pdf' with regions
15
+ # Argument4 - '.tsv' with regions
16
+
17
+ # Package installation:
18
+ if (require(" DNAcopy" )){
19
+ print(" DNAcopy is loaded correctly" )
20
+ } else {
21
+ print(" trying to install DNAcopy" )
22
+ source(" http://bioconductor.org/biocLite.R" )
23
+ install.packages(" DNAcopy" )
24
+ if (require(" DNAcopy" )){
25
+ print(" DNAcopy installed and loaded" )
26
+ } else {
27
+ stop(" could not install DNAcopy" )
28
+ }
29
+ }
30
+
31
+ # Parameter section
32
+ args <- commandArgs(trailingOnly = TRUE )
33
+ pos_file <- args [1 ] # bed file with positions, additional column with coverage
34
+ size_file <- args [2 ] # file with chromosome sizes
35
+ reg_pdf_file <- args [3 ] # output chromosome plot
36
+ reg_tsv_file <- args [4 ] # output table
37
+ sample <- args [5 ] # sample name
38
+
39
+ pdf_width <- 20
40
+ pdf_height <- 20
41
+ plot.type <- ' s' # 'w' for whole genome in one picture
42
+ left_dist <- TRUE # Calculate distance to left position. Correction for right dist not implemented.
43
+ draw_plot <- TRUE # FALSE to skip plotting
44
+ max_chr_plot <- 42 # maximum number of chromosomes to do plotting - 7*6 plot 20*20 inch
45
+ log_dist <- TRUE # calculate regions based on log10(pairvise distances)
46
+ smooth <- TRUE # remove outlier values
47
+
48
+ # Inputs
49
+ pos_df <- read.table(pos_file )
50
+ size_df <- read.table(size_file )
51
+
52
+
53
+ # Calculate distances between positions
54
+ # Leaves only positions on chromosomes listed in .sizes file
55
+ flt_pos_list <- by(size_df , 1 : nrow(size_df ), function (chr_data ) { # iterate over rows in size_df
56
+ chr_pos_df <- subset(pos_df , V1 == as.character(chr_data $ V1 )) # chr_data$V1 subset positions in chromosome
57
+ if (left_dist ) {
58
+ # dist to position on the left
59
+ r_start <- chr_pos_df $ V2 # start coordinates of all positions
60
+ l_end <- c(0 , chr_pos_df $ V3 [1 : nrow(chr_pos_df )- 1 ]) # end coordinates of prev position, 0 for first
61
+ } else {
62
+ # dist to position on the right. Not supported in downstream analysis.
63
+ r_start <- c(chr_pos_df $ V2 [2 : nrow(chr_pos_df )],chr_data $ V2 ) # start coordinates of next position, end of chromosome for last
64
+ l_end <- chr_pos_df $ V3 # end coordinates of all positions
65
+ }
66
+ chr_pos_df $ V5 <- r_start - l_end # add distance to df
67
+ chr_pos_df
68
+ })
69
+ flt_pos_df <- do.call(' rbind' , flt_pos_list ) # convert list of df's to df
70
+ flt_pos_df $ V1 <- factor (flt_pos_df $ V1 ) # remove unused chromosomes from factor levels
71
+ flt_pos_df $ V6 <- log10(flt_pos_df $ V5 )
72
+ flt_pos_df $ V7 <- mapply(gsub , ' chr' , ' ' , flt_pos_df $ V1 )
73
+ colnames(flt_pos_df ) <- c(' chrom' ,' start' ,' end' ,' cov' ,' l.dist' ,' log10.l.dist' ,' num.chrom' )
74
+
75
+ # Plot to pdf: chromosome names with stripped chr
76
+ # *correct chromosome sort order, sample name, maploc: need to modify DNAcopy.plot function
77
+ if (nrow(size_df ) > max_chr_plot ) {
78
+ draw_plot <- FALSE
79
+ cat(' Number of chromosomes in plot exceeds ' , max_chr_plot , ' . Plotting will be disabled.' , sep = " " )
80
+ }
81
+ if (draw_plot ) {
82
+ CNA.object <- CNA(flt_pos_df $ log10.l.dist ,flt_pos_df $ num.chrom ,flt_pos_df $ start ,
83
+ data.type = ' logratio' ,sampleid = sample )
84
+ if (smooth ) {CNA.object <- smooth.CNA(CNA.object )}
85
+ segment.CNA.object <- segment(CNA.object , verbose = 1 )
86
+ pdf(reg_pdf_file , width = pdf_width , height = pdf_height )
87
+ plot(segment.CNA.object , plot.type = plot.type , xmaploc = T ,
88
+ ylim = c(0 ,max(CNA.object [,3 ]))) # 'w' type for all chromosomes
89
+ dev.off()
90
+ }
91
+
92
+ # Call regions: chromosome names as initially
93
+ CNA.object <- CNA(flt_pos_df $ log10.l.dist ,flt_pos_df $ chrom ,flt_pos_df $ start ,
94
+ data.type = ' logratio' ,sampleid = sample )
95
+ if (smooth ) {CNA.object <- smooth.CNA(CNA.object )}
96
+ segment.CNA.object <- segment(CNA.object , verbose = 1 )
97
+ out_regions <- segment.CNA.object $ output
98
+
99
+ # Clean-up
100
+ # rename cols
101
+ names(out_regions )[3 ] <- " reg.start"
102
+ names(out_regions )[4 ] <- " reg.end"
103
+ # names(out_regions)[5] <- "positions"
104
+ names(out_regions )[6 ] <- " log10.mean"
105
+ # remove unneeded cols: ID, num.mark, seg.mean
106
+ corr_out_regions <- out_regions [,- c(1 ,5 )]
107
+
108
+ # reg.end - change values from beginning to end of position
109
+ corr_out_regions $ reg.end <- apply(corr_out_regions [,c(' chrom' ,' reg.end' )], 1 , function (y ) { # ugly look-up
110
+ flt_pos_df [flt_pos_df $ chrom == y [' chrom' ] & flt_pos_df $ start == as.numeric(y [' reg.end' ]),]$ end
111
+ })
112
+
113
+ # Correct for l distance: shift one position to the left, if seg.mean.left>seg.mean.right
114
+ # *check for skipped positions between regions
115
+ corr_start_end <- c()
116
+ for (i in 1 : nrow(corr_out_regions )){
117
+ # start of first position in segment
118
+ reg_chrom = as.character(corr_out_regions [i ,' chrom' ])
119
+ reg_start = as.numeric(corr_out_regions [i ,' reg.start' ])
120
+ reg_end = as.numeric(corr_out_regions [i ,' reg.end' ])
121
+ reg_log10_mean = corr_out_regions [i ,' log10.mean' ] # assumed numeric
122
+ start_index <- which(flt_pos_df $ chrom == reg_chrom & flt_pos_df $ start == reg_start )
123
+ # end of last position in segment
124
+ end_index <- which(flt_pos_df $ chrom == reg_chrom & flt_pos_df $ end == reg_end )
125
+ # correction
126
+ corr_start <- ifelse(i == 1 || corr_out_regions [i - 1 ,' chrom' ]!= reg_chrom , # start of chr
127
+ reg_start ,
128
+ ifelse(corr_out_regions [i - 1 ,' log10.mean' ]> reg_log10_mean , # from lower to higher density
129
+ flt_pos_df [start_index - 1 ,' start' ], # only case to correct
130
+ reg_start ))
131
+ corr_end <- ifelse(i == nrow(corr_out_regions ) || reg_chrom != corr_out_regions [i + 1 ,' chrom' ], # end of chr
132
+ reg_end ,
133
+ ifelse(reg_log10_mean > corr_out_regions [i + 1 ,' log10.mean' ], # from lower to higher density?
134
+ flt_pos_df [end_index - 1 ,' end' ], # only case to correct
135
+ reg_end ))
136
+ corr_start_end <- c(corr_start_end , corr_start , corr_end )
137
+ }
138
+ corr_se_matrix <- matrix (corr_start_end , ncol = 2 , byrow = T )
139
+ corr_out_regions $ reg.start <- corr_se_matrix [,1 ]
140
+ corr_out_regions $ reg.end <- corr_se_matrix [,2 ]
141
+ corr_out_regions $ reg.size <- corr_out_regions $ reg.end - corr_out_regions $ reg.start
142
+
143
+ # Calculate additional statistics for each region: number of markers, pd and read coverage mean and sd, pos sizes
144
+ add_stats <- apply(corr_out_regions [,c(' chrom' ,' reg.start' ,' reg.end' )], 1 , function (y ) {
145
+ # Subset of positions for each region
146
+ flt_pos_sub <- subset(flt_pos_df , chrom == y [' chrom' ]
147
+ & start > = as.numeric(y [' reg.start' ])
148
+ & end < = as.numeric(y [' reg.end' ]) )
149
+ # distances between positions within region - all but first
150
+ in_dist <- flt_pos_sub $ l.dist [2 : nrow(flt_pos_sub )]
151
+ # chrom size from .sizes file
152
+ chr_size <- as.numeric(subset(size_df ,V1 == y [' chrom' ]))[2 ]
153
+ return (c(nrow(flt_pos_sub ), # 1 number of positions
154
+ mean(in_dist ),sd(in_dist ), # 2 mean and 3 sd of pairwise distances between positions inside region
155
+ mean(flt_pos_sub $ cov ),sd(flt_pos_sub $ cov ), # 4 mean and 5 sd coverage (actually, number of reads within positions)
156
+ chr_size , # 6 size of chromosomes
157
+ mean(flt_pos_sub $ end - flt_pos_sub $ start ),sum(flt_pos_sub $ end - flt_pos_sub $ start ) # 7 mean and 8 total size of positions within the region
158
+ ))
159
+ })
160
+ # add to output
161
+ corr_out_regions $ positions <- add_stats [1 ,]
162
+ corr_out_regions $ pd.mean <- round(add_stats [2 ,], digits = 0 )
163
+ corr_out_regions $ pd.sd <- round(add_stats [3 ,], digits = 0 )
164
+ corr_out_regions $ cov.mean <- round(add_stats [4 ,], digits = 2 )
165
+ corr_out_regions $ cov.sd <- round(add_stats [5 ,], digits = 2 )
166
+ corr_out_regions $ chrom.size <- add_stats [6 ,]
167
+ corr_out_regions $ pos.bp.mean <- round(add_stats [7 ,], digits = 0 )
168
+ corr_out_regions $ pos.bp.total <- add_stats [8 ,]
169
+ corr_out_regions $ pos.cov <- corr_out_regions $ pos.bp.total / (corr_out_regions $ reg.end - corr_out_regions $ reg.start )
170
+ corr_out_regions <- corr_out_regions [c(' chrom' ,' reg.start' ,' reg.end' ,' reg.size' ,' positions' ,' pd.mean' ,' pd.sd' ,
171
+ ' cov.mean' ,' cov.sd' ,' pos.bp.mean' ,' pos.bp.total' ,' pos.cov' ,' chrom.size' ,' log10.mean' )]
172
+
173
+ # Write tsv file
174
+ write.table(corr_out_regions ,file = reg_tsv_file ,quote = F ,sep = ' \t ' ,
175
+ row.names = F ,col.names = T )
0 commit comments