Skip to content

Commit 93ec0ba

Browse files
committed
Added paired-end support with split reads
1 parent 5972521 commit 93ec0ba

File tree

14 files changed

+526
-139
lines changed

14 files changed

+526
-139
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ BUILD_DATE := "$(shell date)"
44
CC=gcc
55
CFLAGS = -O3 -g -I htslib -I sonic -DSVDEPTH_VERSION=\"$(SVDEPTH_VERSION)\" -DBUILD_DATE=\"$(BUILD_DATE)\" -DSVDEPTH_UPDATE=\"$(SVDEPTH_UPDATE)\"
66
LDFLAGS = htslib/libhts.a sonic/libsonic.a -lz -lm -lpthread -llzma -lbz2 -lcurl
7-
SOURCES = svdepth.c cmdline.c common.c bam_data.c read_distribution.c free.c likelihood.c svs.c
7+
SOURCES = svdepth.c cmdline.c common.c bam_data.c read_distribution.c free.c likelihood.c svs.c split_read.c
88
OBJECTS = $(SOURCES:.c=.o)
99
EXECUTABLE = svdepth
1010
INSTALLPATH = /usr/local/bin/

bam_data.c

Lines changed: 169 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,46 +2,173 @@
22
#include <math.h>
33
#include "free.h"
44
#include "likelihood.h"
5+
#include "split_read.h"
56

7+
struct SplitsInfo *all_split_reads = NULL;
68

7-
void get_sample_name(bam_info* in_bam, char* header_text)
9+
10+
SplitRow *createSplitRow(int locMapLeftStart, int locMapLeftEnd, char orientationLeft,
11+
int locMapRightStart, int locMapRightEnd, char orientationRight, char svType)
812
{
9-
char *tmp_header = NULL;
10-
set_str( &( tmp_header), header_text);
11-
char* p = strtok( tmp_header, "\t\n");
12-
char sample_name_buffer[1024];
13+
SplitRow *newRow = (SplitRow*) getMem(sizeof(SplitRow));
14+
15+
newRow->next = NULL;
16+
17+
newRow->locMapLeftEnd = locMapLeftEnd;
18+
newRow->locMapLeftStart = locMapLeftStart;
19+
newRow->locMapRightStart = locMapRightStart;
20+
newRow->locMapRightEnd = locMapRightEnd;
21+
22+
newRow->orientationLeft = orientationLeft;
23+
newRow->orientationRight = orientationRight;
24+
25+
newRow->svType = svType;
26+
27+
return newRow;
28+
}
29+
30+
SplitRow *determine_SvType( splitRead *ptrSplitRead, posMapSplitRead *ptrPosMapSplit)
31+
{
32+
int pos1_1, pos1_2, pos2_1, pos2_2;
33+
34+
/* Length of read A(left) and read B(right) */
35+
int lengthRead, lengthSplit;
36+
37+
/* If soft clip is at the end */
38+
lengthSplit = ptrSplitRead->read_length - ptrSplitRead->split_start;
39+
lengthRead = ptrSplitRead->read_length - lengthSplit;
1340

14-
while( p != NULL)
41+
if( ptrSplitRead->pos < ptrPosMapSplit->posMap)
1542
{
16-
/* If the current token has "SM" as the first two characters,
17-
we have found our Sample Name */
18-
if( p[0] == 'S' && p[1] == 'M')
19-
{
20-
/* Get the Sample Name */
21-
strncpy( sample_name_buffer, p + 3, strlen( p) - 3);
43+
pos1_1 = ptrSplitRead->pos;
44+
pos1_2 = ptrSplitRead->pos + lengthRead;
45+
pos2_1 = ptrPosMapSplit->posMap;
46+
pos2_2 = ptrPosMapSplit->posMap + lengthSplit;
47+
}
48+
else if( ptrPosMapSplit->posMap < ptrSplitRead->pos)
49+
{
50+
pos1_1 = ptrPosMapSplit->posMap;
51+
pos1_2 = ptrPosMapSplit->posMap + lengthSplit;
52+
pos2_1 = ptrSplitRead->pos;
53+
pos2_2 = ptrSplitRead->pos + lengthRead;
54+
}
2255

23-
/* Add the NULL terminator */
24-
sample_name_buffer[strlen( p) - 3] = '\0';
56+
if( pos1_2 >= pos2_1)
57+
return NULL;
2558

26-
/* Exit loop */
27-
break;
59+
if( ptrSplitRead->orient == FORWARD && ptrPosMapSplit->orient == FORWARD)
60+
{
61+
if( ( ptrSplitRead->pos < ptrPosMapSplit->posMap))
62+
{
63+
SplitRow * newRow = createSplitRow (pos1_1, pos1_2, FORWARD, pos2_1, pos2_2, FORWARD, DELETION);
64+
return newRow;
65+
}
66+
else if( ( ptrPosMapSplit->posMap < ptrSplitRead->pos))
67+
{
68+
SplitRow * newRow = createSplitRow (pos1_1, pos1_2, FORWARD, pos2_1, pos2_2, FORWARD, DUPLICATION);
69+
return newRow;
2870
}
29-
p = strtok( NULL, "\t\n");
3071
}
72+
return NULL;
73+
}
74+
75+
int read_SplitReads(splitRead *ptrSoftClip, parameters *params, int chr_index)
76+
{
77+
float is_satellite = 0.0;
78+
SplitRow *newRow = NULL;
79+
posMapSplitRead *ptrPosMapSoftClip;
80+
81+
all_split_reads = ( SplitsInfo *) getMem( sizeof( struct SplitsInfo));
82+
all_split_reads->size = 0;
83+
all_split_reads->head = NULL;
84+
all_split_reads->tail = NULL;
85+
86+
while( ptrSoftClip != NULL)
87+
{
88+
ptrPosMapSoftClip = ptrSoftClip->ptrSplitMap;
89+
while( ptrPosMapSoftClip != NULL)
90+
{
91+
is_satellite = sonic_is_satellite( params->this_sonic, ptrSoftClip->chromosome_name, ptrSoftClip->pos, ptrSoftClip->pos + 1 ) +
92+
sonic_is_satellite( params->this_sonic, ptrSoftClip->chromosome_name, ptrPosMapSoftClip->posMap, ptrPosMapSoftClip->posMap + 1);
93+
94+
if ( is_satellite == 0 && ptrSoftClip->qual > params->mq_threshold && strcmp(ptrSoftClip->chromosome_name, params->this_sonic->chromosome_names[chr_index]) == 0
95+
&& ptrPosMapSoftClip->mapq > params->mq_threshold && ptrSoftClip->pos > 0 && ptrPosMapSoftClip->posMap > 0
96+
&& ptrSoftClip->pos < params->this_sonic->chromosome_lengths[chr_index] && ptrPosMapSoftClip->posMap < params->this_sonic->chromosome_lengths[chr_index])
97+
{
98+
newRow = determine_SvType(ptrSoftClip, ptrPosMapSoftClip);
99+
if( newRow != NULL)
100+
{
101+
/* For Deletion */
102+
if( newRow->svType == DELETION)
103+
{
104+
newRow->orientationLeft = FORWARD;
105+
newRow->orientationRight = REVERSE;
106+
newRow->locMapLeftStart -= SOFTCLIP_WRONGMAP_WINDOW;
107+
newRow->locMapLeftEnd -= SOFTCLIP_WRONGMAP_WINDOW;
108+
newRow->locMapRightStart += SOFTCLIP_WRONGMAP_WINDOW;
109+
newRow->locMapRightEnd += SOFTCLIP_WRONGMAP_WINDOW;
110+
}
111+
/* For Duplication */
112+
else if(newRow->svType == DUPLICATION)
113+
{
114+
newRow->orientationLeft = REVERSE;
115+
newRow->orientationRight = FORWARD;
116+
newRow->locMapLeftStart -= SOFTCLIP_WRONGMAP_WINDOW;
117+
newRow->locMapLeftEnd -= SOFTCLIP_WRONGMAP_WINDOW;
118+
newRow->locMapRightStart += SOFTCLIP_WRONGMAP_WINDOW;
119+
newRow->locMapRightEnd += SOFTCLIP_WRONGMAP_WINDOW;
120+
}
121+
else
122+
newRow = NULL;
123+
}
31124

32-
set_str( &( in_bam->sample_name), sample_name_buffer);
33-
free( tmp_header);
125+
if( newRow == NULL)
126+
;//fprintf( stderr, "ERROR loading divet from bam (soft clip)\n");
127+
else
128+
{
129+
if( all_split_reads->head == NULL || all_split_reads->tail == NULL)
130+
{
131+
all_split_reads->head = newRow;
132+
all_split_reads->tail = newRow;
133+
}
134+
else
135+
{
136+
all_split_reads->tail->next = newRow;
137+
all_split_reads->tail = newRow;
138+
}
139+
all_split_reads->size++;
140+
}
141+
}
142+
ptrPosMapSoftClip = ptrPosMapSoftClip->next;
143+
}
144+
ptrSoftClip = ptrSoftClip->next;
145+
}
146+
fprintf(stderr,"There are %d split reads\n",all_split_reads->size);
147+
return all_split_reads->size;
34148
}
35149

36-
void count_reads_bam( bam_info* in_bam, parameters* params)
150+
void count_reads_bam( bam_info* in_bam, parameters* params, int chr_index)
37151
{
38152
bam1_core_t bam_alignment_core;
39153
bam1_t* bam_alignment = bam_init1();
40154

155+
int return_type;
156+
41157
while( sam_itr_next( in_bam->bam_file, in_bam->iter, bam_alignment) > 0)
42158
{
43159
bam_alignment_core = bam_alignment->core;
44160

161+
if( sonic_is_satellite( params->this_sonic, params->this_sonic->chromosome_names[chr_index], bam_alignment_core.pos, bam_alignment_core.pos + 20) == 0
162+
&& bam_alignment_core.qual > params->mq_threshold && is_proper( bam_alignment_core.flag)
163+
&& bam_alignment_core.l_qseq > params->min_read_length)
164+
{
165+
if( bam_alignment_core.l_qseq > params->min_read_length)
166+
return_type = find_split_reads( in_bam, params, bam_alignment, chr_index);
167+
168+
if( return_type == -1)
169+
continue;
170+
}
171+
45172
/* Increase the read depth and read count for RD filtering */
46173
in_bam->read_depth_per_chr[bam_alignment_core.pos]++;
47174
in_bam->read_count++;
@@ -60,7 +187,7 @@ void read_bam( bam_info* in_bam, parameters *params)
60187
sprintf( svfile, "%s%s_svs.bed", params->outdir, params->outprefix);
61188
fprintf( stderr, "\nOutput SV file: %s\n", svfile);
62189
fpSVs = safe_fopen( svfile,"w");
63-
fprintf(fpSVs,"#CHR\tSTART_SV\tEND_SV\tSV_TYPE\tLIKELIHOOD\tCOPY_NUMBER\n");
190+
fprintf(fpSVs,"#CHR\tSTART_SV\tEND_SV\tSV_TYPE\tLIKELIHOOD\tCOPY_NUMBER\tREAD_PAIR\n");
64191

65192
sprintf( svfile_del, "%s%s_dels.bed", params->outdir, params->outprefix);
66193
fprintf( stderr, "Output Del file: %s\n", svfile_del);
@@ -112,6 +239,10 @@ void read_bam( bam_info* in_bam, parameters *params)
112239
exit( 1);
113240
}
114241

242+
243+
/* Extract the Sample Name from the header text */
244+
get_sample_name( in_bam, in_bam->bam_header->text);
245+
115246
fprintf( stderr, "\n");
116247
fprintf( stderr, "Reading BAM [%s] - Chromosome: %s", in_bam->sample_name, in_bam->bam_header->target_name[chr_index_bam]);
117248

@@ -121,7 +252,15 @@ void read_bam( bam_info* in_bam, parameters *params)
121252
init_rd_per_chr( in_bam, params, chr_index);
122253

123254
/* Read bam file for this chromosome */
124-
count_reads_bam( in_bam, params);
255+
count_reads_bam( in_bam, params, chr_index);
256+
257+
258+
fprintf( stderr, "\nReading the Reference Genome");
259+
readReferenceSeq(params, chr_index);
260+
261+
fprintf( stderr, "\nMapping the Splits\n");
262+
map_split_reads(in_bam, params, chr_index);
263+
125264

126265
/* Mean value (mu) calculation */
127266
calc_mean_per_chr( params, in_bam, chr_index);
@@ -141,7 +280,15 @@ void read_bam( bam_info* in_bam, parameters *params)
141280
if( not_in_bam == 1)
142281
continue;
143282

283+
//Load Split-Reads
284+
read_SplitReads(in_bam->listSplitRead, params, chr_index);
285+
286+
//fprintf( stderr, "\nLikelihood Estimation\n");
144287
find_SVs( in_bam, params, fpDel, fpDup, fpSVs, chr);
288+
free_hash_table(params);
289+
290+
/* Free the read depth array*/
291+
free( in_bam->read_depth_per_chr);
145292
}
146293
fprintf( stderr, "\n");
147294
fclose( fpDel);

bam_data.h

Lines changed: 3 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,27 +7,11 @@
77
#include <stdbool.h>
88
#include "common.h"
99

10+
extern long split_read_count;
11+
1012
/* Maximum sequence/quality length */
1113
#define MAX_SEQ 1000
12-
13-
/* Gender of the bam file */
14-
enum gender{ MALE, FEMALE};
15-
16-
typedef struct _bam_info
17-
{
18-
int read_count; /* total number of reads in this library */
19-
short* read_depth_per_chr; /* read depth */
20-
float mean;
21-
float mean_rd_per_gc[101]; /* GC percentages, i.e., GC[13]=323 means 343 windows have GC of 13% */
22-
23-
htsFile* bam_file; /* file pointer to the BAM file */
24-
hts_idx_t* bam_file_index;
25-
hts_itr_t *iter;
26-
bam_hdr_t* bam_header;
27-
28-
enum gender sample_gender; /* gender of the sample */
29-
char* sample_name; /* name of the sample, parsed from SM in the BAM header */
30-
} bam_info;
14+
#define SOFTCLIP_WRONGMAP_WINDOW 20
3115

3216

3317
/* Function Prototypes */
@@ -36,8 +20,4 @@ void print_bam( bam_info* in_bam);
3620
void read_bam( bam_info* in_bam, parameters *params);
3721

3822

39-
/* BAM Utility functions */
40-
void get_sample_name( bam_info* in_bam, char* header_text);
41-
42-
4323
#endif

cmdline.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,14 @@ int parse_command_line( int argc, char** argv, parameters* params)
2727
{"help" , no_argument, 0, 'h'},
2828
{"version", no_argument, 0, 'v'},
2929
{"sv-size" , required_argument, 0, 'l'},
30+
{"min-read-length" , required_argument, 0, 'b'},
3031
{"out" , required_argument, 0, 'o'},
3132
{"sonic" , required_argument, 0, 's'},
3233
{"sonic-info" , required_argument, 0, 'n'},
3334
{"first-chrom", required_argument, 0, FIRST_CHROM},
3435
{"last-chrom", required_argument, 0, LAST_CHROM},
3536
{"rd", required_argument, 0, 'a'},
37+
{"rp", required_argument, 0, 'j'},
3638
{"mq", required_argument, 0, 'e'},
3739
{0 , 0, 0, 0 }
3840
};
@@ -81,6 +83,10 @@ int parse_command_line( int argc, char** argv, parameters* params)
8183
params->min_sv_size = atoi( optarg);
8284
break;
8385

86+
case 'b':
87+
params->min_read_length = atoi( optarg);
88+
break;
89+
8490
case 'h':
8591
print_help();
8692
return 0;
@@ -143,13 +149,21 @@ int parse_command_line( int argc, char** argv, parameters* params)
143149
}
144150

145151
if( threshold == NULL)
146-
params->rd_threshold = 100;
152+
params->rd_threshold = 1000;
147153
else
148154
{
149155
params->rd_threshold = atoi(threshold);
150156
free( threshold);
151157
}
152158

159+
if( rp_support == NULL)
160+
params->rp_support = 2;
161+
else
162+
{
163+
params->rp_support = atoi(rp_support);
164+
free( threshold);
165+
}
166+
153167
if( mapping_qual == NULL)
154168
params->mq_threshold = 5;
155169
else

0 commit comments

Comments
 (0)