This repository was archived by the owner on Nov 9, 2019. It is now read-only.

Commit 8643640

Merge pull request #532 from broadinstitute/db_gc
GC bias correction. Closes #226.
2 parents: e73b005 + ea0214f

File tree: 10 files changed (+445, -15 lines)


src/main/java/org/broadinstitute/hellbender/tools/exome/NormalizeSomaticReadCounts.java

Lines changed: 2 additions & 2 deletions
@@ -214,8 +214,8 @@ private TargetCollection<? extends BEDFeature> readTargetCollection(final File t
     * @param <E> the element type of {@code exonCollection}, it must be a {@link BEDFeature}.
     * @return never {@code null}.
     * @throws UserException.CouldNotReadInputFile if there was some problem
-    * trying to read from {@code readCountsFile}.
-    * @throws UserException.BadInput if there is some format issue with the input file {@code readCountsFile},
+    * trying to read from {@code inputReadCountsFile}.
+    * @throws UserException.BadInput if there is some format issue with the input file {@code inputReadCountsFile},
     * or it does not contain target names and {@code exonCollection} is {@code null}
     * or the input read counts contain more than one sample.
     */

src/main/java/org/broadinstitute/hellbender/tools/exome/TargetAnnotationCollection.java

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 *
 * @author Valentin Ruano-Rubio &lt;[email protected]&gt;
 */
-final class TargetAnnotationCollection {
+public final class TargetAnnotationCollection {

    private final EnumMap<TargetAnnotation, String> values = new EnumMap<TargetAnnotation, String>(TargetAnnotation.class);

src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java (new file)

Lines changed: 98 additions & 0 deletions

package org.broadinstitute.hellbender.tools.exome.gcbias;

import org.broadinstitute.hellbender.cmdline.*;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.exome.*;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Correct coverage for sample-specific GC bias effects as described in {@link GCCorrector}.
 *
 * Inputs are
 * 1. read counts file (format described in {@link ReadCountCollectionUtils}). Since this tool corrects
 * for multiplicative GC bias effects, these counts should not be in log space, i.e. they should represent
 * raw coverage or proportional coverage upstream of tangent normalization.
 * 2. targets file containing GC content annotation as produced by {@link AnnotateTargets}. Every target in the input
 * read counts must be present in the targets file, but the reverse need not be true.
 *
 * Output is a read counts file with the same targets (rows) and samples (columns) as the input, corrected for GC bias.
 * Coverage is represented by doubles in {@link ReadCountCollection}.
 *
 * @author David Benjamin &lt;[email protected]&gt;
 */
@CommandLineProgramProperties(
        oneLineSummary = "Correct for per-sample GC bias effects.",
        summary = "Correct coverage in a read counts file by estimating per-sample bias as a function of target GC" +
                " content and dividing input coverage by these derived bias curves.",
        programGroup = CopyNumberProgramGroup.class
)
public class CorrectGCBias extends CommandLineProgram {
    public static final String INPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.INPUT_LONG_NAME;
    public static final String INPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.INPUT_SHORT_NAME;

    public static final String OUTPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.OUTPUT_LONG_NAME;
    public static final String OUTPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.OUTPUT_SHORT_NAME;

    @Argument(
            doc = "Read counts input file.",
            shortName = INPUT_READ_COUNTS_FILE_SHORT_NAME,
            fullName = INPUT_READ_COUNTS_FILE_LONG_NAME,
            optional = false
    )
    protected File inputReadCountsFile;

    @Argument(
            doc = "Output file of GC-corrected read counts.",
            shortName = OUTPUT_READ_COUNTS_FILE_SHORT_NAME,
            fullName = OUTPUT_READ_COUNTS_FILE_LONG_NAME,
            optional = false
    )
    protected File outputReadCountsFile;

    // arguments for GC-annotated targets
    @ArgumentCollection
    protected TargetArgumentCollection targetArguments = new TargetArgumentCollection();

    @Override
    protected Object doWork() {
        final TargetCollection<Target> targets = targetArguments.readTargetCollection(false);
        final ReadCountCollection inputCounts = readInputCounts(targets);
        final double[] gcContentByTarget = gcContentsOfTargets(inputCounts, targets);
        final ReadCountCollection outputCounts = GCCorrector.correctCoverage(inputCounts, gcContentByTarget);
        writeOutputCounts(outputCounts);
        return "Success";
    }

    private ReadCountCollection readInputCounts(final TargetCollection<Target> targets) {
        final ReadCountCollection inputCounts;
        try {
            inputCounts = ReadCountCollectionUtils.parse(inputReadCountsFile, targets, false);
        } catch (final IOException ex) {
            throw new UserException.CouldNotReadInputFile(inputReadCountsFile, ex.getMessage(), ex);
        }
        return inputCounts;
    }

    private void writeOutputCounts(final ReadCountCollection outputCounts) {
        try {
            ReadCountCollectionUtils.write(outputReadCountsFile, outputCounts);
        } catch (final IOException ex) {
            throw new UserException.CouldNotCreateOutputFile(outputReadCountsFile, ex);
        }
    }

    private double[] gcContentsOfTargets(final ReadCountCollection inputCounts, final TargetCollection<Target> targets) {
        final List<Target> annotatedTargets = inputCounts.targets().stream().map(t -> targets.target(t.getName())).collect(Collectors.toList());
        if (!annotatedTargets.stream().allMatch(t -> t.getAnnotations().hasAnnotation(TargetAnnotation.GC_CONTENT))) {
            throw new UserException.BadInput("At least one target lacks a GC annotation.");
        }
        return annotatedTargets.stream().mapToDouble(t -> t.getAnnotations().getDouble(TargetAnnotation.GC_CONTENT)).toArray();
    }

}

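For orientation, a hedged sketch of how this tool might be driven programmatically follows; it is not part of this commit. The -I and -O spellings come from StandardArgumentDefinitions, but the name of the targets argument is defined by TargetArgumentCollection and is assumed here to be "--targets", the file names are placeholders, and instanceMain is assumed to be the usual hellbender CommandLineProgram entry point.

import org.broadinstitute.hellbender.tools.exome.gcbias.CorrectGCBias;

public class CorrectGCBiasInvocationSketch {
    public static void main(String[] args) {
        // Hypothetical arguments: -I/-O follow StandardArgumentDefinitions; "--targets" is an
        // assumed spelling for the TargetArgumentCollection argument; file names are placeholders.
        final String[] toolArgs = {
                "-I", "proportional_coverage.tsv",            // read counts, not in log space
                "-O", "gc_corrected_coverage.tsv",            // same targets/samples, GC-corrected
                "--targets", "targets_with_gc_annotation.tsv" // e.g. output of AnnotateTargets
        };
        new CorrectGCBias().instanceMain(toolArgs);           // assumed CommandLineProgram entry point
    }
}
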
src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrector.java (new file)

Lines changed: 125 additions & 0 deletions

package org.broadinstitute.hellbender.tools.exome.gcbias;

import org.apache.commons.math3.linear.ArrayRealVector;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.RealVector;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.broadinstitute.hellbender.tools.exome.ReadCountCollection;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * Learn multiplicative correction factors as a function of GC content from coverage vs. GC data. Basically, learn a
 * regression curve of coverage vs. GC, and divide by that curve to get GC-corrected coverage.
 *
 * Our regression curve is obtained by filling GC content bins of width 0.01 with the coverages of targets corresponding to each GC
 * and taking the median to get a robust estimate of the curve. In order to smooth out bins with few data (i.e. extreme
 * GC values that occur rarely) we then convolve these medians with an exponential kernel.
 *
 * @author David Benjamin &lt;[email protected]&gt;
 */
class GCCorrector {
    // GC bins are 0%, 1% . . . 100%
    private final static int NUMBER_OF_GC_BINS = 101;

    // scale (in units of GC content from 0 to 1) over which gc bias correlation decays
    // i.e. the bias at GC content = 0.3 and at 0.2 are correlated ~exp(-0.1/correlationLength)
    // this is effectively a smoothness parameter used for regularizing the GC bias estimate for
    // GC content bins that have few targets
    private static final double correlationLength = 0.02;
    private static final double correlationDecayRatePerBin = 1.0 / (correlationLength * NUMBER_OF_GC_BINS);

    // multiply by these to get a GC correction as a function of GC
    private final double[] gcCorrectionFactors;

    // Apache commons median doesn't work on empty arrays; this value is a placeholder to avoid exceptions
    private static final double DUMMY_VALUE_NEVER_USED = 1.0;

    /**
     * Learn multiplicative correction factors as a function of GC from coverage vs. GC data. Basically, learn a
     * regression curve of coverage vs. GC in order to divide by that curve later.
     *
     * @param gcContents GC content (from 0.0 to 1.0) of targets in {@code coverage}
     * @param coverage raw or proportional coverage
     */
    public GCCorrector(final double[] gcContents, final RealVector coverage) {
        Utils.nonNull(gcContents);
        Utils.nonNull(coverage);
        Utils.validateArg(gcContents.length > 0, "must have at least one datum");
        Utils.validateArg(gcContents.length == coverage.getDimension(), "must have one gc value per coverage.");

        final List<List<Double>> coveragesByGC = new ArrayList<>(NUMBER_OF_GC_BINS);
        IntStream.range(0, NUMBER_OF_GC_BINS).forEach(n -> coveragesByGC.add(new ArrayList<>()));
        IntStream.range(0, gcContents.length).forEach(n -> coveragesByGC.get(gcContentToBinIndex(gcContents[n])).add(coverage.getEntry(n)));
        gcCorrectionFactors = calculateCorrectionFactors(coveragesByGC);
    }

    /**
     * As described above, calculate medians of each GC bin and convolve with an exponential kernel.
     *
     * @param coveragesByGC list of coverages for each GC bin
     * @return multiplicative correction factors for each GC bin
     */
    private double[] calculateCorrectionFactors(final List<List<Double>> coveragesByGC) {
        final RealVector medians = new ArrayRealVector(coveragesByGC.stream().mapToDouble(GCCorrector::medianOrDefault).toArray());
        return IntStream.range(0, NUMBER_OF_GC_BINS).mapToDouble(bin -> {
            final RealVector weights = new ArrayRealVector(IntStream.range(0, NUMBER_OF_GC_BINS)
                    .mapToDouble(n -> coveragesByGC.get(n).size() * Math.exp(-Math.abs(bin - n) * correlationDecayRatePerBin)).toArray());
            return weights.dotProduct(medians) / weights.getL1Norm();
        }).map(x -> 1/x).toArray();
    }

    /**
     *
     * @param inputCounts raw coverage before GC correction
     * @param gcContentByTarget array of gc contents, one per target of the input
     * @return GC-corrected coverage
     */
    public static ReadCountCollection correctCoverage(final ReadCountCollection inputCounts, final double[] gcContentByTarget) {
        // each column (sample) has its own GC bias curve, hence its own GC corrector
        final List<GCCorrector> gcCorrectors = IntStream.range(0, inputCounts.columnNames().size())
                .mapToObj(n -> new GCCorrector(gcContentByTarget, inputCounts.counts().getColumnVector(n))).collect(Collectors.toList());

        // gc correct a copy of the input counts in-place
        final RealMatrix correctedCounts = inputCounts.counts().copy();
        correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
            @Override
            public double visit(int target, int column, double coverage) {
                return gcCorrectors.get(column).correctedCoverage(coverage, gcContentByTarget[target]);
            }
        });

        // we would like the average correction factor to be 1.0 in the sense that average coverage before and after
        // correction should be equal
        final double[] columnNormalizationFactors = IntStream.range(0, inputCounts.columnNames().size())
                .mapToDouble(c -> inputCounts.counts().getColumnVector(c).getL1Norm() / correctedCounts.getColumnVector(c).getL1Norm()).toArray();
        correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
            @Override
            public double visit(int target, int column, double coverage) {
                return coverage * columnNormalizationFactors[column];
            }
        });

        return new ReadCountCollection(inputCounts.targets(), inputCounts.columnNames(), correctedCounts);
    }

    private double correctedCoverage(final double coverage, final double gcContent) {
        return gcCorrectionFactors[gcContentToBinIndex(gcContent)] * coverage;
    }

    // return a median of coverages or dummy default value if no coverage exists at this gc bin
    // this default is never used because empty bins get zero weight in {@code calculateCorrectionFactors}
    private static double medianOrDefault(final List<Double> list) {
        return list.size() > 0 ? new Median().evaluate(list.stream().mapToDouble(d->d).toArray()) : DUMMY_VALUE_NEVER_USED;
    }

    private static int gcContentToBinIndex(final double gcContent) {
        return (int) Math.round(gcContent * (NUMBER_OF_GC_BINS-1));
    }
}

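To make the smoothing concrete: the correction factor for GC bin b is the reciprocal of a weighted average of the per-bin coverage medians, where bin n contributes weight (number of targets in bin n) * exp(-|b - n| * correlationDecayRatePerBin), so sparsely populated bins borrow strength from well-populated neighbors. Below is a minimal usage sketch; it assumes the snippet sits in the same gcbias package (the class is package-private) and uses the Target, SimpleInterval, and ReadCountCollection constructors as they appear elsewhere in this diff. The class name and the toy numbers are hypothetical.

package org.broadinstitute.hellbender.tools.exome.gcbias;

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.broadinstitute.hellbender.tools.exome.ReadCountCollection;
import org.broadinstitute.hellbender.tools.exome.Target;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class GCCorrectorUsageSketch {
    public static void main(String[] args) {
        // Two targets, one sample; rows are targets, columns are samples.
        final List<Target> targets = Arrays.asList(
                new Target("target_0", new SimpleInterval("1", 100, 200)),
                new Target("target_1", new SimpleInterval("1", 300, 400)));
        final RealMatrix counts = new Array2DRowRealMatrix(new double[][] {{100.0}, {150.0}});
        final ReadCountCollection coverage =
                new ReadCountCollection(targets, Collections.singletonList("sample_0"), counts);

        // One GC content value per target, in target order.
        final double[] gcContentByTarget = {0.35, 0.60};

        // Divide each sample's coverage by its learned GC bias curve, then rescale so the
        // total coverage per sample is unchanged.
        final ReadCountCollection corrected = GCCorrector.correctCoverage(coverage, gcContentByTarget);
        System.out.println(corrected.counts());
    }
}
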
src/test/java/org/broadinstitute/hellbender/fakedata/GCBiasSimulatedData.java (new file)

Lines changed: 77 additions & 0 deletions

package org.broadinstitute.hellbender.fakedata;

import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
import org.broadinstitute.hellbender.tools.exome.*;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * GC bias data consists of a {@link ReadCountCollection} and an array of gc contents
 * in order of the read counts' targets.
 *
 * @author David Benjamin &lt;[email protected]&gt;
 */
public class GCBiasSimulatedData {

    // a reasonable default GC bias curve
    private static Function<Double, Double> QUADRATIC_GC_BIAS_CURVE = gc -> 0.5 + 2*gc*(1-gc);

    // visible for the integration test
    public static Pair<ReadCountCollection, double[]> simulatedData(final int numTargets, final int numSamples) {
        final List<Target> phonyTargets = SimulatedTargets.phonyTargets(numTargets);
        final List<String> phonySamples = SimulatedSamples.phonySamples(numSamples);

        final Random random = new Random(13);
        final double[] gcContentByTarget = IntStream.range(0, numTargets)
                .mapToDouble(n -> 0.5 + 0.2*random.nextGaussian())
                .map(x -> Math.min(x,0.95)).map(x -> Math.max(x,0.05)).toArray();
        final double[] gcBiasByTarget = Arrays.stream(gcContentByTarget).map(QUADRATIC_GC_BIAS_CURVE::apply).toArray();

        // model mainly GC bias with a small random amount of non-GC bias
        // thus noise after GC correction should be nearly zero
        final RealMatrix counts = new Array2DRowRealMatrix(numTargets, numSamples);
        counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
            @Override
            public double visit(final int target, final int column, final double value) {
                return gcBiasByTarget[target]*(1.0 + 0.01*random.nextDouble());
            }
        });
        final ReadCountCollection rcc = new ReadCountCollection(phonyTargets, phonySamples, counts);
        return new ImmutablePair<>(rcc, gcContentByTarget);
    }

    /**
     *
     * @param readCountsFile A simulated read counts file with GC bias effects
     * @param targetsFile A simulated targets file with GC content annotation
     */
    public static void makeGCBiasInputFiles(final Pair<ReadCountCollection, double[]> data,
                                            final File readCountsFile, final File targetsFile) throws IOException {
        final ReadCountCollection inputCounts = data.getLeft();
        final double[] gcContentByTarget = data.getRight();
        ReadCountCollectionUtils.write(readCountsFile, inputCounts);

        final TargetTableWriter writer = new TargetTableWriter(targetsFile, Collections.singleton(TargetAnnotation.GC_CONTENT));
        for (int i = 0; i < gcContentByTarget.length; i++) {
            final Target unannotatedTarget = inputCounts.records().get(i).getTarget();
            final TargetAnnotationCollection annotations = new TargetAnnotationCollection();
            annotations.put(TargetAnnotation.GC_CONTENT, Double.toString(gcContentByTarget[i]));
            final Target target = new Target(unannotatedTarget.getName(), unannotatedTarget.getInterval(), annotations);
            writer.writeRecord(target);
        }
        writer.close();
    }
}

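The sketch below shows how a test might wire these helpers together: simulate counts shaped by the quadratic GC bias curve above, write the two input files, and hand them to CorrectGCBias. The commit's actual integration test is not shown on this page, so the class name and assertions here are illustrative only; everything it calls appears in this diff or in the standard JDK.

import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.fakedata.GCBiasSimulatedData;
import org.broadinstitute.hellbender.tools.exome.ReadCountCollection;

import java.io.File;
import java.io.IOException;

public class GCBiasSimulatedDataUsageSketch {
    public static void main(String[] args) throws IOException {
        // 1000 targets, 2 samples, with coverage dominated by the simulated GC bias.
        final Pair<ReadCountCollection, double[]> data = GCBiasSimulatedData.simulatedData(1000, 2);

        // Write the two inputs expected by CorrectGCBias: a read counts file and a
        // GC-annotated targets file.
        final File readCountsFile = File.createTempFile("sim-counts", ".tsv");
        final File targetsFile = File.createTempFile("sim-targets", ".tsv");
        GCBiasSimulatedData.makeGCBiasInputFiles(data, readCountsFile, targetsFile);

        // A real test would now run CorrectGCBias on these files and check that the
        // corrected coverage is nearly flat across targets.
    }
}
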
src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedSamples.java (new file)

Lines changed: 15 additions & 0 deletions

package org.broadinstitute.hellbender.fakedata;

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * @author David Benjamin &lt;[email protected]&gt;
 */
public class SimulatedSamples {
    public static List<String> phonySamples(final int numSamples) {
        return IntStream.range(0, numSamples)
                .mapToObj(n -> String.format("Sample%d", n)).collect(Collectors.toList());
    }
}

src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedTargets.java (new file)

Lines changed: 23 additions & 0 deletions

package org.broadinstitute.hellbender.fakedata;

import org.broadinstitute.hellbender.tools.exome.Target;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * @author David Benjamin &lt;[email protected]&gt;
 */
public class SimulatedTargets {

    // phony targets whose intervals don't really matter
    public static List<Target> phonyTargets(final int numTargets) {
        return IntStream.range(0, numTargets).mapToObj(n -> {
            final String name = String.format("Target%d", n);
            final SimpleInterval interval = new SimpleInterval("chr", 2*n + 1, 2*n + 2);
            return new Target(name, interval);
        }).collect(Collectors.toList());
    }
}


src/test/java/org/broadinstitute/hellbender/tools/exome/CombineReadCountsIntegrationTest.java

Lines changed: 3 additions & 12 deletions
@@ -4,6 +4,7 @@
 import org.broadinstitute.hellbender.CommandLineProgramTest;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.fakedata.SimulatedTargets;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.test.BaseTest;
 import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
@@ -45,12 +46,7 @@ public class CombineReadCountsIntegrationTest extends CommandLineProgramTest {
     @Test(expectedExceptions = UserException.class)
     public void testEmptyInputFileListProvided() throws Exception {
         @SuppressWarnings("serial")
-        final List<Target> phonyTargets = new ArrayList<Target>() {{
-            add(new Target("target_0",new SimpleInterval("1", 100, 200)));
-            add(new Target("target_1",new SimpleInterval("2", 100, 200)));
-            add(new Target("target_2",new SimpleInterval("3", 100, 200)));
-
-        }};
+        final List<Target> phonyTargets = SimulatedTargets.phonyTargets(3);
         final List<File> inputFiles = Collections.emptyList();
         final File targetFile = createTargetFile(phonyTargets);
         final File inputListFile = createInputListFile(inputFiles);
@@ -67,12 +63,7 @@ public void testEmptyInputFileListProvided() throws Exception {
     @Test(expectedExceptions = UserException.class)
     public void testNoInputFileProvided() throws Exception {
         @SuppressWarnings("serial")
-        final List<Target> phonyTargets = new ArrayList<Target>() {{
-            add(new Target("target_0",new SimpleInterval("1", 100, 200)));
-            add(new Target("target_1",new SimpleInterval("2", 100, 200)));
-            add(new Target("target_2",new SimpleInterval("3", 100, 200)));
-
-        }};
+        final List<Target> phonyTargets = SimulatedTargets.phonyTargets(3);
         final List<File> inputFiles = Collections.emptyList();
         final File targetFile = createTargetFile(phonyTargets);
         try {
