Skip to content

Commit 89ce6c2

Browse files
authored
Merge pull request #721 from opencb/TASK-7463
TASK-7463 Modify CellBase clinical variant builders
2 parents 5943afc + d1c4dc1 commit 89ce6c2

File tree

10 files changed

+163
-97
lines changed

10 files changed

+163
-97
lines changed

cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ private CellBaseBuilder buildConservation() {
367367

368368
private CellBaseBuilder buildClinicalVariants() {
369369
Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER);
370-
copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("clinvarVersion.json")));
370+
copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve(CLINVAR_VERSION_FILENAME)));
371371
copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("gwasVersion.json")));
372372

373373
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder,

cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,7 @@ private void loadClinical() throws FileNotFoundException {
484484

485485
// Update release (collection and sources)
486486
List<Path> sources = new ArrayList<>(Arrays.asList(
487-
input.resolve("clinvarVersion.json"),
487+
input.resolve(CLINVAR_VERSION_FILENAME),
488488
input.resolve("cosmicVersion.json"),
489489
input.resolve("gwasVersion.json")
490490
));

cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java

+6-3
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,18 @@ public class EtlCommons {
5656
public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json";
5757

5858
public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
59-
public static final String CLINVAR_VERSION = "2024-05";
60-
public static final String CLINVAR_DATE = "2024-05";
61-
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz";
59+
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease.xml.gz";
6260
public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv";
6361
public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz";
6462
public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz";
63+
public static final String CLINVAR_VERSION_FILENAME = "clinvarVersion.json";
64+
6565
public static final String IARCTP53_FILE = "IARC-TP53.zip";
6666
public static final String GWAS_FILE = "gwas_catalog.tsv";
6767
public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz";
68+
public static final String COSMIC_VERSION_FILENAME = "cosmicVersion.json";
69+
public static final String HGMD_VERSION_FILENAME = "hgmdVersion.json";
70+
6871
@Deprecated
6972
public static final String DBSNP_FILE = "GCF_000001405.40.gz";
7073
public static final String DBSNP_NAME = "dbSNP";

cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java

+28-14
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,13 @@
1717
package org.opencb.cellbase.lib.builders.clinical.variant;
1818

1919
import com.fasterxml.jackson.core.JsonProcessingException;
20+
import com.fasterxml.jackson.databind.ObjectMapper;
21+
import com.fasterxml.jackson.databind.ObjectReader;
2022
import org.apache.commons.lang3.StringUtils;
2123
import org.opencb.biodata.formats.variant.clinvar.rcv.ClinvarParser;
2224
import org.opencb.biodata.formats.variant.clinvar.rcv.v64jaxb.*;
2325
import org.opencb.biodata.models.variant.avro.*;
26+
import org.opencb.cellbase.core.models.DataReleaseSource;
2427
import org.opencb.cellbase.lib.EtlCommons;
2528
import org.opencb.cellbase.lib.variant.VariantAnnotationUtils;
2629
import org.opencb.commons.ProgressLogger;
@@ -41,8 +44,7 @@
4144
import java.util.stream.Collectors;
4245
import java.util.stream.Stream;
4346

44-
import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_DATE;
45-
import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION;
47+
import static org.opencb.cellbase.lib.EtlCommons.*;
4648

4749
//import org.opencb.biodata.formats.variant.clinvar.v24jaxb.*;
4850

@@ -83,6 +85,10 @@ public class ClinVarIndexer extends ClinicalIndexer {
8385
private final Path clinvarVariationAlleleFile;
8486
private final Path clinvarEFOFile;
8587
private final String assembly;
88+
89+
private String version;
90+
private String date;
91+
8692
private int numberSomaticRecords = 0;
8793
private int numberGermlineRecords = 0;
8894
private int numberNoDiseaseTrait = 0;
@@ -98,18 +104,32 @@ public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinva
98104
Path clinvarEFOFile, boolean normalize, Path genomeSequenceFilePath, String assembly,
99105
RocksDB rdb) throws IOException {
100106
super(genomeSequenceFilePath);
101-
this.rdb = rdb;
107+
102108
this.clinvarXMLFiles = clinvarXMLFiles;
103109
this.clinvarSummaryFile = clinvarSummaryFile;
104110
this.clinvarVariationAlleleFile = clinvarVariationAlleleFile;
105111
this.clinvarEFOFile = clinvarEFOFile;
106112
this.normalize = normalize;
107113
this.genomeSequenceFilePath = genomeSequenceFilePath;
108114
this.assembly = assembly;
115+
116+
this.rdb = rdb;
109117
}
110118

111119
public void index() throws RocksDBException {
112120
try {
121+
Path clinvarVersionPath = clinvarSummaryFile.getParent().resolve(CLINVAR_VERSION_FILENAME);
122+
if (!Files.exists(clinvarVersionPath)) {
123+
throw new IOException("ClinVar version file " + clinvarVersionPath + " does not exist");
124+
}
125+
ObjectMapper jsonObjectMapper = new ObjectMapper();
126+
ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataReleaseSource.class);
127+
DataReleaseSource dataReleaseSource = jsonObjectReader.readValue(clinvarVersionPath.toFile());
128+
129+
this.date = dataReleaseSource.getDate();
130+
this.version = dataReleaseSource.getVersion();
131+
132+
113133
Map<String, EFO> traitsToEfoTermsMap = loadEFOTerms();
114134
Map<String, List<AlleleLocationData>> rcvToAlleleLocationData = parseVariantSummary(traitsToEfoTermsMap);
115135

@@ -156,15 +176,9 @@ public boolean accept(File dir, String name) {
156176
}
157177
logger.info("Done");
158178
printSummary();
159-
} catch (RocksDBException e) {
160-
logger.error("Error reading/writing from/to the RocksDB index while indexing ClinVar");
161-
throw e;
162-
} catch (JAXBException e) {
163-
logger.error("Error unmarshalling clinvar Xml file: " + e.getMessage());
164-
e.printStackTrace();
165-
} catch (IOException e) {
166-
logger.error("Error indexing clinvar Xml file: " + e.getMessage());
167-
e.printStackTrace();
179+
} catch (RocksDBException | JAXBException | IOException e) {
180+
logger.error("Error indexing ClinVar", e);
181+
throw new RocksDBException(e.getMessage());
168182
}
169183
}
170184

@@ -331,7 +345,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, String variation
331345
String mateVariantString, String clinicalHaplotypeString,
332346
Map<String, EFO> traitsToEfoTermsMap) {
333347

334-
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE);
348+
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, date);
335349
// Create a set to avoid situations like germline;germline;germline
336350
List<AlleleOrigin> alleleOrigin = null;
337351
if (!EtlCommons.isMissing(lineFields[VARIANT_SUMMARY_ORIGIN_COLUMN])) {
@@ -412,7 +426,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu
412426
throws JsonProcessingException {
413427

414428
List<Property> additionalProperties = new ArrayList<>(3);
415-
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE);
429+
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, version, date);
416430
// String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();
417431

418432
VariantClassification variantClassification = getVariantClassification(

cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java

+55-4
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
2323
import org.opencb.cellbase.lib.EtlCommons;
2424
import org.opencb.cellbase.lib.builders.CellBaseBuilder;
25+
import org.opencb.commons.utils.FileUtils;
2526
import org.rocksdb.Options;
2627
import org.rocksdb.RocksDB;
2728
import org.rocksdb.RocksDBException;
2829
import org.rocksdb.RocksIterator;
2930

30-
import java.io.File;
31-
import java.io.IOException;
31+
import java.io.*;
3232
import java.nio.file.Files;
3333
import java.nio.file.Path;
3434
import java.nio.file.Paths;
@@ -124,8 +124,17 @@ public void parse() throws IOException, RocksDBException {
124124
if (this.clinvarXMLFile != null && this.clinvarSummaryFile != null
125125
&& this.clinvarVariationAlleleFile != null && Files.exists(clinvarXMLFile)
126126
&& Files.exists(clinvarSummaryFile) && Files.exists(clinvarVariationAlleleFile)) {
127-
ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile,
128-
clinvarVariationAlleleFile, clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb);
127+
128+
Path chunksPaths = clinvarXMLFile.getParent().resolve("clinvar_chunks");
129+
if (Files.notExists(chunksPaths)) {
130+
logger.info("Splitting ClinVar XML file in multiple ClinVar chunk files at {} ...", chunksPaths);
131+
Files.createDirectories(chunksPaths);
132+
splitClinvar(this.clinvarXMLFile, chunksPaths);
133+
logger.info("Done");
134+
}
135+
136+
ClinVarIndexer clinvarIndexer = new ClinVarIndexer(chunksPaths, clinvarSummaryFile, clinvarVariationAlleleFile,
137+
clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb);
129138
clinvarIndexer.index();
130139
} else {
131140
logger.warn("One or more of required ClinVar files are missing. Skipping ClinVar data.\n"
@@ -189,6 +198,48 @@ public void parse() throws IOException, RocksDBException {
189198

190199
}
191200

201+
private void splitClinvar(Path clinvarXmlFilePath, Path splitOutdirPath) throws IOException {
202+
BufferedReader br = FileUtils.newBufferedReader(clinvarXmlFilePath);
203+
PrintWriter pw = null;
204+
StringBuilder header = new StringBuilder();
205+
boolean beforeEntry = true;
206+
boolean inEntry = false;
207+
int count = 0;
208+
int chunk = 0;
209+
String line;
210+
while ((line = br.readLine()) != null) {
211+
if (line.trim().startsWith("<ClinVarSet ")) {
212+
inEntry = true;
213+
beforeEntry = false;
214+
if (count % 10000 == 0) {
215+
pw = new PrintWriter(new FileOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile()));
216+
pw.println(header.toString().trim());
217+
}
218+
count++;
219+
}
220+
221+
if (beforeEntry) {
222+
header.append(line).append("\n");
223+
}
224+
225+
if (inEntry) {
226+
pw.println(line);
227+
}
228+
229+
if (line.trim().startsWith("</ClinVarSet>")) {
230+
inEntry = false;
231+
if (count % 10000 == 0) {
232+
pw.print("</ReleaseSet>");
233+
pw.close();
234+
chunk++;
235+
}
236+
}
237+
}
238+
pw.print("</ReleaseSet>");
239+
pw.close();
240+
br.close();
241+
}
242+
192243
private void serializeRDB(RocksDB rdb) throws IOException {
193244
// DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's
194245
// named "iterator"

cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java

+26-10
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@
1616

1717
package org.opencb.cellbase.lib.builders.clinical.variant;
1818

19+
import com.fasterxml.jackson.databind.ObjectMapper;
20+
import com.fasterxml.jackson.databind.ObjectReader;
1921
import org.apache.commons.lang3.StringUtils;
2022
import org.opencb.biodata.models.variant.avro.*;
23+
import org.opencb.cellbase.core.models.DataReleaseSource;
2124
import org.opencb.cellbase.lib.EtlCommons;
2225
import org.opencb.cellbase.lib.variant.VariantAnnotationUtils;
2326
import org.opencb.commons.ProgressLogger;
@@ -27,12 +30,15 @@
2730

2831
import java.io.BufferedReader;
2932
import java.io.IOException;
33+
import java.nio.file.Files;
3034
import java.nio.file.Path;
3135
import java.text.NumberFormat;
3236
import java.util.*;
3337
import java.util.regex.Matcher;
3438
import java.util.regex.Pattern;
3539

40+
import static org.opencb.cellbase.lib.EtlCommons.COSMIC_VERSION_FILENAME;
41+
3642

3743
public class CosmicIndexer extends ClinicalIndexer {
3844

@@ -41,8 +47,6 @@ public class CosmicIndexer extends ClinicalIndexer {
4147
private Pattern mutationGRCh37GenomePositionPattern;
4248
private Pattern snvPattern;
4349

44-
private static final String COSMIC_VERSION = "v99";
45-
4650
private static final int GENE_NAMES_COLUMN = 0;
4751
private static final int HGNC_COLUMN = 3;
4852
private static final int PRIMARY_SITE_COLUMN = 7;
@@ -79,6 +83,9 @@ public class CosmicIndexer extends ClinicalIndexer {
7983

8084
private static final String VARIANT_STRING_PATTERN = "[ACGT]*";
8185

86+
private String date;
87+
private String version;
88+
8289
private int ignoredCosmicLines = 0;
8390
private long normaliseTime = 0;
8491
private int rocksDBNewVariants = 0;
@@ -101,9 +108,20 @@ private void init() {
101108
}
102109

103110
public void index() throws RocksDBException {
104-
logger.info("Parsing cosmic file ...");
105-
106111
try {
112+
Path cosmicVersionPath = cosmicFile.getParent().resolve(COSMIC_VERSION_FILENAME);
113+
if (!Files.exists(cosmicVersionPath)) {
114+
throw new IOException("COSMIC version file " + cosmicVersionPath + " does not exist");
115+
}
116+
ObjectMapper jsonObjectMapper = new ObjectMapper();
117+
ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataReleaseSource.class);
118+
DataReleaseSource dataReleaseSource = jsonObjectReader.readValue(cosmicVersionPath.toFile());
119+
120+
this.date = dataReleaseSource.getDate();
121+
this.version = dataReleaseSource.getVersion();
122+
123+
logger.info("Parsing cosmic file ...");
124+
107125
ProgressLogger progressLogger = new ProgressLogger("Parsed COSMIC lines:",
108126
() -> EtlCommons.countFileLines(cosmicFile), 200).setBatchSize(10000);
109127

@@ -168,11 +186,9 @@ public void index() throws RocksDBException {
168186
rocksDBUpdateVariants = numberVariantUpdates;
169187
}
170188
}
171-
} catch (RocksDBException e) {
172-
logger.error("Error reading/writing from/to the RocksDB index while indexing Cosmic");
173-
throw e;
174-
} catch (IOException ex) {
175-
ex.printStackTrace();
189+
} catch (RocksDBException | IOException e) {
190+
logger.error("Error indexing Cosmic", e);
191+
throw new RocksDBException(e.getMessage());
176192
} finally {
177193
logger.info("Done");
178194
this.printSummary();
@@ -469,7 +485,7 @@ private EvidenceEntry buildCosmic(String[] fields) {
469485
String id = fields[ID_COLUMN];
470486
String url = "https://cancer.sanger.ac.uk/cosmic/search?q=" + id;
471487

472-
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_DATA, COSMIC_VERSION, null);
488+
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_DATA, version, date);
473489
SomaticInformation somaticInformation = getSomaticInformation(fields);
474490
List<GenomicFeature> genomicFeatureList = getGenomicFeature(fields);
475491

cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/HGMDIndexer.java

+22-2
Original file line numberDiff line numberDiff line change
@@ -16,28 +16,37 @@
1616

1717
package org.opencb.cellbase.lib.builders.clinical.variant;
1818

19+
import com.fasterxml.jackson.databind.ObjectMapper;
20+
import com.fasterxml.jackson.databind.ObjectReader;
1921
import org.apache.commons.collections4.CollectionUtils;
2022
import org.apache.commons.collections4.MapUtils;
2123
import org.opencb.biodata.models.variant.Variant;
2224
import org.opencb.biodata.models.variant.VariantFileMetadata;
2325
import org.opencb.biodata.models.variant.avro.*;
2426
import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata;
2527
import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader;
28+
import org.opencb.cellbase.core.models.DataReleaseSource;
2629
import org.opencb.cellbase.lib.EtlCommons;
2730
import org.rocksdb.RocksDB;
2831
import org.rocksdb.RocksDBException;
2932

3033
import java.io.IOException;
34+
import java.nio.file.Files;
3135
import java.nio.file.Path;
3236
import java.util.*;
3337

38+
import static org.opencb.cellbase.lib.EtlCommons.HGMD_VERSION_FILENAME;
39+
3440
/**
3541
* Created by jtarraga on 23/02/22.
3642
*/
3743
public class HGMDIndexer extends ClinicalIndexer {
3844
private final Path hgmdFile;
3945
private final String assembly;
4046

47+
private String date;
48+
private String version;
49+
4150
public HGMDIndexer(Path hgmdFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb)
4251
throws IOException {
4352
super(genomeSequenceFilePath);
@@ -51,6 +60,18 @@ public void index() throws RocksDBException, IOException {
5160
logger.info("Parsing HGMD file ...");
5261

5362
try {
63+
64+
Path hgmdVersionPath = hgmdFile.getParent().resolve(HGMD_VERSION_FILENAME);
65+
if (!Files.exists(hgmdVersionPath)) {
66+
throw new IOException("HGMD version file " + hgmdVersionPath + " does not exist");
67+
}
68+
ObjectMapper jsonObjectMapper = new ObjectMapper();
69+
ObjectReader jsonObjectReader = jsonObjectMapper.readerFor(DataReleaseSource.class);
70+
DataReleaseSource dataReleaseSource = jsonObjectReader.readValue(hgmdVersionPath.toFile());
71+
72+
this.date = dataReleaseSource.getDate();
73+
this.version = dataReleaseSource.getVersion();
74+
5475
VariantStudyMetadata metadata = new VariantFileMetadata(null, hgmdFile.toString()).toVariantStudyMetadata("study");
5576
VariantVcfHtsjdkReader reader = new VariantVcfHtsjdkReader(hgmdFile.toAbsolutePath(), metadata);
5677
for (Variant variant : reader) {
@@ -74,7 +95,6 @@ public void index() throws RocksDBException, IOException {
7495
throw e;
7596
} finally {
7697
logger.info("Done");
77-
7898
// this.printSummary();
7999
}
80100
}
@@ -93,7 +113,7 @@ private void parseHgmdInfo(Variant variant) {
93113
}
94114

95115
// Source
96-
entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, "2020.3", "2020"));
116+
entry.setSource(new EvidenceSource(EtlCommons.HGMD_DATA, version, date));
97117

98118
// Assembly
99119
entry.setAssembly(assembly);

0 commit comments

Comments
 (0)