Skip to content

Commit

Permalink
Merge pull request #45 from cBioPortal/inc-tab-delimited-uploader
Browse files Browse the repository at this point in the history
(1/7) RFC 79: Implement incremental upload of tab delimited data.
  • Loading branch information
forus authored Jun 19, 2024
2 parents 2e80b73 + 18dbdd3 commit 52714d6
Show file tree
Hide file tree
Showing 55 changed files with 1,739 additions and 494 deletions.
40 changes: 28 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,34 @@ This repo contains:
## Inclusion in main codebase
The `cbioportal-core` code is currently included in the final Docker image during the Docker build process: https://github.com/cBioPortal/cbioportal/blob/master/docker/web-and-data/Dockerfile#L48

## Running in docker

Build docker image with:
```bash
docker build -t cbioportal-core .
```

Example of how to start loading of the whole study:
```bash
docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -s /data/study_es_0 -p /data/api_json -o
```

### Incremental upload of data

To add or update specific patient, sample, or molecular data in an already loaded study, you can perform an incremental upload. This process is quicker than reloading the entire study.

To execute an incremental upload, use the `-d` (or `--data_directory`) option instead of `-s` (or `--study_directory`). Here is an example command:
```bash
docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -d /data/study_es_0_inc -p /data/api_json -o
```
**Note:**
While the directory should adhere to the standard cBioPortal file formats and study structure, please note the following specific guidelines for incremental uploads:

- Incremental uploads are not supported for all data types. For instance, uploading study metadata, resources, or GSVA data incrementally is currently unsupported.
- Data files should contain only the patient or sample entries that are new or need to be updated; entries that are already loaded and unchanged should be omitted.

This method ensures efficient updates without the need for complete study reuploads, saving time and computational resources.

## How to run integration tests

This section guides you through the process of running integration tests by setting up a cBioPortal MySQL database environment using Docker. Please follow these steps carefully to ensure your testing environment is configured correctly.
Expand Down Expand Up @@ -119,15 +147,3 @@ The script will search for `core-*.jar` in the root of the project:
python scripts/importer/metaImport.py -s tests/test_data/study_es_0 -p tests/test_data/api_json_unit_tests -o
```

## Running in docker

Build docker image with:
```bash
docker build -t cbioportal-core .
```

Example of how to start the loading:
```bash
docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -s /data/study_es_0 -p /data/api_json -o
```

3 changes: 3 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,9 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.21.0</version>
<configuration>
<trimStackTrace>false</trimStackTrace>
</configuration>
<executions>
<execution>
<id>default-test</id>
Expand Down
4 changes: 3 additions & 1 deletion scripts/importer/cbioportalImporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,8 @@ def import_incremental_data(jvm_args, data_directory, update_generic_assay_entit
Load all data types that are available and support incremental upload
"""
for meta_file_type in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES:
if meta_file_type not in meta_file_type_to_meta_files:
continue
meta_pairs = meta_file_type_to_meta_files[meta_file_type]
for meta_pair in meta_pairs:
meta_filename, meta_dictionary = meta_pair
Expand Down Expand Up @@ -651,5 +653,5 @@ def main(args):
# ready to roll

if __name__ == '__main__':
parsed_args = interface(args)
parsed_args = interface()
main(parsed_args)
11 changes: 10 additions & 1 deletion scripts/importer/cbioportal_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,16 @@ class MetaFileTypes(object):
MetaFileTypes.PATIENT_ATTRIBUTES,
MetaFileTypes.SAMPLE_ATTRIBUTES,
MetaFileTypes.MUTATION,
# TODO Add more types here as incremental upload is enabled
MetaFileTypes.MUTATION_UNCALLED,
MetaFileTypes.EXPRESSION,
MetaFileTypes.CNA_DISCRETE,
MetaFileTypes.CNA_CONTINUOUS,
MetaFileTypes.CNA_LOG2,
MetaFileTypes.METHYLATION,
MetaFileTypes.PROTEIN,
MetaFileTypes.GENERIC_ASSAY_CONTINUOUS,
MetaFileTypes.GENERIC_ASSAY_BINARY,
MetaFileTypes.GENERIC_ASSAY_CATEGORICAL,
]

IMPORTER_CLASSNAME_BY_META_TYPE = {
Expand Down
27 changes: 26 additions & 1 deletion src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,32 @@ private static long addCnaEventDirectly(CnaEvent cnaEvent) throws DaoException {
JdbcUtil.closeAll(DaoCnaEvent.class, con, pstmt, rs);
}
}


public static void removeSampleCnaEvents(int cnaProfileId, List<Integer> sampleIds) throws DaoException {
Connection con = null;
PreparedStatement pstmt = null;
ResultSet rs = null;
try {
con = JdbcUtil.getDbConnection(DaoCnaEvent.class);
pstmt = con.prepareStatement
("DELETE sample_cna_event, alteration_driver_annotation" +
" FROM sample_cna_event" +
" LEFT JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" +
" WHERE sample_cna_event.`GENETIC_PROFILE_ID` = ? AND sample_cna_event.`SAMPLE_ID` IN (" +
String.join(",", Collections.nCopies(sampleIds.size(), "?"))
+ ")");
pstmt.setInt(1, cnaProfileId);
for (int i = 0; i < sampleIds.size(); i++) {
pstmt.setInt(i + 2, sampleIds.get(i));
}
pstmt.executeUpdate();
} catch (SQLException e) {
throw new DaoException(e);
} finally {
JdbcUtil.closeAll(DaoCnaEvent.class, con, pstmt, rs);
}
}

public static Map<Sample, Set<Long>> getSamplesWithAlterations(
Collection<Long> eventIds) throws DaoException {
return getSamplesWithAlterations(StringUtils.join(eventIds, ","));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.mskcc.cbio.portal.model.CanonicalGene;
import org.mskcc.cbio.portal.util.EntrezValidator;
import org.mskcc.cbio.portal.util.ProgressMonitor;

/**
Expand Down Expand Up @@ -322,7 +323,7 @@ public List<CanonicalGene> guessGene(String geneId, String chr) {
}

CanonicalGene gene;
if (geneId.matches("[0-9]+")) { // likely to be a entrez gene id
if (EntrezValidator.isaValidEntrezId(geneId)) { // likely to be a entrez gene id
gene = getGene(Integer.parseInt(geneId));
if (gene!=null) {
return Collections.singletonList(gene);
Expand Down
22 changes: 13 additions & 9 deletions src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,10 @@ private DaoGeneticAlteration() {
* Gets Instance of Dao Object. (Singleton pattern).
*
* @return DaoGeneticAlteration Object.
* @throws DaoException Dao Initialization Error.
*/
public static DaoGeneticAlteration getInstance() throws DaoException {
public static DaoGeneticAlteration getInstance() {
if (daoGeneticAlteration == null) {
daoGeneticAlteration = new DaoGeneticAlteration();

}

return daoGeneticAlteration;
Expand All @@ -96,7 +94,7 @@ public int addGeneticAlterations(int geneticProfileId, long entrezGeneId, String
throws DaoException {
return addGeneticAlterationsForGeneticEntity(geneticProfileId, DaoGeneOptimized.getGeneticEntityId(entrezGeneId), values);
}

public int addGeneticAlterationsForGeneticEntity(int geneticProfileId, int geneticEntityId, String[] values)
throws DaoException {

Expand Down Expand Up @@ -239,8 +237,8 @@ public HashMap<Integer,HashMap<Integer, String>> getGeneticAlterationMapForEntit
int geneticEntityId = rs.getInt("GENETIC_ENTITY_ID");
String values = rs.getString("VALUES");
//hm.debug..
String valueParts[] = values.split(DELIM);
for (int i=0; i<valueParts.length; i++) {
String valueParts[] = values.split(DELIM, -1);
for (int i = 0; i < orderedSampleList.size(); i++) {
String value = valueParts[i];
Integer sampleId = orderedSampleList.get(i);
mapSampleValue.put(sampleId, value);
Expand Down Expand Up @@ -292,7 +290,11 @@ public static ArrayList<ObjectNode> getProcessedAlterationData(
rs = pstmt.executeQuery();
while (rs.next()) {
long entrezGeneId = DaoGeneOptimized.getEntrezGeneId(rs.getInt("GENETIC_ENTITY_ID"));
String[] values = rs.getString("VALUES").split(DELIM);
String valuesString = rs.getString("VALUES");
if (valuesString.endsWith(DELIM)) {
valuesString = valuesString.substring(0, valuesString.length() - DELIM.length());
}
String[] values = valuesString.split(DELIM, -1);
ObjectNode datum = processor.process(
entrezGeneId,
values,
Expand Down Expand Up @@ -427,17 +429,19 @@ public int getCount() throws DaoException {
* Deletes all Genetic Alteration Records associated with the specified Genetic Profile ID.
*
* @param geneticProfileId Genetic Profile ID.
* @param geneticEntityId Genetic Entity ID.
* @throws DaoException Database Error.
*/
public void deleteAllRecordsInGeneticProfile(long geneticProfileId) throws DaoException {
public void deleteAllRecordsInGeneticProfile(long geneticProfileId, long geneticEntityId) throws DaoException {
Connection con = null;
PreparedStatement pstmt = null;
ResultSet rs = null;
try {
con = JdbcUtil.getDbConnection(DaoGeneticAlteration.class);
pstmt = con.prepareStatement("DELETE from " +
"genetic_alteration WHERE GENETIC_PROFILE_ID=?");
"genetic_alteration WHERE GENETIC_PROFILE_ID=? and GENETIC_ENTITY_ID=?");
pstmt.setLong(1, geneticProfileId);
pstmt.setLong(2, geneticEntityId);
pstmt.executeUpdate();
} catch (SQLException e) {
throw new DaoException(e);
Expand Down
21 changes: 18 additions & 3 deletions src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@
import java.sql.*;
import java.util.*;
import javax.sql.DataSource;
import org.apache.commons.dbcp2.BasicDataSource;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.mskcc.cbio.portal.util.*;
import org.springframework.jdbc.datasource.DataSourceTransactionManager;
import org.springframework.jdbc.datasource.TransactionAwareDataSourceProxy;
import org.springframework.transaction.support.TransactionTemplate;

/**
* Connection Utility for JDBC.
Expand All @@ -50,24 +52,37 @@ public class JdbcUtil {
private static DataSource dataSource;
private static Map<String,Integer> activeConnectionCount = new HashMap<String,Integer>(); // keep track of the number of active connection per class/requester
private static final Logger LOG = LoggerFactory.getLogger(JdbcUtil.class);
private static DataSourceTransactionManager transactionManager;
private static TransactionTemplate transactionTemplate;

/**
* Gets the data source
* @return the data source
*/
public static DataSource getDataSource() {
if (dataSource == null) {
dataSource = new JdbcDataSource();
dataSource = new TransactionAwareDataSourceProxy(new JdbcDataSource());
initSpringTx();
}
return dataSource;
}

private static void initSpringTx() {
transactionManager = new DataSourceTransactionManager(dataSource);
transactionTemplate = new TransactionTemplate(transactionManager);
}

    /**
     * Sets the data source (e.g. from tests or an external container) and
     * re-initializes the Spring transaction manager/template so they are
     * bound to the new source.
     * @param value the data source
     */
    public static void setDataSource(DataSource value) {
        dataSource = value;
        initSpringTx();
    }

    /**
     * Gets the shared Spring transaction template bound to this data source.
     * @return the transaction template; null until the data source has been
     *         initialized via {@link #getDataSource()} or {@link #setDataSource}
     */
    public static TransactionTemplate getTransactionTemplate() {
        return transactionTemplate;
    }

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,9 @@ public boolean store(
}
}

public boolean isImportedAlready(CanonicalGene gene) {
return importSetOfGenes.contains(gene.getEntrezGeneId());
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ public static void importHGNCData(File geneFile, String genomeBuild) throws IOEx
ProgressMonitor.incrementCurValue();
ConsoleUtil.showProgress();
String parts[] = line.split("\t", -1); // include trailing empty strings
if (!parts[0].matches("[0-9]+")) {
if (!EntrezValidator.isaValidEntrezId(parts[0])) {
ProgressMonitor.logWarning("Skipping gene with invalid entrez gene id '" + parts[1] + "'");
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,19 @@
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.cbioportal.model.EntityType;
import org.cbioportal.model.GenericEntityProperty;
import org.cbioportal.model.GeneticEntity;
import org.mskcc.cbio.portal.dao.DaoGenericAssay;
import org.mskcc.cbio.portal.dao.DaoGeneticEntity;
import org.mskcc.cbio.portal.model.GeneticAlterationType;
import org.mskcc.cbio.portal.util.FileUtil;
import org.mskcc.cbio.portal.util.ProgressMonitor;

import joptsimple.OptionParser;
Expand Down Expand Up @@ -160,7 +163,6 @@ public static void startImport(OptionSet options, OptionSpec<String> data, Optio
* @throws Exception
*/
public static void importData(File dataFile, GeneticAlterationType geneticAlterationType, String additionalProperties, boolean updateInfo) throws Exception {

ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile.getCanonicalPath());

// read generic assay data file
Expand All @@ -186,6 +188,10 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera
currentLine = buf.readLine();

while (currentLine != null) {
if (!FileUtil.isInfoLine(currentLine)) {
currentLine = buf.readLine();
continue;
}

String[] parts = currentLine.split("\t");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ public ImportGenericAssayPatientLevelData(File dataFile, String targetLine, int
* @throws IOException IO Error.
* @throws DaoException Database Error.
*/
public void importData(int numLines) throws IOException, DaoException {
public void importData() throws IOException, DaoException {
int numLines = FileUtil.getNumLines(dataFile);

geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId);

Expand Down
Loading

0 comments on commit 52714d6

Please sign in to comment.