Skip to content

Commit 1819876

Browse files
authored
Merge pull request #158 from shntnu/db
feat: Add DuckDB database schema for JUMP metadata
2 parents 99b8501 + 4a52458 commit 1819876

File tree

6 files changed

+442
-101
lines changed

6 files changed

+442
-101
lines changed

.gitignore

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,4 +128,10 @@ dmypy.json
128128
# Pyre type checker
129129
.pyre/
130130

131-
.DS_Store
131+
.DS_Store
132+
133+
# Claude
134+
.claude
135+
CLAUDE.md
136+
.mcp.json
137+
*.duckdb

metadata/README.md

Lines changed: 120 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,98 +1,144 @@
11
# Metadata
22

3-
The metadata [schema](https://mermaid-js.github.io/mermaid/#/entityRelationshipDiagram?id=entity-relationship-diagrams) is shown below.
3+
This directory contains experimental metadata for the JUMP Cell Painting Consortium datasets.
44

5+
## Schema Overview
56

67
```mermaid
78
erDiagram
8-
WELL }|--|| PLATE : ""
9+
WELL }|--|| PLATE : "belongs to"
10+
WELL }o--|| PERTURBATION : "treated with"
911
WELL {
10-
string Metadata_Source "Data-generating center ID"
11-
string Metadata_Plate "Plate ID"
12-
string Metadata_Well "Well position"
13-
string Metadata_JCP2022 "JUMP Perturbation ID"
12+
string Metadata_Plate PK,FK
13+
string Metadata_Well PK
14+
string Metadata_JCP2022 FK
15+
string Metadata_Source
1416
}
17+
18+
PLATE }|--|| MICROSCOPE-CONFIG : "imaged with"
19+
PLATE }|--|| CELLPROFILER-VERSION : "analyzed with"
1520
PLATE {
16-
string Metadata_Source "Data-generating center ID"
17-
string Metadata_Batch "Batch ID"
18-
string Metadata_Plate "Plate ID"
19-
string Metadata_PlateType "One of: TARGET1, TARGET2, POSCON8, DMSO, ORF, COMPOUND, COMPOUND_EMPTY"
21+
string Metadata_Plate PK
22+
string Metadata_Source FK
23+
string Metadata_Batch
24+
string Metadata_PlateType
2025
}
21-
WELL }o--o| COMPOUND : ""
26+
27+
PERTURBATION ||--o{ COMPOUND : "if compound"
28+
PERTURBATION ||--o{ ORF : "if ORF"
29+
PERTURBATION ||--o{ CRISPR : "if CRISPR"
30+
PERTURBATION {
31+
string Metadata_JCP2022 PK
32+
string Metadata_perturbation_modality "compound/orf/crispr/unknown"
33+
}
34+
35+
PERTURBATION-CONTROL }o--|| PERTURBATION : "describes"
36+
PERTURBATION-CONTROL {
37+
string Metadata_JCP2022 PK,FK
38+
string Metadata_pert_type "poscon/negcon/empty"
39+
string Metadata_Name "Human-readable name"
40+
}
41+
42+
COMPOUND ||--o{ COMPOUND-SOURCE : "sourced from"
2243
COMPOUND {
23-
string Metadata_JCP2022 PK "JUMP perturbation ID"
24-
string Metadata_InChI "International Chemical ID"
25-
string Metadata_InChIKey "Hashed InChI"
26-
string Metadata_SMILES "SMILES"
44+
string Metadata_JCP2022 PK
45+
string Metadata_InChIKey
46+
string Metadata_SMILES
2747
}
28-
COMPOUND ||--|| COMPOUND_SOURCE : ""
29-
COMPOUND_SOURCE {
30-
string Metadata_JCP2022 "JUMP perturbation ID"
31-
string Metadata_Compound_Source "Compound-nominating centerID"
48+
49+
COMPOUND-SOURCE {
50+
string Metadata_JCP2022 PK,FK
51+
string Metadata_Compound_Source PK
3252
}
33-
WELL }o--o| ORF : ""
53+
3454
ORF {
35-
string Metadata_JCP2022 PK "JUMP perturbation ID"
36-
string Metadata_broad_sample "Broad perturbation ID"
37-
string Metadata_Name "Internal perturbation ID"
38-
string Metadata_Vector "ORF expression vector"
39-
float Metadata_Prot_Match "% match to protein sequence"
40-
int Metadata_Insert_Length "ORF sequence length"
41-
string Metadata_Taxon_ID "NCBI taxonomy ID"
42-
string Metadata_Symbol "NCBI gene symbol"
43-
string Metadata_NCBI_Gene_ID "NCBI gene ID"
44-
string Metadata_Transcript "NCBI reference sequence"
45-
string Metadata_Gene_Description "NCBI gene definition"
46-
string Metadata_pert_type "One of: trt, poscon, negcon"
55+
string Metadata_JCP2022 PK
56+
string Metadata_Symbol "Gene symbol"
57+
string Metadata_NCBI_Gene_ID
4758
}
48-
WELL }o--o| CRISPR : ""
59+
4960
CRISPR {
50-
string Metadata_JCP2022 PK "JUMP perturbation ID"
51-
string Metadata_Symbol "NCBI gene symbol"
52-
string Metadata_NCBI_Gene_ID "NCBI gene ID"
61+
string Metadata_JCP2022 PK
62+
string Metadata_Symbol "Gene symbol"
63+
string Metadata_NCBI_Gene_ID
5364
}
54-
PLATE }|--|| MICROSCOPE-CONFIG : ""
65+
66+
MICROSCOPE-CONFIG }o--|| MICROSCOPE-FILTER : "uses"
5567
MICROSCOPE-CONFIG {
56-
string Metadata_Source "Data-generating center ID"
57-
string Metadata_Microscope_Name "Microscope model name"
58-
string Metadata_Widefield_vs_Confocal "One of: Widefield, Confocal"
59-
string Metadata_Excitation_Type "One of: Laser, LED"
60-
float Metadata_Objective_NA "Objective numerical aperture"
61-
int Metadata_N_Brightfield_Planes_Min "Min number of brightfield planes taken"
62-
int Metadata_N_Brightfield_Planes_Max "Max number of brightfield planes taken"
63-
int Metadata_Distance_Between_Z_Microns "Distance between Z planes in um (only if > 1um)"
64-
int Metadata_Sites_Per_Well "Number of sites per well"
65-
string Metadata_Filter_Configuration "Filter configuration ID"
66-
float Metadata_Pixel_Size_Microns "Pixel size in microns"
68+
string Metadata_Source PK
69+
string Metadata_Microscope_Name
70+
string Metadata_Filter_Configuration FK
6771
}
68-
MICROSCOPE-FILTER ||--|{ MICROSCOPE-CONFIG : ""
72+
6973
MICROSCOPE-FILTER {
70-
string Metadata_Filter_Configuration "Filter configuration ID"
71-
float Metadata_Excitation_Low_DNA "Excitation wavelength min, DNA channel"
72-
float Metadata_Excitation_Low_ER "Excitation wavelength min, ER channel"
73-
float Metadata_Excitation_Low_RNA "Excitation wavelength min, RNA channel"
74-
float Metadata_Excitation_Low_AGP "Excitation wavelength min, AGP channel"
75-
float Metadata_Excitation_Low_Mito "Excitation wavelength min, Mito channel"
76-
float Metadata_Excitation_High_DNA "Excitation wavelength max, DNA channel"
77-
float Metadata_Excitation_High_ER "Excitation wavelength max, ER channel"
78-
float Metadata_Excitation_High_RNA "Excitation wavelength max, RNA channel"
79-
float Metadata_Excitation_High_AGP "Excitation wavelength max, AGP channel"
80-
float Metadata_Excitation_High_Mito "Excitation wavelength max, Mito channel"
81-
float Metadata_Emission_Low_DNA "Emission wavelength min, DNA channel"
82-
float Metadata_Emission_Low_ER "Emission wavelength min, ER channel"
83-
float Metadata_Emission_Low_RNA "Emission wavelength min, RNA channel"
84-
float Metadata_Emission_Low_AGP "Emission wavelength min, AGP channel"
85-
float Metadata_Emission_Low_Mito "Emission wavelength min, Mito channel"
86-
float Metadata_Emission_High_DNA "Emission wavelength max, DNA channel"
87-
float Metadata_Emission_High_ER "Emission wavelength max, ER channel"
88-
float Metadata_Emission_High_RNA "Emission wavelength max, RNA channel"
89-
float Metadata_Emission_High_AGP "Emission wavelength max, AGP channel"
90-
float Metadata_Emission_High_Mito "Emission wavelength max, Mito channel"
91-
string Metadata_FPBase_Config "Fluorescence spectra config URL"
74+
string Metadata_Filter_Configuration PK
75+
string wavelength_configs "DNA/ER/RNA/AGP/Mito channels"
9276
}
93-
PLATE }|--|| CELLPROFILER-VERSION : ""
77+
9478
CELLPROFILER-VERSION {
95-
string Metadata_Source "Data-generating center ID"
96-
string Metadata_CellProfiler_Version "CellProfiler Version"
79+
string Metadata_Source PK
80+
string Metadata_CellProfiler_Version
9781
}
9882
```
83+
84+
### Schema Notes
85+
86+
- **Simplified Overview:** This diagram shows key columns only (e.g., `COMPOUND` has an additional `Metadata_InChI` column). See `db/setup.sql` for complete definitions.
87+
- The `PERTURBATION` table is created during database setup by combining all compound, ORF, and CRISPR IDs (no separate CSV file).
88+
- `PERTURBATION_CONTROL` defines which perturbations are controls (negcon/poscon/empty) and provides human-readable names (e.g., "DMSO" → `JCP2022_033924`). Note: The ORF table also has a legacy `Metadata_pert_type` column, but `PERTURBATION_CONTROL` is now the canonical source for all control designations across compound, ORF, and CRISPR modalities.
89+
90+
## Database Setup
91+
92+
To create a queryable [DuckDB](https://duckdb.org/docs/installation/) database from these CSV files:
93+
94+
```bash
95+
rm -rf db/jump_metadata.duckdb && duckdb db/jump_metadata.duckdb < db/setup.sql
96+
```
97+
98+
This creates a database with:
99+
100+
- Explicit schema with primary and foreign key constraints
101+
- All CSV data imported as tables with data validation
102+
- Documentation for all tables and columns embedded in the schema
103+
104+
## Querying the Database
105+
106+
```bash
107+
# Interactive mode
108+
duckdb db/jump_metadata.duckdb
109+
110+
# UI; available by default for DuckDB versions >= v1.2.1
111+
# https://duckdb.org/docs/stable/core_extensions/ui.html
112+
duckdb -ui db/jump_metadata.duckdb
113+
```
114+
115+
## Schema Documentation
116+
117+
Full schema documentation is embedded in the database. To view:
118+
119+
```sql
120+
-- List all tables with descriptions
121+
SELECT table_name, comment FROM duckdb_tables();
122+
123+
-- View column descriptions
124+
SELECT table_name, column_name, comment
125+
FROM duckdb_columns()
126+
WHERE comment IS NOT NULL;
127+
128+
-- View all foreign key relationships
129+
SELECT table_name, constraint_text
130+
FROM duckdb_constraints()
131+
WHERE constraint_type = 'FOREIGN KEY';
132+
```
133+
134+
## For Maintainers: Schema Changes
135+
136+
When adding or modifying tables:
137+
138+
1. **Add data file**: Use `.csv` for small tables (<1MB) or `.csv.gz` for larger ones
139+
2. **Update `db/setup.sql`**:
140+
- Define table with PRIMARY KEY and FOREIGN KEY constraints
141+
- Add COMMENT statements for table and columns
142+
- Update the import section with correct file extension
143+
3. **Test**: `rm -rf db/jump_metadata.duckdb && duckdb db/jump_metadata.duckdb < db/setup.sql`
144+
4. **Update diagram**: Add table to Mermaid diagram above with minimal columns (PKs, FKs, 1-2 key fields)

metadata/cellprofiler_version.csv

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
Metadata_Source,Metadata_CellProfiler_Version
2-
1,4.1.3
3-
2,4.0.7
4-
3,4.2.1
5-
4,4.1.3
6-
5,4.1.3
7-
6,4.1.3
8-
7,4.2.1
9-
8,4.1.3
10-
9,4.1.3
11-
10,4.1.3
12-
11,4.1.3
13-
13,4.2.1
14-
15,4.1.3
2+
source_1,4.1.3
3+
source_2,4.0.7
4+
source_3,4.2.1
5+
source_4,4.1.3
6+
source_5,4.1.3
7+
source_6,4.1.3
8+
source_7,4.2.1
9+
source_8,4.1.3
10+
source_9,4.1.3
11+
source_10,4.1.3
12+
source_11,4.1.3
13+
source_13,4.2.1
14+
source_15,4.1.3

0 commit comments

Comments
 (0)