|
1 | 1 | # Metadata |
2 | 2 |
|
3 | | -The metadata [schema](https://mermaid-js.github.io/mermaid/#/entityRelationshipDiagram?id=entity-relationship-diagrams) is shown below. |
| 3 | +This directory contains metadata describing the experiments (plates, wells, perturbations, and imaging/analysis configuration) for the JUMP Cell Painting Consortium datasets. |
4 | 4 |
|
| 5 | +## Schema Overview |
5 | 6 |
|
6 | 7 | ```mermaid |
7 | 8 | erDiagram |
8 | | - WELL }|--|| PLATE : "" |
| 9 | + WELL }|--|| PLATE : "belongs to" |
| 10 | + WELL }o--|| PERTURBATION : "treated with" |
9 | 11 | WELL { |
10 | | - string Metadata_Source "Data-generating center ID" |
11 | | - string Metadata_Plate "Plate ID" |
12 | | - string Metadata_Well "Well position" |
13 | | - string Metadata_JCP2022 "JUMP Perturbation ID" |
| 12 | + string Metadata_Plate PK,FK |
| 13 | + string Metadata_Well PK |
| 14 | + string Metadata_JCP2022 FK |
| 15 | + string Metadata_Source |
14 | 16 | } |
| 17 | + |
| 18 | + PLATE }|--|| MICROSCOPE-CONFIG : "imaged with" |
| 19 | + PLATE }|--|| CELLPROFILER-VERSION : "analyzed with" |
15 | 20 | PLATE { |
16 | | - string Metadata_Source "Data-generating center ID" |
17 | | - string Metadata_Batch "Batch ID" |
18 | | - string Metadata_Plate "Plate ID" |
19 | | - string Metadata_PlateType "One of: TARGET1, TARGET2, POSCON8, DMSO, ORF, COMPOUND, COMPOUND_EMPTY" |
| 21 | + string Metadata_Plate PK |
| 22 | + string Metadata_Source FK |
| 23 | + string Metadata_Batch |
| 24 | + string Metadata_PlateType |
20 | 25 | } |
21 | | - WELL }o--o| COMPOUND : "" |
| 26 | + |
| 27 | + PERTURBATION ||--o| COMPOUND : "if compound" |
| 28 | + PERTURBATION ||--o| ORF : "if ORF" |
| 29 | + PERTURBATION ||--o| CRISPR : "if CRISPR" |
| 30 | + PERTURBATION { |
| 31 | + string Metadata_JCP2022 PK |
| 32 | + string Metadata_perturbation_modality "compound/orf/crispr/unknown" |
| 33 | + } |
| 34 | + |
| 35 | + PERTURBATION-CONTROL |o--|| PERTURBATION : "describes" |
| 36 | + PERTURBATION-CONTROL { |
| 37 | + string Metadata_JCP2022 PK,FK |
| 38 | + string Metadata_pert_type "poscon/negcon/empty" |
| 39 | + string Metadata_Name "Human-readable name" |
| 40 | + } |
| 41 | + |
| 42 | + COMPOUND ||--o{ COMPOUND-SOURCE : "sourced from" |
22 | 43 | COMPOUND { |
23 | | - string Metadata_JCP2022 PK "JUMP perturbation ID" |
24 | | - string Metadata_InChI "International Chemical ID" |
25 | | - string Metadata_InChIKey "Hashed InChI" |
26 | | - string Metadata_SMILES "SMILES" |
| 44 | + string Metadata_JCP2022 PK |
| 45 | + string Metadata_InChIKey |
| 46 | + string Metadata_SMILES |
27 | 47 | } |
28 | | - COMPOUND ||--|| COMPOUND_SOURCE : "" |
29 | | - COMPOUND_SOURCE { |
30 | | - string Metadata_JCP2022 "JUMP perturbation ID" |
31 | | - string Metadata_Compound_Source "Compound-nominating centerID" |
| 48 | + |
| 49 | + COMPOUND-SOURCE { |
| 50 | + string Metadata_JCP2022 PK,FK |
| 51 | + string Metadata_Compound_Source PK |
32 | 52 | } |
33 | | - WELL }o--o| ORF : "" |
| 53 | + |
34 | 54 | ORF { |
35 | | - string Metadata_JCP2022 PK "JUMP perturbation ID" |
36 | | - string Metadata_broad_sample "Broad perturbation ID" |
37 | | - string Metadata_Name "Internal perturbation ID" |
38 | | - string Metadata_Vector "ORF expression vector" |
39 | | - float Metadata_Prot_Match "% match to protein sequence" |
40 | | - int Metadata_Insert_Length "ORF sequence length" |
41 | | - string Metadata_Taxon_ID "NCBI taxonomy ID" |
42 | | - string Metadata_Symbol "NCBI gene symbol" |
43 | | - string Metadata_NCBI_Gene_ID "NCBI gene ID" |
44 | | - string Metadata_Transcript "NCBI reference sequence" |
45 | | - string Metadata_Gene_Description "NCBI gene definition" |
46 | | - string Metadata_pert_type "One of: trt, poscon, negcon" |
| 55 | + string Metadata_JCP2022 PK |
| 56 | + string Metadata_Symbol "Gene symbol" |
| 57 | + string Metadata_NCBI_Gene_ID |
47 | 58 | } |
48 | | - WELL }o--o| CRISPR : "" |
| 59 | + |
49 | 60 | CRISPR { |
50 | | - string Metadata_JCP2022 PK "JUMP perturbation ID" |
51 | | - string Metadata_Symbol "NCBI gene symbol" |
52 | | - string Metadata_NCBI_Gene_ID "NCBI gene ID" |
| 61 | + string Metadata_JCP2022 PK |
| 62 | + string Metadata_Symbol "Gene symbol" |
| 63 | + string Metadata_NCBI_Gene_ID |
53 | 64 | } |
54 | | - PLATE }|--|| MICROSCOPE-CONFIG : "" |
| 65 | + |
| 66 | + MICROSCOPE-CONFIG }o--|| MICROSCOPE-FILTER : "uses" |
55 | 67 | MICROSCOPE-CONFIG { |
56 | | - string Metadata_Source "Data-generating center ID" |
57 | | - string Metadata_Microscope_Name "Microscope model name" |
58 | | - string Metadata_Widefield_vs_Confocal "One of: Widefield, Confocal" |
59 | | - string Metadata_Excitation_Type "One of: Laser, LED" |
60 | | - float Metadata_Objective_NA "Objective numerical aperture" |
61 | | - int Metadata_N_Brightfield_Planes_Min "Min number of brightfield planes taken" |
62 | | - int Metadata_N_Brightfield_Planes_Max "Max number of brightfield planes taken" |
63 | | - int Metadata_Distance_Between_Z_Microns "Distance between Z planes in um (only if > 1um)" |
64 | | - int Metadata_Sites_Per_Well "Number of sites per well" |
65 | | - string Metadata_Filter_Configuration "Filter configuration ID" |
66 | | - float Metadata_Pixel_Size_Microns "Pixel size in microns" |
| 68 | + string Metadata_Source PK |
| 69 | + string Metadata_Microscope_Name |
| 70 | + string Metadata_Filter_Configuration FK |
67 | 71 | } |
68 | | - MICROSCOPE-FILTER ||--|{ MICROSCOPE-CONFIG : "" |
| 72 | + |
69 | 73 | MICROSCOPE-FILTER { |
70 | | - string Metadata_Filter_Configuration "Filter configuration ID" |
71 | | - float Metadata_Excitation_Low_DNA "Excitation wavelength min, DNA channel" |
72 | | - float Metadata_Excitation_Low_ER "Excitation wavelength min, ER channel" |
73 | | - float Metadata_Excitation_Low_RNA "Excitation wavelength min, RNA channel" |
74 | | - float Metadata_Excitation_Low_AGP "Excitation wavelength min, AGP channel" |
75 | | - float Metadata_Excitation_Low_Mito "Excitation wavelength min, Mito channel" |
76 | | - float Metadata_Excitation_High_DNA "Excitation wavelength max, DNA channel" |
77 | | - float Metadata_Excitation_High_ER "Excitation wavelength max, ER channel" |
78 | | - float Metadata_Excitation_High_RNA "Excitation wavelength max, RNA channel" |
79 | | - float Metadata_Excitation_High_AGP "Excitation wavelength max, AGP channel" |
80 | | - float Metadata_Excitation_High_Mito "Excitation wavelength max, Mito channel" |
81 | | - float Metadata_Emission_Low_DNA "Emission wavelength min, DNA channel" |
82 | | - float Metadata_Emission_Low_ER "Emission wavelength min, ER channel" |
83 | | - float Metadata_Emission_Low_RNA "Emission wavelength min, RNA channel" |
84 | | - float Metadata_Emission_Low_AGP "Emission wavelength min, AGP channel" |
85 | | - float Metadata_Emission_Low_Mito "Emission wavelength min, Mito channel" |
86 | | - float Metadata_Emission_High_DNA "Emission wavelength max, DNA channel" |
87 | | - float Metadata_Emission_High_ER "Emission wavelength max, ER channel" |
88 | | - float Metadata_Emission_High_RNA "Emission wavelength max, RNA channel" |
89 | | - float Metadata_Emission_High_AGP "Emission wavelength max, AGP channel" |
90 | | - float Metadata_Emission_High_Mito "Emission wavelength max, Mito channel" |
91 | | - string Metadata_FPBase_Config "Fluorescence spectra config URL" |
| 74 | + string Metadata_Filter_Configuration PK |
| 75 | + string wavelength_configs "DNA/ER/RNA/AGP/Mito channels" |
92 | 76 | } |
93 | | - PLATE }|--|| CELLPROFILER-VERSION : "" |
| 77 | + |
94 | 78 | CELLPROFILER-VERSION { |
95 | | - string Metadata_Source "Data-generating center ID" |
96 | | - string Metadata_CellProfiler_Version "CellProfiler Version" |
| 79 | + string Metadata_Source PK |
| 80 | + string Metadata_CellProfiler_Version |
97 | 81 | } |
98 | 82 | ``` |
| 83 | + |
| 84 | +### Schema Notes |
| 85 | + |
| 86 | +- **Simplified Overview:** This diagram shows key columns only (e.g., `COMPOUND` also has a `Metadata_InChI` column that is not shown). See `db/setup.sql` for complete definitions. |
| 87 | +- The `PERTURBATION` table is created during database setup by combining all compound, ORF, and CRISPR IDs (no separate CSV file). |
| 88 | +- `PERTURBATION_CONTROL` (shown as `PERTURBATION-CONTROL` in the diagram) defines which perturbations are controls (negcon/poscon/empty) and provides human-readable names (e.g., "DMSO" → JCP2022_033924). Note: the `ORF` table also has a legacy `Metadata_pert_type` column, but `PERTURBATION_CONTROL` is now the canonical source for all control designations across compound, ORF, and CRISPR modalities. |
| 89 | + |
| 90 | +## Database Setup |
| 91 | + |
| 92 | +To create a queryable [DuckDB](https://duckdb.org/docs/installation/) database from these CSV files: |
| 93 | + |
| 94 | +```bash |
| 95 | +rm -rf db/jump_metadata.duckdb && duckdb db/jump_metadata.duckdb < db/setup.sql |
| 96 | +``` |
| 97 | + |
| 98 | +This creates a database with: |
| 99 | + |
| 100 | +- Explicit schema with primary and foreign key constraints |
| 101 | +- All CSV data imported as tables with data validation |
| 102 | +- Documentation for all tables and columns embedded in the schema |
| 103 | + |
| 104 | +## Querying the Database |
| 105 | + |
| 106 | +```bash |
| 107 | +# Interactive mode |
| 108 | +duckdb db/jump_metadata.duckdb |
| 109 | + |
| 110 | +# UI; available by default for DuckDB versions >= v1.2.1 |
| 111 | +# https://duckdb.org/docs/stable/core_extensions/ui.html |
| 112 | +duckdb -ui db/jump_metadata.duckdb |
| 113 | +``` |
| 114 | + |
| 115 | +## Schema Documentation |
| 116 | + |
| 117 | +Full schema documentation is embedded in the database. To view: |
| 118 | + |
| 119 | +```sql |
| 120 | +-- List all tables with descriptions |
| 121 | +SELECT table_name, comment FROM duckdb_tables(); |
| 122 | + |
| 123 | +-- View column descriptions |
| 124 | +SELECT table_name, column_name, comment |
| 125 | +FROM duckdb_columns() |
| 126 | +WHERE comment IS NOT NULL; |
| 127 | + |
| 128 | +-- View all foreign key relationships |
| 129 | +SELECT table_name, constraint_text |
| 130 | +FROM duckdb_constraints() |
| 131 | +WHERE constraint_type = 'FOREIGN KEY'; |
| 132 | +``` |
| 133 | + |
| 134 | +## For Maintainers: Schema Changes |
| 135 | + |
| 136 | +When adding or modifying tables: |
| 137 | + |
| 138 | +1. **Add data file**: Use `.csv` for small tables (<1MB) or `.csv.gz` for larger ones |
| 139 | +2. **Update `db/setup.sql`**: |
| 140 | + - Define table with PRIMARY KEY and FOREIGN KEY constraints |
| 141 | + - Add COMMENT statements for table and columns |
| 142 | + - Update the import section with correct file extension |
| 143 | +3. **Test**: `rm -rf db/jump_metadata.duckdb && duckdb db/jump_metadata.duckdb < db/setup.sql` |
| 144 | +4. **Update diagram**: Add table to Mermaid diagram above with minimal columns (PKs, FKs, 1-2 key fields) |
0 commit comments