Skip to content

Commit 99c34b6

Browse files
authored
Merge pull request #166 from jump-cellpainting/sqlite-export
feat: Add SQLite export script and documentation
2 parents 1819876 + 041624c commit 99c34b6

File tree

8 files changed

+98
-24
lines changed

8 files changed

+98
-24
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,3 +135,4 @@ dmypy.json
135135
CLAUDE.md
136136
.mcp.json
137137
*.duckdb
138+
*.sqlite

.markdownlint.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Markdown style configuration
2+
MD007:
3+
indent: 4 # List indent
4+
MD013: false # Line length
5+
MD024: false # Multiple headers with same content
6+
MD029:
7+
style: ordered # Ordered list style
8+
MD033: false # Inline HTML
9+
MD046: false # Code block style

.pre-commit-config.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
repos:
2+
- repo: https://github.com/pre-commit/pre-commit-hooks
3+
rev: v5.0.0
4+
hooks:
5+
- id: trailing-whitespace
6+
exclude: \.dvc$|dvc\.lock|\.local\.json$
7+
- id: check-added-large-files
8+
args: [--maxkb=10240]
9+
- id: check-yaml
10+
- id: end-of-file-fixer
11+
exclude: \.dvc$|dvc\.lock|\.local\.json$
12+
13+
- repo: https://github.com/astral-sh/ruff-pre-commit
14+
rev: v0.9.1
15+
hooks:
16+
- id: ruff
17+
args: [--fix]
18+
- id: ruff-format

.zenodo.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
"creators": [
33
{
44
"name": "The JUMP Cell Painting Consortium"
5-
5+
66
}
77
],
88
"title": "The Joint Undertaking for Morphological Profiling (JUMP) Consortium Datasets",
9-
"upload_type": "dataset",
10-
"access_right": "open",
9+
"upload_type": "dataset",
10+
"access_right": "open",
1111
"related_identifiers": [
1212
{
1313
"scheme": "doi",
@@ -21,6 +21,6 @@
2121
"relation": "describes",
2222
"resource_type": "publication-preprint"
2323
}
24-
24+
2525
]
2626
}

LICENCE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,4 @@ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
2626
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
2727
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
2828
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

metadata/README.md

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ erDiagram
1414
string Metadata_JCP2022 FK
1515
string Metadata_Source
1616
}
17-
17+
1818
PLATE }|--|| MICROSCOPE-CONFIG : "imaged with"
1919
PLATE }|--|| CELLPROFILER-VERSION : "analyzed with"
2020
PLATE {
@@ -23,58 +23,58 @@ erDiagram
2323
string Metadata_Batch
2424
string Metadata_PlateType
2525
}
26-
26+
2727
PERTURBATION ||--o{ COMPOUND : "if compound"
2828
PERTURBATION ||--o{ ORF : "if ORF"
2929
PERTURBATION ||--o{ CRISPR : "if CRISPR"
3030
PERTURBATION {
3131
string Metadata_JCP2022 PK
3232
string Metadata_perturbation_modality "compound/orf/crispr/unknown"
3333
}
34-
34+
3535
PERTURBATION-CONTROL }o--|| PERTURBATION : "describes"
3636
PERTURBATION-CONTROL {
3737
string Metadata_JCP2022 PK,FK
3838
string Metadata_pert_type "poscon/negcon/empty"
3939
string Metadata_Name "Human-readable name"
4040
}
41-
41+
4242
COMPOUND ||--o{ COMPOUND-SOURCE : "sourced from"
4343
COMPOUND {
4444
string Metadata_JCP2022 PK
4545
string Metadata_InChIKey
4646
string Metadata_SMILES
4747
}
48-
48+
4949
COMPOUND-SOURCE {
5050
string Metadata_JCP2022 PK,FK
5151
string Metadata_Compound_Source PK
5252
}
53-
53+
5454
ORF {
5555
string Metadata_JCP2022 PK
5656
string Metadata_Symbol "Gene symbol"
5757
string Metadata_NCBI_Gene_ID
5858
}
59-
59+
6060
CRISPR {
6161
string Metadata_JCP2022 PK
6262
string Metadata_Symbol "Gene symbol"
6363
string Metadata_NCBI_Gene_ID
6464
}
65-
65+
6666
MICROSCOPE-CONFIG }o--|| MICROSCOPE-FILTER : "uses"
6767
MICROSCOPE-CONFIG {
6868
string Metadata_Source PK
6969
string Metadata_Microscope_Name
7070
string Metadata_Filter_Configuration FK
7171
}
72-
72+
7373
MICROSCOPE-FILTER {
7474
string Metadata_Filter_Configuration PK
7575
string wavelength_configs "DNA/ER/RNA/AGP/Mito channels"
7676
}
77-
77+
7878
CELLPROFILER-VERSION {
7979
string Metadata_Source PK
8080
string Metadata_CellProfiler_Version
@@ -101,6 +101,14 @@ This creates a database with:
101101
- All CSV data imported as tables with data validation
102102
- Documentation for all tables and columns embedded in the schema
103103

104+
## SQLite Export
105+
106+
To export the DuckDB database to SQLite format:
107+
108+
```bash
109+
bash db/export_sqlite.sh
110+
```
111+
104112
## Querying the Database
105113

106114
```bash
@@ -121,13 +129,13 @@ Full schema documentation is embedded in the database. To view:
121129
SELECT table_name, comment FROM duckdb_tables();
122130

123131
-- View column descriptions
124-
SELECT table_name, column_name, comment
125-
FROM duckdb_columns()
132+
SELECT table_name, column_name, comment
133+
FROM duckdb_columns()
126134
WHERE comment IS NOT NULL;
127135

128136
-- View all foreign key relationships
129-
SELECT table_name, constraint_text
130-
FROM duckdb_constraints()
137+
SELECT table_name, constraint_text
138+
FROM duckdb_constraints()
131139
WHERE constraint_type = 'FOREIGN KEY';
132140
```
133141

@@ -136,7 +144,7 @@ WHERE constraint_type = 'FOREIGN KEY';
136144
When adding or modifying tables:
137145

138146
1. **Add data file**: Use `.csv` for small tables (<1MB) or `.csv.gz` for larger ones
139-
2. **Update `db/setup.sql`**:
147+
2. **Update `db/setup.sql`**:
140148
- Define table with PRIMARY KEY and FOREIGN KEY constraints
141149
- Add COMMENT statements for table and columns
142150
- Update the import section with correct file extension

metadata/db/export_sqlite.sh

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/bin/bash
# Export the DuckDB metadata database to a SQLite file.
#
# Usage (run from the directory that contains db/):
#   bash db/export_sqlite.sh
#
# The SQLite file is derived from the DuckDB database instead of being built by
# a second, hand-maintained setup script; DuckDB remains the primary way to
# interact with the data.
#
# Notes:
#   - Each table is copied with CREATE TABLE ... AS SELECT, which carries over
#     column names, column types and rows only. Primary/foreign key
#     constraints, indexes and COMMENTs defined in db/setup.sql are NOT
#     reproduced in the SQLite file.
#   - The export is written to a temporary file and moved into place only after
#     DuckDB finishes successfully, so the script can be re-run safely and a
#     failed run never leaves a partial file at the final path.
#   - ATTACH ... (TYPE SQLITE) needs DuckDB's sqlite extension; recent DuckDB
#     releases are expected to load it automatically (verify for your version).

set -euo pipefail

DUCKDB_FILE="db/jump_metadata.duckdb"
SQLITE_FILE="db/jump_metadata.sqlite"
# Scratch output; $$ (the shell PID) keeps concurrent runs from colliding.
TMP_SQLITE_FILE="${SQLITE_FILE}.tmp.$$"

# Tables to export, in the order they are created in the SQLite file.
TABLES=(
    microscope_filter
    microscope_config
    cellprofiler_version
    compound
    orf
    crispr
    perturbation
    perturbation_control
    plate
    well
    compound_source
)

if ! command -v duckdb >/dev/null 2>&1; then
    echo "Error: duckdb command not found on PATH." >&2
    exit 1
fi

if [[ ! -f "$DUCKDB_FILE" ]]; then
    echo "Error: DuckDB file $DUCKDB_FILE not found. Run setup first:" >&2
    echo " duckdb $DUCKDB_FILE < db/setup.sql" >&2
    exit 1
fi

# Always remove the scratch file, whether the export succeeds or fails.
trap 'rm -f "$TMP_SQLITE_FILE"' EXIT

echo "Exporting DuckDB to SQLite..."

# Build one SQL script: attach the (new) SQLite file, then copy every table.
EXPORT_SQL="ATTACH '$TMP_SQLITE_FILE' AS sqlite_db (TYPE SQLITE);"
for table in "${TABLES[@]}"; do
    EXPORT_SQL+=$'\n'"CREATE TABLE sqlite_db.$table AS SELECT * FROM $table;"
done

duckdb "$DUCKDB_FILE" -c "$EXPORT_SQL"

# Replace any previous export only once the new one is complete.
mv -f "$TMP_SQLITE_FILE" "$SQLITE_FILE"

echo "SQLite database created: $SQLITE_FILE"

metadata/db/setup.sql

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ INSERT INTO orf SELECT * FROM read_csv_auto('orf.csv.gz');
236236
INSERT INTO crispr SELECT * FROM read_csv_auto('crispr.csv.gz');
237237

238238
-- Populate the perturbation union table
239-
INSERT INTO perturbation
239+
INSERT INTO perturbation
240240
SELECT Metadata_JCP2022, 'compound' FROM compound
241241
UNION ALL
242242
SELECT Metadata_JCP2022, 'orf' FROM orf
@@ -248,14 +248,14 @@ INSERT INTO perturbation VALUES ('JCP2022_UNKNOWN', 'unknown');
248248

249249
-- Load perturbation control information
250250
INSERT INTO perturbation_control (Metadata_JCP2022, Metadata_pert_type, Metadata_Name)
251-
SELECT Metadata_JCP2022, Metadata_pert_type, Metadata_Name
251+
SELECT Metadata_JCP2022, Metadata_pert_type, Metadata_Name
252252
FROM read_csv_auto('perturbation_control.csv');
253253

254254
-- Then load tables with foreign keys
255255
INSERT INTO plate SELECT * FROM read_csv_auto('plate.csv.gz');
256256

257257
-- Only insert compound_source records that have matching compounds (deduplicated)
258-
INSERT INTO compound_source
258+
INSERT INTO compound_source
259259
SELECT DISTINCT cs.* FROM read_csv_auto('compound_source.csv.gz') cs
260260
WHERE cs.Metadata_JCP2022 IN (SELECT Metadata_JCP2022 FROM compound);
261261

@@ -267,4 +267,4 @@ INSERT INTO well SELECT * FROM read_csv_auto('well.csv.gz');
267267
-- Create additional indexes for query performance
268268
-- (Primary keys already create indexes automatically)
269269

270-
CREATE INDEX idx_well_jcp ON well(Metadata_JCP2022);
270+
CREATE INDEX idx_well_jcp ON well(Metadata_JCP2022);

0 commit comments

Comments
 (0)