Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
9d0c8c6
Initial update for basic compatibility with python 3.12. Tried to up…
melissacline Oct 18, 2025
15e8c09
Updated for basic compatibility with python 3.12
melissacline Oct 18, 2025
dada2a0
Updated Dockerfile to use Python 3.12
melissacline Dec 29, 2025
bbd47f5
Enhance Python 3.12 compatibility and branch checkout support
melissacline Dec 30, 2025
87c83fe
Fix Docker build failure with grep patterns for numpy and cython
melissacline Dec 30, 2025
69ffb4a
Update test-requirements.txt for Python 3.12 compatibility
melissacline Dec 30, 2025
584105f
Update path from brca-exchange to brca-exchange-kb in run_luigi.sh
melissacline Dec 30, 2025
f4826d3
Add fallback download for LOVD data in setup-lovd target
melissacline Dec 30, 2025
5f4e83a
Fix UTA_PG_PW template variable for PostgreSQL password
melissacline Dec 30, 2025
60e6d04
Fix typo in seqrepo script filename
melissacline Dec 31, 2025
742a40b
Fix six module dependency issue for Python 3.12
melissacline Dec 31, 2025
63b8504
Update dependencies for Python 3.12 compatibility
melissacline Dec 31, 2025
a4ae1f9
Update coloredlogs to fix dependency conflict with biocommons.seqrepo
melissacline Dec 31, 2025
eaaae77
Update tqdm to fix dependency conflict with biocommons.seqrepo
melissacline Dec 31, 2025
9249077
Update humanfriendly to fix dependency conflict with coloredlogs
melissacline Dec 31, 2025
95aa24b
Update pipeline configuration template for latest dependencies
melissacline Jan 5, 2026
5b4d279
Merge branch 'master' into mc_pipeline/sync_brca_exchange_pipeline
melissacline Jan 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 16 additions & 15 deletions pipeline/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ LUIGI_WORKERS = 8
SEQ_REPO_DIR_DOCKER := /files/resources/seq_repo

ifndef GENE_CONFIG_FILENAME
GENE_CONFIG_FILE = /opt/brca-exchange/pipeline/workflow/gene_config_brca_only.txt
GENE_CONFIG_FILE = /opt/brca-exchange-kb/pipeline/workflow/gene_config_brca_only.txt
else
GENE_CONFIG_FILE = /opt/brca-exchange/pipeline/workflow/$(GENE_CONFIG_FILENAME)
GENE_CONFIG_FILE = /opt/brca-exchange-kb/pipeline/workflow/$(GENE_CONFIG_FILENAME)
endif

init: ## setup config file
Expand Down Expand Up @@ -52,7 +52,7 @@ start-local-uta: ## starting local uta docker container

start-seqrepo-rest-service:
[ `docker ps -f name="seqrepo-rest-service" | wc -l` -gt 1 ] || \
utilities/lauch_seqrepo.sh
utilities/launch_seqrepo.sh

.ONESHELL:
setup-files: ## setup various directories to run pipeline
Expand All @@ -67,7 +67,8 @@ setup-files: ## setup various directories to run pipeline
setup-lovd: ## setting up LOVD data via pipeline machine, if not running on pipeline machine, as the access is IP restricted. Requires access to main pipeline machine
if [ `hostname -s` != "brcaexchange-dev" ]; then
mkdir -p $(OUT_DIR)/LOVD
ssh brca@brcaexchange-dev.gi.ucsc.edu curl https://databases.lovd.nl/shared/export/BRCA > $(OUT_DIR)/LOVD/BRCA.txt
ssh brca@brcaexchange-dev.gi.ucsc.edu curl https://databases.lovd.nl/shared/export/BRCA > $(OUT_DIR)/LOVD/BRCA.txt || \
wget https://brcaexchange.org/backend/downloads/BRCA_from_LOVD.2025_11_11.txt -O $(OUT_DIR)/LOVD/BRCA.txt
fi


Expand All @@ -81,15 +82,15 @@ COMMON_DOCKER_ARGS = --rm -u `id -u ${USER}`:$(DOCKER_GRP) \
-e "DATA_DATE=$(DATA_DATE)" \
-e "UTA_DB_URL=$(UTA_DB_URL)" \
-e "HGVS_SEQREPO_DIR=$(SEQ_REPO_DIR_DOCKER)/latest" \
-e "PYTHONPATH=/opt/brca-exchange/pipeline" \
-e "PYTHONPATH=/opt/brca-exchange-kb/pipeline" \
--network host \
-v $(RESOURCES_DIR):/files/resources \
-v $(SEQ_REPO_DIR):$(SEQ_REPO_DIR_DOCKER) \
-v $(OUT_DIR):/files/data \
-v $(CREDENTIALS_PATH):/opt/luigi_pipeline_credentials.cfg \
-v $(PREVIOUS_RELEASE_PATH):/files/previous_release.tar.gz \
-v $(RELEASE_NOTES_PATH):/files/release_notes.txt \
-v $(CODE_BASE):/opt/brca-exchange \
-v $(CODE_BASE):/opt/brca-exchange-kb \
-v /var/run/docker.sock:/var/run/docker.sock

download-seqrepo: ## Download seq repo data (fyi, seqrepo pull will skip downloading the latest release if we already have it)
Expand All @@ -99,14 +100,14 @@ download-seqrepo: ## Download seq repo data (fyi, seqrepo pull will skip downloa
grep 'most recent seqrepo instance is' | rev | cut -d ' ' -f 1 | rev > $(OUT_DIR)/seqrepo_instance.txt

download-victor-data: ## download data dependency for victor (bayesdel computations). Keeping a separate data version per release. In order to save disk space, e.g. jdupes could be used to create hardlinks
docker run $(COMMON_DOCKER_ARGS) -v $(VICTOR_DATA_DIR):/mnt $(PIPELINE_IMAGE) bash -c '/opt/brca-exchange/pipeline/data_merging/bayesdel/download_bayesdel_data_dependencies.sh /mnt/$(DATA_DATE)'
docker run $(COMMON_DOCKER_ARGS) -v $(VICTOR_DATA_DIR):/mnt $(PIPELINE_IMAGE) bash -c '/opt/brca-exchange-kb/pipeline/data_merging/bayesdel/download_bayesdel_data_dependencies.sh /mnt/$(DATA_DATE)'

run-pipeline: ## running entire pipeline
TS=`date +%Y%m%d_%H%M%S`
LOG_FILE=$(LOG_DIR)/pipeline_run_$${TS}.log
echo "Log files are in $${LOG_FILE}"

docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) /opt/brca-exchange/pipeline/docker/run_luigi.sh $(PRIORS_REFERENCES) $(OUT_DIR) $(PRIORS_IMAGE) $(VR_IMAGE) $(LUIGI_ROOT_TASK) $(LUIGI_WORKERS) $(GENE_CONFIG_FILE) ${VICTOR_IMAGE} ${VICTOR_DATA_DIR}/STATIC_DATA $(SEQ_REPO_DIR) > $${LOG_FILE} 2>&1
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) /opt/brca-exchange-kb/pipeline/docker/run_luigi.sh $(PRIORS_REFERENCES) $(OUT_DIR) $(PRIORS_IMAGE) $(VR_IMAGE) $(LUIGI_ROOT_TASK) $(LUIGI_WORKERS) $(GENE_CONFIG_FILE) ${VICTOR_IMAGE} ${VICTOR_DATA_DIR}/STATIC_DATA $(SEQ_REPO_DIR) > $${LOG_FILE} 2>&1


run-interactive: ## starting docker container in interactive mode
Expand All @@ -121,7 +122,7 @@ ifeq ($(firstword),$(filter $(firstword), run-task, force-run-task, clean-files-
$(eval $(RUN_ARGS):;@:)
endif

RUN_TASK_CMD := docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) /opt/brca-exchange/pipeline/docker/run_luigi.sh $(PRIORS_REFERENCES) $(OUT_DIR) $(PRIORS_IMAGE) $(VR_IMAGE)
RUN_TASK_CMD := docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) /opt/brca-exchange-kb/pipeline/docker/run_luigi.sh $(PRIORS_REFERENCES) $(OUT_DIR) $(PRIORS_IMAGE) $(VR_IMAGE)

run-task: ## Running a specific task
$(RUN_TASK_CMD) $(RUN_ARGS) $(LUIGI_WORKERS) $(GENE_CONFIG_FILE) ${VICTOR_IMAGE} ${VICTOR_DATA_DIR}/STATIC_DATA $(SEQ_REPO_DIR)
Expand All @@ -130,26 +131,26 @@ PIPELINE_ARGS := --PipelineParams-output-dir /files/data/output --PipelineParams

force-run-task: ## Running a specific task (deleting its dependencies first)
## TODO: should this command use RUN_ARGS command twice?
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange/pipeline/workflow && python prune_output_files.py CompileVCFFiles $(RUN_ARGS) $(RUN_ARGS) $(PIPELINE_ARGS)'
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange-kb/pipeline/workflow && python prune_output_files.py CompileVCFFiles $(RUN_ARGS) $(RUN_ARGS) $(PIPELINE_ARGS)'
$(RUN_TASK_CMD) $(RUN_ARGS) $(LUIGI_WORKERS) $(GENE_CONFIG_FILE) ${VICTOR_IMAGE} ${VICTOR_DATA_DIR}/STATIC_DATA $(SEQ_REPO_DIR)

clean-files-from: ## Delete files generated by tasks on the task path between the given task and $(LUIGI_ROOT_TASK)
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange/pipeline/workflow && python prune_output_files.py CompileVCFFiles $(LUIGI_ROOT_TASK) $(RUN_ARGS) $(PIPELINE_ARGS)'
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange-kb/pipeline/workflow && python prune_output_files.py CompileVCFFiles $(LUIGI_ROOT_TASK) $(RUN_ARGS) $(PIPELINE_ARGS)'

show-luigi-graph: ## print some representation of the luigi compute graph on the console
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange/pipeline/workflow && PYTHONPATH="/opt/brca-exchange/pipeline:/opt/brca-exchange/pipeline/workflow" luigi-deps-tree --module CompileVCFFiles $(PIPELINE_ARGS) $(LUIGI_ROOT_TASK) | sed -E "s/([^{])\{.*\}([^}])/\1/g" '
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange-kb/pipeline/workflow && PYTHONPATH="/opt/brca-exchange-kb/pipeline:/opt/brca-exchange-kb/pipeline/workflow" luigi-deps-tree --module CompileVCFFiles $(PIPELINE_ARGS) $(LUIGI_ROOT_TASK) | sed -E "s/([^{])\{.*\}([^}])/\1/g" '

test: ## Running pipeline unit tests
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange/pipeline/data && bash getdata && cd /opt/brca-exchange/pipeline && pytest --ignore=splicing/'
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange-kb/pipeline/data && bash getdata && cd /opt/brca-exchange-kb/pipeline && pytest --ignore=splicing/'

test-coverage: ## Running pipeline unit tests with coverage information
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange/pipeline/data && bash getdata && cd /opt/brca-exchange/pipeline && pytest --cov --ignore=splicing/ && coverage html --include="/opt/brca-exchange/pipeline/*" --omit="*/test_*"'
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange-kb/pipeline/data && bash getdata && cd /opt/brca-exchange-kb/pipeline && pytest --cov --ignore=splicing/ && coverage html --include="/opt/brca-exchange-kb/pipeline/*" --omit="*/test_*"'


build-release: checkout build-docker setup-files setup-lovd download-resources download-seqrepo start-local-uta start-seqrepo-rest-service run-pipeline variants-by-source ## create new data release

variants-by-source: ## postprocessing: compute statistics for changes with respect to the last release
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) python /opt/brca-exchange/pipeline/utilities/variantsBySource.py -i /files/data/output/release/built_with_change_types.tsv -c true
docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) python /opt/brca-exchange-kb/pipeline/utilities/variantsBySource.py -i /files/data/output/release/built_with_change_types.tsv -c true

prune-release-notes-output: ## postprocessing: removes some files, s.t. luigi tasks can be triggered to include updated release notes into release archive
make clean-files-from GenerateReleaseNotes
Expand Down
22 changes: 11 additions & 11 deletions pipeline/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
FROM python:3.9-bullseye

FROM python:3.12-bookworm
RUN chmod 1777 /tmp

RUN apt-get update && apt-get install -y \
Expand All @@ -11,15 +10,12 @@ RUN apt-get update && apt-get install -y \
libbz2-dev \
liblzma-dev \
pkg-config \
python3 \
python3-pip \
python3-gdbm \
python-lzo \
rsync \
vim \
wget \
zlib1g-dev \
netcat
netcat-openbsd

# Get the Docker binary
RUN curl -fsSL get.docker.com -o get-docker.sh \
Expand All @@ -31,12 +27,16 @@ COPY pipeline/requirements.txt .
COPY test-requirements.txt .

# pip 20.3+ uses strict dependency resolver that causes biocommons/bioutils and hgvs/ipython errors
RUN pip install pip==24.0
# Using pip 24.3.1 for Python 3.12 compatibility
RUN pip install --upgrade pip==24.3.1

# install six first as it's a fundamental dependency for many packages
RUN pip install $(grep "^six" requirements.txt)

# install numpy first to avoid issues with bio python and bx-python (see also https://github.com/LUMC/vep2lovd/issues/1)
RUN pip install $(grep numpy requirements.txt)
RUN pip install $(grep "^numpy" requirements.txt)

RUN pip install $(grep -i cython requirements.txt)
RUN pip install $(grep "^cython" requirements.txt)
RUN pip install -r requirements.txt -r test-requirements.txt

# install vcf tools
Expand Down Expand Up @@ -64,13 +64,13 @@ RUN mkdir -p $res /files/data && chmod -R o+rwx /files
RUN rm -r /root/.cache

ARG FORCE_REBUILD=0
COPY . /opt/brca-exchange
COPY . /opt/brca-exchange-kb

ENV LUIGI_CONFIG_PATH="/opt/luigi_pipeline_credentials.cfg"

ARG IS_GIT_DIRTY="False"
ARG GIT_COMMIT=""
LABEL GitCommit=$GIT_COMMIT IsGitDirty=$IS_GIT_DIRTY

CMD ["/opt/brca-exchange/pipeline/docker/run_luigi.sh"]
CMD ["/opt/brca-exchange-kb/pipeline/docker/run_luigi.sh"]

2 changes: 1 addition & 1 deletion pipeline/docker/reproducibility/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ The pipeline consists of 2 stages where the first one is to normalize the data f
## Reproducing Merging Stage

1. Make sure you have docker installed on your system
2. Copy and run [this script](https://github.com/BRCAChallenge/brca-exchange/blob/master/pipeline/docker/reproducibility/reproduce_merging.sh):
2. Copy and run [this script](https://github.com/BRCAChallenge/brca-exchange-kb/blob/master/pipeline/docker/reproducibility/reproduce_merging.sh):
It will download the necessary release archives and some auxiliary data, and then run the docker image of the merging part.
3. If everything went well, the script will output a path where the newly generated release archive can be found.
6 changes: 3 additions & 3 deletions pipeline/docker/run_luigi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ PREVIOUS_RELEASE_TAR=/files/previous_release.tar.gz

RELEASE_NOTES=/files/release_notes.txt

CODE_MNT=$(mount | grep /opt/brca-exchange)
CODE_MNT=$(mount | grep /opt/brca-exchange-kb)
[ -z "${CODE_MNT}" ] || echo "WARNING: BRCA Code base mounted from host file system"

cd /opt/brca-exchange
cd /opt/brca-exchange-kb

echo "Running brca exchange pipeline:"
echo "Git hash: $(git log | head -n 1)"

cd /opt/brca-exchange/pipeline/workflow
cd /opt/brca-exchange-kb/pipeline/workflow

echo "Attempting to run task ${LUIGI_TASK}"
python -m luigi --logging-conf-file luigi_log_configuration.conf --module CompileVCFFiles ${LUIGI_TASK} \
Expand Down
14 changes: 4 additions & 10 deletions pipeline/pipeline_running/brca_pipeline_cfg.mk.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,6 @@ PREVIOUS_RELEASE_DIR := {{ PREVIOUS_RELEASE_DIR }}
SEQ_REPO_DIR := {{ SEQ_REPO_DIR }}
HGVS_SEQREPO_DIR := {{ SEQ_REPO_DIR }}

{% set VICTOR_DATA_DIR = VICTOR_DATA_DIR|default("/data/victor") %}
VICTOR_DATA_DIR := {{ VICTOR_DATA_DIR }}

{% set VICTOR_IMAGE_DEFAULT_TAG = "0.1" %}
VICTOR_IMAGE := {{ VICTOR_IMAGE | default("brcachallenge/victor:" + VICTOR_IMAGE_DEFAULT_TAG) }}

# path to release notes
RELEASE_NOTES_PATH := {{ RELEASE_NOTES_PATH | default(WORK_DIR + "/release_notes_" + RELEASE_TAG + ".txt") }}

Expand All @@ -33,7 +27,7 @@ OUT_DIR := {{ OUT_DIR | default(WORK_DIR + "/data_out") }}

CODE_BASE := {{ CODE_BASE|default("../../..") }}
GIT_COMMIT := {{ GIT_COMMIT|default("master") }}
BRCA_GIT_REPO := {{ BRCA_GIT_REPO|default("https://github.com/BRCAChallenge/brca-exchange.git") }}
BRCA_GIT_REPO := {{ BRCA_GIT_REPO|default("https://github.com/BRCAChallenge/brca-exchange-kb.git") }}

PIPELINE_IMAGE := {{ PIPELINE_IMAGE | default("brcachallenge/brca-exchange-pipeline:" + RELEASE_TAG) }}
RESOURCES_DIR := {{ RESOURCES_DIR|default(WORK_DIR + "/resources") }}
Expand All @@ -51,16 +45,16 @@ VR_IMAGE := {{ VR_IMAGE | default("brcachallenge/append-vr-ids:" + VR_IMAGE_DEFA

## UTA configurations

{% set UTA_RELEASE_DATE = UTA_RELEASE_DATE|default("20210129b") %}
{% set UTA_RELEASE_DATE = UTA_RELEASE_DATE|default("20241220") %}
UTA_RELEASE_DATE := {{ UTA_RELEASE_DATE }}
UTA_DOCKER_IMAGE := {{ UTA_DOCKER_IMAGE | default("biocommons/uta:uta_" + UTA_RELEASE_DATE) }}
UTA_CONTAINER := {{ UTA_CONTAINER | default("uta_" + UTA_RELEASE_DATE) }}
{% set UTA_PG_PW = brca_2024 %}
{% set UTA_PG_PW = UTA_PG_PW | default("POSTGRES_PASSWORD=uta_password") %}
UTA_PG_PW := {{ UTA_PG_PW }}
{% set UTA_PORT = UTA_PORT | default("50828") %}
UTA_PORT := {{ UTA_PORT }}
UTA_DB_URL := {{ "postgresql://anonymous@localhost:" + UTA_PORT + "/uta/uta_" + UTA_RELEASE_DATE }}
{% set UTA_VOLUME = UTA_VOLUME | default("uta-" + UTA_RELEASE_DATE) %}
{% set UTA_VOLUME = UTA_VOLUME | default("uta_vol") %}
UTA_VOLUME := {{ UTA_VOLUME }}


Expand Down
Loading