snakemake-workflows · johanneskoester · Sep 20, 2025 · Sep 20, 2025 · Sep 26, 2025 · Sep 26, 2025
diff --git a/.test/config-chm-eval/config.yaml b/.test/config-chm-eval/config.yaml
@@ -36,6 +36,14 @@ mutational_signatures:
   events:
     - some_id
 
+hla_typing:
+  activate: false
+  loci: 
+    - "A"
+    - "B"
+    - "C"
+    - "DQB1"
+
 calling:
   delly:
     activate: true

diff --git a/.test/config-giab/config.yaml b/.test/config-giab/config.yaml
@@ -57,6 +57,14 @@ mutational_signatures:
   events:
     - some_id
 
+hla_typing:
+  activate: false
+  loci: 
+    - "A"
+    - "B"
+    - "C"
+    - "DQB1"
+
 # printing of variants in a matrix, sorted by recurrence
 report:
   # if stratificatio is deactivated, one oncoprint for all

diff --git a/.test/config-no-candidate-filtering/config.yaml b/.test/config-no-candidate-filtering/config.yaml
@@ -37,6 +37,14 @@ mutational_signatures:
   events:
     - some_id
 
+hla_typing:
+  activate: false
+  loci: 
+    - "A"
+    - "B"
+    - "C"
+    - "DQB1"
+
 calling:
   delly:
     activate: true

diff --git a/.test/config-simple/config.yaml b/.test/config-simple/config.yaml
@@ -36,6 +36,14 @@ mutational_signatures:
   events:
     - some_id
 
+hla_typing:
+  activate: false
+  loci: 
+    - "A"
+    - "B"
+    - "C"
+    - "DQB1"
+
 calling:
   delly:
     activate: true

diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml
@@ -34,6 +34,14 @@ mutational_signatures:
   events:
     - some_id
 
+hla_typing:
+  activate: false
+  loci: 
+    - "A"
+    - "B"
+    - "C"
+    - "DQB1"
+
 calling:
   delly:
     activate: true

diff --git a/.test/config-target-regions/config.yaml b/.test/config-target-regions/config.yaml
@@ -37,6 +37,14 @@ mutational_signatures:
   events:
     - some_id
 
+hla_typing:
+  activate: false
+  loci: 
+    - "A"
+    - "B"
+    - "C"
+    - "DQB1"
+
 calling:
   delly:
     activate: true

diff --git a/.test/config-target-regions/config_multiple_beds.yaml b/.test/config-target-regions/config_multiple_beds.yaml
@@ -39,6 +39,14 @@ mutational_signatures:
   events:
     - some_id
 
+hla_typing:
+  activate: false
+  loci: 
+    - "A"
+    - "B"
+    - "C"
+    - "DQB1"
+
 calling:
   delly:
     activate: true

diff --git a/.test/config_primers/config.yaml b/.test/config_primers/config.yaml
@@ -38,6 +38,14 @@ mutational_signatures:
   events:
     - some_id
 
+hla_typing:
+  activate: false
+  loci: 
+    - "A"
+    - "B"
+    - "C"
+    - "DQB1"
+
 calling:
   delly:
     activate: true

diff --git a/config/config.yaml b/config/config.yaml
@@ -77,6 +77,15 @@ mutational_signatures:
     # select events (callsets, defined under calling/fdr-control/events) to evaluate
     - some_id
 
+
+hla_typing:
+  activate: false
+  loci: 
+    - "A"
+    - "B"
+    - "C"
+    - "DQB1"
+
 # Sets the minimum average coverage for each gene.
 # Genes with lower average coverage will not be concidered in gene coverage datavzrd report
 # If not present min_avg_coverage will be set to 0 rendering all genes.

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -44,6 +44,7 @@ include: "rules/fusion_calling.smk"
 include: "rules/testcase.smk"
 include: "rules/population.smk"
 include: "rules/mutational_signatures.smk"
+include: "rules/hla_typing.smk"
 
 
 batches = "all"

diff --git a/workflow/envs/orthanq.yaml b/workflow/envs/orthanq.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - orthanq =1.15.0
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -44,6 +44,10 @@ delly_excluded_regions = {
     ("homo_sapiens", "GRCh37"): "human.hg19",
 }
 
+# hla typing
+hla_loci = config.get("hla_typing", {}).get("loci", [])
+orthanq_callers = [f"orthanq-{locus}" for locus in hla_loci]
+
 
 def _group_or_sample(row):
     group = row.get("group", None)
@@ -163,6 +167,18 @@ def get_final_output(wildcards):
         )
     )
 
+    if is_activated("hla_typing"):
+        loci = [f"orthanq-{locus}" for locus in config["hla_typing"].get("loci", [])]
+        for group in samples["group"].unique():
+            final_output.extend(
+                expand(
+                    "results/hla-typing/{group}-{locus}/{alias}/",
+                    alias=get_group_aliases(group),
+                    group=group,
+                    locus=loci,
+                )
+            )
+
     for calling_type in calling_types:
         if config["report"]["activate"]:
             for event in get_calling_events(calling_type):
@@ -702,7 +718,20 @@ def get_mutational_signature_targets():
 
 def get_scattered_calls(ext="bcf"):
     def inner(wildcards):
-        caller = "arriba" if wildcards.calling_type == "fusions" else variant_caller
+        match wildcards.calling_type:
+            case "fusions":
+                caller = "arriba"
+            case "variants":
+                caller = variant_caller
+            case "hla-variants":
+                caller = [
+                    f"orthanq-{locus}" for locus in config["hla_typing"].get("loci", [])
+                ]
+            case _:
+                raise ValueError(
+                    f"Unexpected wildcard value for 'calling_type': {wildcards.calling_type}"
+                )
+
         return expand(
             "results/calls/{{group}}.{caller}.{{scatteritem}}.{ext}",
             caller=caller,
@@ -751,12 +780,25 @@ def get_gather_annotated_calls_input(ext="bcf"):
     return inner
 
 
+# def get_scatter_candidates_input(wildcards):
+#     if wildcards.caller == "orthanq":
+#         return "results/candidate-calls/{caller}.{scatteritem}.bcf"
+#     elif config.get("target_regions"):
+#         return "results/candidate-calls/{group}.{caller}.filtered.bcf"
+#     else:
+#         return get_fixed_candidate_calls("bcf")
+
+
 def get_candidate_calls(wc):
     filter = config["calling"]["filter"].get("candidates")
+    # only use orthanq path if caller is one of the orthanq loci
+    if is_activated("hla_typing") and wc.caller.startswith("orthanq-"):
+        return "results/orthanq-candidate-calls/{caller}.hla-variants.{scatteritem}.bcf"
+
     if filter and wc.caller != "arriba":
         return "results/candidate-calls/{group}.{caller}.{scatteritem}.filtered.bcf"
-    else:
-        return "results/candidate-calls/{group}.{caller}.{scatteritem}.bcf"
+
+    return "results/candidate-calls/{group}.{caller}.{scatteritem}.bcf"
 
 
 def _get_report_batch(calling_type, batch):
@@ -950,7 +992,9 @@ def get_candidate_filter_aux_files():
         return []
     else:
         # aux files are considered as part of the config, hence local
-        return [local(path) for name, path in get_filter_aux_entries("candidates").items()]
+        return [
+            local(path) for name, path in get_filter_aux_entries("candidates").items()
+        ]
 
 
 def get_candidate_filter_aux(wildcards, input):
@@ -959,7 +1003,9 @@ def get_candidate_filter_aux(wildcards, input):
     else:
         return [
             f"--aux {name}={path}"
-            for name, path in zip(get_filter_aux_entries("candidates").keys(), input.aux)
+            for name, path in zip(
+                get_filter_aux_entries("candidates").keys(), input.aux
+            )
         ]
 
 
@@ -979,7 +1025,7 @@ def get_varlociraptor_params(wildcards, params):
 wildcard_constraints:
     group="|".join(groups),
     sample="|".join(samples["sample_name"]),
-    caller="|".join(["freebayes", "delly", "arriba"]),
+    caller="|".join(["freebayes", "delly", "arriba"] + orthanq_callers),
     filter="|".join(config["calling"]["filter"]),
     event="|".join(config["calling"]["fdr-control"]["events"].keys()),
     regions_type="|".join(["expanded", "covered"]),
@@ -1481,7 +1527,7 @@ def get_primer_extra(wc, input):
     min_primer_len = get_shortest_primer_length(input.reads)
     # Check if shortest primer is below default values
     if min_primer_len < 32:
-        extra += f" -T {min_primer_len - 2}"
+        extra += f" -T {min_primer_len-2}"
     if min_primer_len < 19:
         extra += f" -k {min_primer_len}"
     return extra

diff --git a/workflow/rules/hla_typing.smk b/workflow/rules/hla_typing.smk
@@ -0,0 +1,109 @@
+# dowloads latest IMGT/HLA database version
+rule get_hla_genes_and_xml:
+    output:
+        genes="results/preparation/hla_gen.fasta",
+        xml="results/preparation/hla.xml.zip",
+    log:
+        "logs/get_hla_genes_and_xml.log",
+    params:
+        genes_link="ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/hla_gen.fasta",
+        xml_link="ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/xml/hla.xml.zip",
+    shell:
+        "wget -c {params.genes_link} -O {output.genes} && "
+        "wget -c {params.xml_link} -O {output.xml} 2> {log}"
+
+
+rule unzip_xml:
+    input:
+        "results/preparation/hla.xml.zip",
+    output:
+        xml="results/preparation/hla.xml",
+    log:
+        "logs/unzip_xml.log",
+    params:
+        path_to_unzip=lambda wc, output: os.path.dirname(output.xml),
+    shell:
+        "unzip -o {input} -d {params.path_to_unzip}"
+
+
+rule orthanq_candidate_variants:
+    input:
+        genome=genome,
+        xml="results/preparation/hla.xml",
+        alleles="results/preparation/hla_gen.fasta",
+    output:
+        temp(directory("results/orthanq-candidate-calls-temp/"))
+    log:
+        "logs/orthanq_candidates/candidates.log",
+    threads: 8
+    params:
+        command="candidates",
+        subcommand="hla",
+        output_bcf=True 
+    wrapper:
+        "v7.6.2/bio/orthanq"
+
+
+rule rename_candidate_bcf:
+    input:
+        "results/orthanq-candidate-calls-temp/"
+    output:
+        "results/orthanq-candidate-calls/orthanq-{locus}.hla-variants.bcf"
+    log:
+        "logs/scatter-candidates/rename_candidates_{locus}.log",
+    shell:
+        "cp {input}/{wildcards.locus}.bcf {output}"
+
+
+# The scatter_candidates rule does not work without group wildcards.
+rule scatter_candidates_orthanq:
+    input:
+        "results/orthanq-candidate-calls/{orthanq_locus}.hla-variants.bcf",
+    output:
+        scatter.calling(
+            "results/orthanq-candidate-calls/{{orthanq_locus}}.hla-variants.{scatteritem}.bcf"
+        ),
+    log:
+        "logs/scatter-candidates/{orthanq_locus}.log",
+    conda:
+        "../envs/rbt.yaml"
+    shell:
+        "rbt vcf-split {input} {output}"
+
+
+rule gather_annotated_calls_orthanq:
+    input:
+        calls=gather.calling("results/calls/{{group}}.{{caller}}.{scatteritem}.bcf"),
+        idx=gather.calling("results/calls/{{group}}.{{caller}}.{scatteritem}.bcf.csi"),
+    output:
+        "results/calls/{group}.{caller}.bcf",
+    log:
+        "logs/gather-hla-variants/{group}.{caller}.log",
+    params:
+        extra="-a",
+    group:
+        "annotation"
+    wrapper:
+        "v2.3.2/bio/bcftools/concat"
+
+
+rule orthanq_call_hla:
+    input:
+        haplotype_calls="results/calls/{group}.{caller}.bcf",
+        haplotype_variants="results/orthanq-candidate-calls/{caller}.hla-variants.bcf",
+        xml="results/preparation/hla.xml",
+    output:
+        directory("results/hla-typing/{group}-{caller}/{alias}")
+    log:
+        "logs/orthanq/{group}-{alias}-{caller}.log",
+    params:
+        command="call",
+        subcommand="hla",
+        prior="diploid",
+        extra=""
+    wrapper:
+        "v7.6.2/bio/orthanq"
+
+# TODO add other outputs (plots), fill missing inputs and commands
+# TODO decide how to handle deactivation of biases in varlociraptor:
+# my current favorite is a special INFO tag in the orthanq candidates.