galaxyproject · ksuderman · May 19, 2026 · May 20, 2026
diff --git a/data_managers/data_manager_corenlp_models/.shed.yml b/data_managers/data_manager_corenlp_models/.shed.yml
@@ -0,0 +1,23 @@
+name: data_manager_corenlp_models
+owner: iuc
+description: Data manager for downloading Stanford CoreNLP language model JARs
+long_description: |
+  This data manager allows Galaxy administrators to download and install Stanford
+  CoreNLP language model JAR files from Maven Central. It supports 8 languages
+  with multi-select installation for use with the Stanford CoreNLP annotation tool.
+
+  Supported languages: Arabic, Chinese, English, French, German, Hungarian, Italian, Spanish.
+  Also provides the common models JAR required for coreference resolution.
+homepage_url: https://stanfordnlp.github.io/CoreNLP/
+remote_repository_url: https://github.com/galaxyproject/tools-iuc
+type: unrestricted
+categories:
+  - Data Managers
+  - Text Manipulation
+  - Natural Language Processing
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "{{ tool_name }}"
+  include:
+    - data_manager_corenlp_models.xml
+  test: false
diff --git a/data_managers/data_manager_corenlp_models/README.md b/data_managers/data_manager_corenlp_models/README.md
@@ -0,0 +1,63 @@
+# Stanford CoreNLP Language Models Data Manager
+
+Galaxy data manager for downloading and installing Stanford CoreNLP language model JARs.
+
+## Description
+
+This data manager allows Galaxy administrators to easily download and install Stanford CoreNLP language model JAR files from Maven Central for use with the Stanford CoreNLP annotation tool.
+
+## Features
+
+- **Multi-select support**: Install multiple language models at once using checkboxes
+- **Common models option**: Optionally install common models JAR (required for coreference resolution)
+- **Automatic download**: Downloads model JARs from Maven Central
+- **Automatic registration**: Registers models in Galaxy's data table for immediate use
+- **Error handling**: Continues with remaining models if one fails to download
+
+## Supported Languages
+
+Language models are available for:
+- Arabic (ar)
+- Chinese (zh)
+- English (en)
+- French (fr)
+- German (de)
+- Hungarian (hu)
+- Italian (it)
+- Spanish (es)
+
+**Note**: Not all annotators are available for all languages. See the [Stanford CoreNLP documentation](https://stanfordnlp.github.io/CoreNLP/human-languages.html) for language-specific capabilities.
+
+## Installation
+
+1. Install this data manager via the Galaxy Tool Shed or manually
+2. As a Galaxy admin, go to **Admin → Local Data**
+3. Select "Stanford CoreNLP Language Models"
+4. Check the boxes for the languages you want to install
+5. Leave "Install common models" checked (the default) if you need coreference resolution; uncheck it to skip that download
+6. Click "Execute"
+
+The models are large files (typically 100-500 MB each), so downloading multiple models may take several minutes depending on your connection speed.
+
+### Common Models
+
+The common models JAR (452 MB) contains shared dictionaries and resources needed for coreference resolution. If you plan to use the coreference annotator, you must install the common models. This option is checked by default for convenience.
+
+## Requirements
+
+- Python 3.9+
+- Internet connection for downloading models from Maven Central
+
+## Version
+
+This data manager downloads models for CoreNLP version 4.5.10.
+
+## Usage with Stanford CoreNLP Tool
+
+After installing language models via this data manager, they will be available in the Stanford CoreNLP tool's "Language Model" dropdown. The tool uses a Docker container (`ksuderman/corenlp:4.5.10`) that includes the base CoreNLP library, and mounts the language-specific model JARs at runtime.
+
+## More Information
+
+- Stanford CoreNLP website: https://stanfordnlp.github.io/CoreNLP/
+- Maven Central repository: https://repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/
+- Galaxy Tool Shed: https://toolshed.g2.bx.psu.edu
diff --git a/data_managers/data_manager_corenlp_models/data_manager_corenlp_models.py b/data_managers/data_manager_corenlp_models/data_manager_corenlp_models.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+# Copyright 2006 The Galaxy Project. All rights reserved.
+"""
+Data Manager for Stanford CoreNLP Language Models
+
+Downloads CoreNLP language model JARs to a persistent directory and registers
+them in the Galaxy data table. Each entry records the JAR's absolute path so the
+CoreNLP tool can symlink it at runtime.
+"""
+
+import argparse
+import json
+import sys
+import urllib.request
+from pathlib import Path
+
+
+# Release of CoreNLP whose model JARs this data manager installs
+CORENLP_VERSION = "4.5.10"
+
+# Maven Central directory holding every artifact of that release
+_MAVEN_RELEASE_URL = (
+    "https://repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/"
+    + CORENLP_VERSION
+)
+
+
+def _model_spec(display_name, classifier):
+    """Describe one model JAR given its Maven classifier (e.g. "models-french")."""
+    jar_name = f"stanford-corenlp-{CORENLP_VERSION}-{classifier}.jar"
+    return {
+        "name": display_name,
+        "jar_name": jar_name,
+        "url": f"{_MAVEN_RELEASE_URL}/{jar_name}",
+    }
+
+
+# Common models JAR (contains dcoref dictionaries and common models)
+COMMON_MODELS = _model_spec("Common Models", "models")
+
+# Per-language model JARs, keyed by language code; the classifier of each JAR
+# is "models-" followed by the lower-cased language name
+LANGUAGE_MODELS = {
+    code: _model_spec(display_name, f"models-{display_name.lower()}")
+    for code, display_name in (
+        ("ar", "Arabic"),
+        ("zh", "Chinese"),
+        ("en", "English"),
+        ("fr", "French"),
+        ("de", "German"),
+        ("hu", "Hungarian"),
+        ("it", "Italian"),
+        ("es", "Spanish"),
+    )
+}
+
+
+def download_model(url, target_path):
+    """Download *url* to *target_path*, printing coarse progress to stdout.
+
+    The payload is first written to a sibling ``<target_path>.part`` file and
+    only renamed onto ``target_path`` once the transfer has finished. A failed
+    or truncated download therefore never leaves a file at ``target_path``
+    that a later existence check could mistake for a complete model.
+
+    Args:
+        url: Location of the file to fetch.
+        target_path: Destination path of the fully downloaded file.
+
+    Returns:
+        True when the file was downloaded and moved into place, False when
+        anything went wrong (the error is printed to stderr).
+    """
+    print(f"Downloading from {url}")
+    print(f"Saving to {target_path}")
+
+    partial_path = Path(f"{target_path}.part")
+    last_percent = -1
+
+    def report_progress(block_num, block_size, total_size):
+        nonlocal last_percent
+        if total_size <= 0:
+            # Size unknown (no Content-Length): nothing meaningful to report
+            return
+        # The last block is usually short, so clamp to the real size
+        downloaded = min(block_num * block_size, total_size)
+        percent = downloaded * 100 // total_size
+        if percent == last_percent:
+            # The hook fires once per block; only log whole-percent steps so
+            # a multi-hundred-MB JAR does not flood the job log
+            return
+        last_percent = percent
+        mb = downloaded / 1024 / 1024
+        total_mb = total_size / 1024 / 1024
+        print(f"  {percent}% ({mb:.0f}/{total_mb:.0f} MB)", flush=True)
+
+    try:
+        urllib.request.urlretrieve(url, str(partial_path), reporthook=report_progress)
+        # Same-directory rename: the finished file appears in a single step
+        partial_path.replace(target_path)
+    except Exception as e:
+        try:
+            partial_path.unlink(missing_ok=True)
+        except OSError:
+            # Best-effort cleanup; the download error below is what matters
+            pass
+        print(f"Error downloading file: {e}", file=sys.stderr)
+        return False
+
+    print("Download complete!")
+    return True
+
+
+def load_existing_models(data_table_path):
+    """Return the set of first-column values already present in the data table.
+
+    An empty set is returned when no path is given or the file does not exist.
+    Blank lines and lines starting with ``#`` are ignored; every other line is
+    split on tabs and its first field is collected.
+    """
+    if not data_table_path:
+        return set()
+    if not Path(data_table_path).exists():
+        return set()
+
+    with open(data_table_path) as table:
+        rows = (raw.strip() for raw in table)
+        return {
+            row.split('\t')[0]
+            for row in rows
+            if row and not row.startswith('#')
+        }
+
+
+def _install_model(value, model_info, label, target_dir, existing_models):
+    """Make sure one model JAR is on disk and build its data table entry.
+
+    Args:
+        value: Data table key for the model (a language code or "common").
+        model_info: Dict with the model's "name", "jar_name" and "url".
+        label: Human-readable label used in progress and warning messages.
+        target_dir: Directory (``Path``) the JAR lives in.
+        existing_models: Keys already present in the data table.
+
+    Returns:
+        The data table entry dict, or None when the model is already
+        registered or its download failed.
+    """
+    banner = "=" * 60
+
+    if value in existing_models:
+        print(f"\n{banner}")
+        print(f"Skipping {model_info['name']} - already in data table")
+        print(banner)
+        return None
+
+    print(f"\n{banner}")
+    print(f"Processing {label}...")
+    print(banner)
+
+    jar_path = target_dir / model_info["jar_name"]
+
+    if jar_path.exists():
+        print(f"Model already exists at {jar_path}")
+    elif not download_model(model_info["url"], str(jar_path)):
+        # Never register after a failed download, even if a partial file was
+        # left behind at jar_path
+        print(f"WARNING: Failed to download {label}", file=sys.stderr)
+        return None
+
+    models_path = str(jar_path.absolute())
+    print(f"Registered {label}")
+    print(f"  Path: {models_path}")
+    return {
+        "value": value,
+        "name": model_info["name"],
+        "lang_code": value,
+        "models_path": models_path,
+    }
+
+
+def main():
+    """Command-line entry point of the data manager.
+
+    Parses the arguments, downloads every requested model that is not yet
+    listed in the data table, and writes the Galaxy data manager JSON
+    describing the newly registered models to ``--output``. A model whose
+    download fails is reported on stderr and skipped; the remaining models
+    are still processed.
+    """
+    parser = argparse.ArgumentParser(description="Download and register CoreNLP language models")
+    parser.add_argument("--language", action="append", choices=LANGUAGE_MODELS.keys(),
+                        help="Language code(s) for the model(s) to download")
+    parser.add_argument("--common-models", action="store_true",
+                        help="Download the common models JAR (required for coreference)")
+    parser.add_argument("--target-directory", required=True,
+                        help="Directory to store the downloaded model JARs")
+    parser.add_argument("--output", required=True,
+                        help="JSON output file for Galaxy data manager")
+    parser.add_argument("--data-table", required=False,
+                        help="Path to existing data table file to check for duplicates")
+
+    args = parser.parse_args()
+
+    if not args.language and not args.common_models:
+        parser.error("At least one of --language or --common-models must be specified")
+
+    existing_models = load_existing_models(args.data_table)
+
+    target_dir = Path(args.target_directory)
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    # (data table key, model description, message label); common models first
+    requested = []
+    if args.common_models:
+        requested.append(("common", COMMON_MODELS, COMMON_MODELS["name"]))
+    # dict.fromkeys drops repeated --language flags while keeping their order,
+    # so the same language is never registered twice in one run
+    for lang_code in dict.fromkeys(args.language or []):
+        model_info = LANGUAGE_MODELS[lang_code]
+        requested.append((lang_code, model_info, f"{model_info['name']} model"))
+
+    data_table_entries = []
+    for value, model_info, label in requested:
+        entry = _install_model(value, model_info, label, target_dir, existing_models)
+        if entry is not None:
+            data_table_entries.append(entry)
+
+    # Write data manager JSON output
+    data_manager_output = {
+        "data_tables": {
+            "corenlp_models": data_table_entries
+        }
+    }
+
+    with open(args.output, "w", encoding="utf-8") as f:
+        json.dump(data_manager_output, f, indent=2)
+
+    print(f"\n{'=' * 60}")
+    print(f"Summary: {len(data_table_entries)} model(s) registered")
+    print(f"{'=' * 60}")
+
+
+# Run the data manager only when executed as a script, not when imported
+if __name__ == "__main__":
+    main()
diff --git a/data_managers/data_manager_corenlp_models/data_manager_corenlp_models.xml b/data_managers/data_manager_corenlp_models/data_manager_corenlp_models.xml
@@ -0,0 +1,83 @@
+<tool id="data_manager_corenlp_models" name="Stanford CoreNLP Language Models" version="4.5.10.5" tool_type="manage_data" profile="24.1">
+    <description>Download and install CoreNLP language model JARs</description>
+    <requirements>
+        <requirement type="package" version="3.9">python</requirement>
+    </requirements>
+    <!-- Runs the bundled download script: one language flag per selected language,
+         plus the common-models flag when that checkbox is ticked. JARs are written to
+         the corenlp_models directory under the tool data path, and the existing .loc
+         file is passed so already-registered models can be skipped. -->
+    <command detect_errors="exit_code"><![CDATA[
+        python '$__tool_directory__/data_manager_corenlp_models.py'
+        #for $lang in $languages
+            --language '$lang'
+        #end for
+        #if $common_models
+            --common-models
+        #end if
+        --target-directory '${__tool_data_path__}/corenlp_models'
+        --output '$out_file'
+        --data-table '${__tool_data_path__}/corenlp_models.loc'
+    ]]></command>
+    <inputs>
+        <param name="languages" type="select" label="Language Models" multiple="true" display="checkboxes">
+            <option value="ar">Arabic</option>
+            <option value="zh">Chinese</option>
+            <option value="en" selected="true">English</option>
+            <option value="fr">French</option>
+            <option value="de">German</option>
+            <option value="hu">Hungarian</option>
+            <option value="it">Italian</option>
+            <option value="es">Spanish</option>
+        </param>
+        <param name="common_models" type="boolean" label="Install common models (required for coreference)" checked="true" help="Downloads the common models JAR (452 MB) which includes dictionary files and models needed for coreference resolution"/>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <help><![CDATA[
+Stanford CoreNLP Language Models Data Manager
+==============================================
+
+This data manager downloads and installs Stanford CoreNLP language model JAR files
+for use with the Stanford CoreNLP annotation tool.
+
+Available Languages
+-------------------
+
+- **Arabic** (ar)
+- **Chinese** (zh)
+- **English** (en)
+- **French** (fr)
+- **German** (de)
+- **Hungarian** (hu)
+- **Italian** (it)
+- **Spanish** (es)
+
+Usage
+-----
+
+1. Select one or more language models you want to install (checkboxes)
+2. Leave "Install common models" checked (the default) if you need coreference resolution; uncheck it to skip that download
+3. Run the data manager
+4. The model JARs will be downloaded from Maven Central and registered in the data table
+
+The language models are large files (typically 100-500 MB each), so downloading
+multiple models may take several minutes depending on your connection speed.
+
+Common Models
+-------------
+
+The common models JAR (452 MB) contains shared dictionaries and resources needed for
+coreference resolution. It is required for the coreference annotator to work. This
+option is checked by default.
+
+Version
+-------
+
+This data manager downloads models for CoreNLP version 4.5.10.
+
+**Note:** Not all annotators are available for all languages. See the Stanford CoreNLP
+documentation for language-specific capabilities:
+https://stanfordnlp.github.io/CoreNLP/human-languages.html
+    ]]></help>
+    <citations>
+        <citation type="doi">10.3115/v1/P14-5010</citation>
+    </citations>
+</tool>