diff --git a/Dockerfile b/Dockerfile index b0d1bedc7..d81d72a5c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,7 +42,7 @@ RUN conda create -y --name curator -c nvidia/label/cuda-${CUDA_VER} -c conda-for libcusolver \ cuda-nvvm && \ source activate curator && \ - pip install --upgrade pytest pip + pip install --upgrade pytest pip nemo-toolkit RUN \ --mount=type=bind,source=/opt/NeMo-Curator/nemo_curator/__init__.py,target=/opt/NeMo-Curator/nemo_curator/__init__.py,from=curator-update \ diff --git a/pyproject.toml b/pyproject.toml index 329c2eec4..abb0d9e3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ dependencies = [ "distributed>=2021.7.1", "fasttext==0.9.3", "ftfy==6.1.1", + "huggingface-hub", "in-place==0.5.0", "jieba==0.42.1", "justext==3.0.1", @@ -65,6 +66,7 @@ dependencies = [ "resiliparse", "sentencepiece", "spacy>=3.6.0, <3.8.0", + "tqdm", # TODO: Remove this pin once newer version is released "transformers==4.46.3", "unidic-lite==1.0.8", @@ -107,17 +109,9 @@ image_nightly = [ "timm>=1.0.8", "nemo_curator[cuda12x_nightly]", ] -# Installs bitext curation modules -bitext = [ - "huggingface-hub", - "tqdm", - "transformers", - "nemo_curator[cuda12x]", -] # Installs all of the above with Stable RAPIDS all = [ "nemo_curator[image]", - "nemo_curator[bitext]", ] # Installs all of the above with RAPIDS Nightlies all_nightly = [