
Commit 4b4a265
Merge branch 'NVIDIA:main' into nemo2_chat_sft_agent
chenrui17 authored Jan 12, 2025
2 parents 905768c + 7f3ac6b commit 4b4a265
Showing 83 changed files with 3,366 additions and 173 deletions.
136 changes: 131 additions & 5 deletions .github/workflows/cicd-main.yml
@@ -3622,7 +3622,7 @@ jobs:
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
- TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt
+ TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft_hf.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3
AFTER_SCRIPT: |
rm -rf nemo_experiments
@@ -3633,7 +3633,17 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt --strategy fsdp --devices 2
+ TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft_hf.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --strategy fsdp --devices 2
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_VLM_HF_Transformer_PEFT_4bit:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_4bit') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft_hf.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --use-4bit
AFTER_SCRIPT: |
rm -rf nemo_experiments
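
Every job above is gated by the same expression: run if the job's name appears in the JSON list emitted by cicd-test-container-setup, or if its 'all' output is 'true'. A rough Python model of that check (illustrative only; GitHub Actions evaluates contains()/fromJSON() natively):

import json

def should_run(job_name, test_to_run_json, run_all):
    # fromJSON(...) parses the setup job's output into a list of selected job names
    selected = json.loads(test_to_run_json)
    return job_name in selected or run_all == "true"

assert should_run("L2_VLM_HF_Transformer_PEFT_4bit", '["L2_VLM_HF_Transformer_PEFT_4bit"]', "false")
assert should_run("L2_HF_Transformer_PT", '[]', "true")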
@@ -3644,7 +3654,7 @@ jobs:
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
- TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --disable-ckpt
+ TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft_hf.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
@@ -3666,7 +3676,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
- TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp --disable-ckpt
+ TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft_hf.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp --disable-ckpt
AFTER_SCRIPT: |
rm -rf nemo_experiments
@@ -3691,6 +3701,28 @@ jobs:
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_SFT_FSDP2_2gpu:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_SFT_FSDP2_2gpu') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft_fsdp2.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT_2gpu:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_SFT_2gpu_nemorun:
needs: [ cicd-test-container-setup ]
@@ -3702,6 +3734,50 @@ jobs:
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft_nemorun_fsdp2.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT_2gpu_nemorun:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu_nemorun') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT_nemorun:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_nemorun') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_SFT:
needs: [ cicd-test-container-setup ]
@@ -3724,7 +3800,7 @@ jobs:
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/sft_nemorun.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_SFT_TE_Acceleration:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
@@ -3736,6 +3812,17 @@
AFTER_SCRIPT: |
rm -rf nemo_experiments
L2_HF_Transformer_PT_TE_Acceleration:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_PT_TE_Acceleration') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/pretrain.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --model-accelerator te --max-steps 10
AFTER_SCRIPT: |
rm -rf nemo_experiments
# L2: Megatron Mock Data Generation
L2_Megatron_Mock_Data_Generation_MockGPTDataset:
needs: [cicd-test-container-setup]
@@ -4784,6 +4871,35 @@ jobs:
rm -rf /tmp/nemo2_ckpt
rm -rf /tmp/nemo2_ptq_engine
L2_NeMo_2_Export_In_Framework:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Export_In_Framework') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/llm/test_hf_import.py \
--hf_model /home/TestData/nlp/megatron_llama/llama-ci-hf \
--output_path /tmp/nemo2_ckpt
python tests/setup/data/create_sample_lambada.py \
--output_file /tmp/lambada.json
python tests/export/nemo_export.py \
--model_name test \
--model_type llama \
--checkpoint_dir /tmp/nemo2_ckpt \
--min_tps 1 \
--in_framework True \
--test_deployment True \
--run_accuracy True \
--test_data_path /tmp/lambada.json \
--accuracy_threshold 0.0 \
--debug
AFTER_SCRIPT: |
rm -rf /tmp/nemo2_ckpt /tmp/lambada.json
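
For orientation: the --run_accuracy pass measures LAMBADA-style last-word prediction on the generated /tmp/lambada.json (and with --accuracy_threshold 0.0 the job appears to exercise the path rather than enforce a score). A minimal sketch of such a check; the record field names below are assumptions, not the actual schema of create_sample_lambada.py:

import json

def lambada_accuracy(generate, data_path):
    # generate: callable mapping a context string to the model's continuation
    with open(data_path) as f:
        records = json.load(f)
    correct = sum(
        generate(r["text_before_last_word"]).strip().startswith(r["last_word"])
        for r in records
    )
    return correct / max(len(records), 1)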
L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4930,8 +5046,15 @@ jobs:
- L2_HF_Transformer_SFT_2gpu
- L2_VLM_HF_Transformer_PEFT
- L2_VLM_HF_Transformer_PEFT_FSDP
- L2_VLM_HF_Transformer_PEFT_4bit
- L2_HF_Transformer_SFT_2gpu_nemorun
- L2_HF_Transformer_SFT_TE_Acceleration
- L2_HF_Transformer_PT
- L2_HF_Transformer_PT_nemorun
- L2_HF_Transformer_PT_2gpu
- L2_HF_Transformer_PT_2gpu_nemorun
- L2_HF_Transformer_PT_TE_Acceleration
- L2_VLM_HF_Transformer_PEFT
- L2_NeMo_2_SSM_Pretraining
- L2_NeMo_2_SSM_Finetuning
- L2_NeMo_2_T5_Pretraining
Expand Down Expand Up @@ -4974,8 +5097,11 @@ jobs:
- L2_Megatron_GPT_Reranker
- L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact
- L2_NeMo_2_PTQ_Llama2_FP8
- L2_NeMo_2_Export_In_Framework
- L2_NeMo_2_jit_callback
- L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
- L2_HF_Transformer_SFT_FSDP2_2gpu
- L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2
if: always()
runs-on: ubuntu-latest
steps:
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -17,7 +17,7 @@
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
-ARG IMAGE_LABEL
FROM ${BASE_IMAGE}

+ARG IMAGE_LABEL
LABEL "nemo.library"=${IMAGE_LABEL}

ENV TRANSFORMERS_OFFLINE=0
14 changes: 12 additions & 2 deletions examples/llm/peft/hf.py
100644 → 100755
@@ -14,8 +14,10 @@

import fiddle as fdl
from lightning.pytorch.loggers import WandbLogger

from nemo import lightning as nl
from nemo.collections import llm
from nemo.lightning import NeMoLogger
from nemo.lightning.pytorch.callbacks import JitConfig, JitTransform


@@ -69,6 +71,7 @@ def main():
parser.add_argument('--max-steps', type=int, default=100)
parser.add_argument('--wandb-project', type=str, default=None)
parser.add_argument('--use-torch-jit', action='store_true')
parser.add_argument('--ckpt-folder', type=str, default=None)
args = parser.parse_args()

wandb = None
@@ -84,6 +87,13 @@ def main():
# https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81
grad_clip = None
use_dist_samp = False

import tempfile

if args.ckpt_folder is None:
args.ckpt_folder = tempfile.TemporaryDirectory().name
print("Temp directory created for base model: ", args.ckpt_folder)

tokenizer = llm.HFAutoModelForCausalLM.configure_tokenizer(args.model)

callbacks = []
@@ -110,10 +120,10 @@
precision="bf16",
),
optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(lr=1e-5)),
-        log=None,
+        log=NeMoLogger(log_dir=args.ckpt_folder, use_datetime_version=False),
peft=llm.peft.LoRA(
target_modules=['*_proj'],
-            dim=32,
+            dim=8,
),
)

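A side note on the --ckpt-folder default added above: tempfile.TemporaryDirectory().name yields a path whose directory is removed as soon as the wrapper object is garbage-collected, so only the unique name survives and NeMoLogger must recreate it. A sketch of an alternative that keeps the directory alive for the whole run (illustrative, not the committed behavior):

import tempfile

def default_ckpt_folder(ckpt_folder):
    if ckpt_folder is None:
        # mkdtemp creates the directory and leaves it in place until explicitly removed
        ckpt_folder = tempfile.mkdtemp(prefix="nemo_peft_")
        print("Temp directory created for base model:", ckpt_folder)
    return ckpt_folder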
42 changes: 42 additions & 0 deletions examples/llm/peft/hf_vllm.py
@@ -0,0 +1,42 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

try:
from nemo.export.vllm_hf_exporter import vLLMHFExporter
except Exception:
raise Exception(
"vLLM should be installed in the environment or import "
"the vLLM environment in the NeMo FW container using "
"source /opt/venv/bin/activate command"
)


if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True, type=str, help="Local path of the base model")
parser.add_argument('--lora-model', required=True, type=str, help="Local path of the lora model")
# parser.add_argument('--triton-model-name', required=True, type=str, help="Name for the service")
args = parser.parse_args()

lora_model_name = "lora_model"

exporter = vLLMHFExporter()
exporter.export(model=args.model, enable_lora=True)
exporter.add_lora_models(lora_model_name=lora_model_name, lora_model=args.lora_model)

print(
"------------- Output: ", exporter.forward(input_texts=["How are you doing?"], lora_model_name=lora_model_name)
)
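
A short sketch of driving the exporter programmatically rather than via this script, reusing only the vLLMHFExporter calls shown above (both paths are placeholders):

from nemo.export.vllm_hf_exporter import vLLMHFExporter  # requires the vLLM environment

exporter = vLLMHFExporter()
exporter.export(model="/path/to/base-model", enable_lora=True)
exporter.add_lora_models(lora_model_name="lora_model", lora_model="/path/to/lora-adapter")

for prompt in ["How are you doing?", "Summarize LoRA in one sentence."]:
    print(exporter.forward(input_texts=[prompt], lora_model_name="lora_model"))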
4 changes: 3 additions & 1 deletion examples/vlm/hf/peft.py
@@ -85,6 +85,7 @@ def fmt(sample):
parser.add_argument('--accelerator', default='gpu', choices=['gpu'])
parser.add_argument('--max-steps', type=int, default=100)
parser.add_argument('--wandb-project', type=str, default=None)
parser.add_argument('--use-4bit', help="Load model in 4bit", action="store_true")
args = parser.parse_args()

wandb = None
@@ -103,7 +104,7 @@ def fmt(sample):
processor = vlm.HFAutoModelForImageTextToText.configure_processor(args.model)

llm.api.finetune(
-        model=vlm.HFAutoModelForImageTextToText(args.model),
+        model=vlm.HFAutoModelForImageTextToText(args.model, load_in_4bit=args.use_4bit),
data=mk_hf_vlm_dataset(processor, args.mbs, args.gbs),
trainer=nl.Trainer(
devices=args.devices,
@@ -124,5 +125,6 @@
peft=llm.peft.LoRA(
target_modules=['*_proj'],
dim=16,
lora_dtype=torch.bfloat16 if args.use_4bit else None,
),
)
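
In plain Hugging Face terms, load_in_4bit plus a bf16 LoRA corresponds to the usual QLoRA recipe. A hedged sketch of that pattern with the generic transformers/peft API (not NeMo's internals; the model id and target modules are placeholders):

import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # mirrors lora_dtype=torch.bfloat16 above
)
model = AutoModelForCausalLM.from_pretrained("some-org/some-base-model", quantization_config=bnb)
model = get_peft_model(model, LoraConfig(r=16, target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]))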
7 changes: 1 addition & 6 deletions nemo/collections/common/parts/utils.py
@@ -112,14 +112,9 @@ def extend_instance(obj, mixin):
) # mixin needs to go first for our forward() logic to work


-def apply_rope_scaling(freqs):
+def apply_rope_scaling(freqs, scale_factor=8, low_freq_factor=1, high_freq_factor=4, old_context_len=8192):
# Apply scaling for RoPE frequencies
logger.info("apply rope scaling ...")
-    # Values obtained from grid search
-    scale_factor = 8
-    low_freq_factor = 1
-    high_freq_factor = 4
-    old_context_len = 8192  # original llama3 length

low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
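The hunk is truncated at this point. For context, the remainder of the standard Llama 3 RoPE-scaling rule that these wavelength bounds feed into looks roughly like the sketch below (reconstructed from the published recipe, not copied from this commit):

import math
import torch

def scale_freqs(freqs, scale_factor=8, low_freq_factor=1, high_freq_factor=4, old_context_len=8192):
    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    new_freqs = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:  # high-frequency components pass through unchanged
            new_freqs.append(freq)
        elif wavelen > low_freq_wavelen:  # low-frequency components are scaled down
            new_freqs.append(freq / scale_factor)
        else:  # smooth linear interpolation between the two regimes
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
    return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)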