
Commit 690afc7

update generate script to be able to use axonn
1 parent 9204936 commit 690afc7

File tree

3 files changed: +89 −18 lines

@@ -1,38 +1,60 @@
 #!/bin/bash
-#SBATCH -n 1
-#SBATCH -c 16
-#SBATCH --ntasks-per-node=1
+#SBATCH -N 1
+#SBATCH -c 32
+#SBATCH --ntasks-per-node=4
 #SBATCH --gpus-per-task=1
-#SBATCH --mem=164000
-#SBATCH -t 23:59:59
+###SBATCH --gpu-bind=none
+###SBATCH --mem=164000
+#SBATCH -t 02:00:00
 #SBATCH -A m2404
-#SBATCH -C gpu&hbm80g
+#SBATCH -C gpu
 #SBATCH -q regular
 #SBATCH -J generate-codellama-70b-prompted
-#SBATCH -o generate-codellama-70b-prompted-%A.out
+#SBATCH -o generation-job-logs/generate-codellama-70b-prompted-%A.out
 
 # settings
 MODEL="codellama/CodeLlama-70b-hf"
 TEMP=0.2
 TOPP=0.95
 MAX_NEW_TKNS=1024
 SAMPLES_PER_PROMPT=20
-BATCH_SIZE=2
+BATCH_SIZE=1
 hash=$(md5sum ../prompts/generation-prompts.json | cut -d' ' -f1)
 OUTPUT="../outputs/output_${hash:0:8}_${MODEL//\//--}_prompted_temp${TEMP}.json"
 CACHE="../outputs/cache/cache_${hash:0:8}_${MODEL//\//--}_prompted_temp${TEMP}.jsonl"
 echo "Writing to $OUTPUT"
 echo "model=$MODEL MAX_NEW_TKNS=$MAX_NEW_TKNS SAMPLES_PER_PROMPT=$SAMPLES_PER_PROMPT BATCH_SIZE=$BATCH_SIZE"
 
 # setup
-#ml cuda/11.8.0
+module load PrgEnv-gnu/8.4.0.lua pytorch/2.0.1.lua
 source .env/bin/activate
+#conda activate llms-for-hpc
+export MPICH_GPU_SUPPORT_ENABLED=1
 export HF_HOME=/pscratch/sd/d/dnicho/.cache/huggingface
-export OMP_NUM_THREADS=16
+export OMP_NUM_THREADS=4
 export SLURM_CPU_BIND="cores"
+export PYTHONPATH="${PYTHONPATH}:/global/homes/d/dnicho/axonn/models/transformers"
+
+
+# axonn exports
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_NET_GDR_LEVEL=PHB
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
 
 # generate
-srun python generate.py \
+srun ./select_gpu_device python generate.py \
     --model $MODEL \
     --prompts ../prompts/generation-prompts.json \
     --cache $CACHE \
@@ -43,4 +65,5 @@ srun python generate.py \
     --max_new_tokens $MAX_NEW_TKNS \
     --num_samples_per_prompt $SAMPLES_PER_PROMPT \
     --batch_size $BATCH_SIZE \
+    --axonn \
    --prompted
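
The job script keys its output and cache paths on an MD5 hash of the prompts file. For reference (not part of this commit), a minimal Python sketch that mirrors that naming scheme, e.g. to locate a run's output afterwards; the path, model name, and temperature are copied from the settings above:

# sketch only: reproduce OUTPUT="../outputs/output_${hash:0:8}_${MODEL//\//--}_prompted_temp${TEMP}.json"
import hashlib

model = "codellama/CodeLlama-70b-hf"
temp = "0.2"

# same digest as `md5sum ../prompts/generation-prompts.json`
with open("../prompts/generation-prompts.json", "rb") as f:
    digest = hashlib.md5(f.read()).hexdigest()

# first 8 hex chars of the hash, '/' in the model name replaced by '--'
output = f"../outputs/output_{digest[:8]}_{model.replace('/', '--')}_prompted_temp{temp}.json"
print(output)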

generate/generate.py (+49 −7)

@@ -8,7 +8,7 @@
 
 # tpl imports
 import torch
-from transformers import pipeline
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 
 # local imports
 from utils import BalancedBracketsCriteria, PromptDataset, clean_output, get_inference_config
@@ -33,6 +33,10 @@
 parser.add_argument('--do_sample', action='store_true', help='Enable sampling (default: False)')
 parser.add_argument('--batch_size', type=int, default=16, help='Batch size for generation (default: 8)')
 parser.add_argument('--prompted', action='store_true', help='Use prompted generation. See StarCoder paper (default: False)')
+device_group = parser.add_mutually_exclusive_group()
+device_group.add_argument('--device_map', help='Path to the device map JSON file or the string "auto"')
+device_group.add_argument('--device', type=int, help='Device to use for inference')
+device_group.add_argument('--axonn', action='store_true', help='Use AxoNN for inference')
 args = parser.parse_args()
 
 """ Load prompts """
@@ -96,12 +100,48 @@
 # and repeat them for however many samples we want to generate per prompt
 prompts_repeated = [p for p in prompts for _ in range(args.num_samples_per_prompt)]
 
+""" Set device kwarg for inference """
+device_kwarg = {}
+USE_AXONN = False
+if args.device_map:
+    if args.device_map == "auto":
+        device_kwarg["device_map"] = "auto"
+    else:
+        with open(args.device_map, 'r') as json_file:
+            device_map = json.load(json_file)
+        device_kwarg["device_map"] = device_map
+elif args.device:
+    device_kwarg["device"] = args.device
+elif args.axonn:
+    from mpi4py import MPI
+    from axonn import axonn as ax
+    from modify_llama import monkey_patch_llama_with_axonn
+    world_size = MPI.COMM_WORLD.Get_size()
+    rank = MPI.COMM_WORLD.Get_rank()
+    if rank == 0:
+        print(f"Using AxoNN with {world_size} GPUs.")
+    ax.init(G_data=1, G_inter=1, G_intra_r=1, G_intra_c=1, G_intra_d=world_size)
+    if "llama" in args.model:
+        monkey_patch_llama_with_axonn()
+    USE_AXONN = True
+    device_kwarg["device"] = "cuda"
+else:
+    device_kwarg["device"] = 0
+
+""" Load model and tokenizer """
+model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=inference_config.get_dtype())
+if USE_AXONN:
+    model = model.to("cuda")
+tokenizer = AutoTokenizer.from_pretrained(args.model)
+
 """ Initialize HuggingFace pipeline for generation """
-generator = pipeline(model=args.model, torch_dtype=inference_config.get_dtype(), device=0)
+generator = pipeline(task='text-generation', model=model, tokenizer=tokenizer, **device_kwarg)
 inference_config.init_padding(generator.tokenizer)
 
 """ Create a prompt data set to pass to generate method """
 prompt_dataset = PromptDataset([inference_config.format_prompt(p["prompt"]) for p in prompts_repeated])
+if USE_AXONN:
+    prompt_dataset = prompt_dataset#.to("cuda")
 generated_outputs = generator(
     prompt_dataset,
     max_new_tokens=args.max_new_tokens,
@@ -114,7 +154,7 @@
 )
 
 """ Iterate over prompts and generate code """
-if not args.restart and args.cache is not None:
+if not args.restart and args.cache is not None and os.path.exists(args.cache):
     with open(args.cache, 'r') as jsonl_file:
         responses = [json.loads(line) for line in jsonl_file]
         responses = [r for r in responses if r["temperature"] == args.temperature and r["prompted"] == args.prompted
@@ -140,8 +180,9 @@
         responses.append(cur_prompt)
 
         if not args.restart and args.cache is not None:
-            with open(args.cache, 'a') as jsonl_file:
-                jsonl_file.write(json.dumps(cur_prompt) + "\n")
+            if not USE_AXONN or rank == 0:
+                with open(args.cache, 'a') as jsonl_file:
+                    jsonl_file.write(json.dumps(cur_prompt) + "\n")
 
     if idx != 0 and idx % args.num_samples_per_prompt == 0:
         print(f"Tokens per second: {total_tokens / (time.time() - start_time):.2f}")
@@ -151,5 +192,6 @@
 print(f"Generated {len(responses)} code samples in {end_time - start_time:.2f} seconds ({tokens_per_second:.2f} tokens per second)")
 
 """ Save responses to JSON file """
-with open(args.output, 'w') as output_file:
-    json.dump(responses, output_file, indent=4)
+if not USE_AXONN or rank == 0:
+    with open(args.output, 'w') as output_file:
+        json.dump(responses, output_file, indent=4)
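
The new --device_map option accepts either the string "auto" or a path to a JSON file mapping model submodules to devices, which generate.py loads and forwards to the pipeline. As a rough illustration (not part of this commit), a sketch of how such a file could be produced; the module names below are hypothetical placeholders and would have to match the real submodule names of the loaded model:

# sketch only: hand-written device map for --device_map; names are illustrative
import json

example_device_map = {
    "model.embed_tokens": 0,  # hypothetical split: embeddings and early layers on GPU 0
    "model.layers.0": 0,
    "model.layers.1": 1,      # later layers and the head on GPU 1
    "model.norm": 1,
    "lm_head": 1,
}

with open("device_map.json", "w") as f:
    json.dump(example_device_map, f, indent=2)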

generate/select_gpu_device (new file, +6)

@@ -0,0 +1,6 @@
+#!/bin/bash
+# select_gpu_device wrapper script
+export RANK=${SLURM_PROCID}
+export WORLD_SIZE=${SLURM_NTASKS}
+export LOCAL_RANK=${SLURM_LOCALID}
+exec $*
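
The wrapper simply re-exports SLURM's per-task variables under the RANK / WORLD_SIZE / LOCAL_RANK names that torchrun-style launchers set, then exec's the wrapped command (here, generate.py via srun). A minimal sketch (not part of the commit) of how a launched Python process might consume them:

# sketch only: read the variables exported by select_gpu_device
import os

rank = int(os.environ.get("RANK", "0"))              # from SLURM_PROCID
world_size = int(os.environ.get("WORLD_SIZE", "1"))  # from SLURM_NTASKS
local_rank = int(os.environ.get("LOCAL_RANK", "0"))  # from SLURM_LOCALID

# typical next step would be to pin this process to its local GPU,
# e.g. torch.cuda.set_device(local_rank), before any distributed init
print(f"rank {rank}/{world_size}, local rank {local_rank}")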
