Skip to content

Commit c492e18

Browse files
committed
Add VLLM_T_COMPILE_FULLGRAPH flag
The flag defaults to False so that behavior does not change for end users, but it can be enabled in CI to catch performance regressions earlier, since graph breaks usually reduce performance.
1 parent c5a68d2 commit c492e18

File tree

3 files changed

+123
-33
lines changed

3 files changed

+123
-33
lines changed

.jenkins/test_config_t_compile.yaml

Lines changed: 97 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,98 +4,168 @@ stages:
44
steps:
55
- name: gsm8k_small_g3_tp1
66
flavor: g3
7-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 1
7+
command: >
8+
cd .jenkins/lm-eval-harness &&
9+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
10+
bash run-tests.sh -c configs/models-small.txt -t 1
811
- name: gsm8k_small_g3_tp2
912
flavor: g3.s
10-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 2
13+
command: >
14+
cd .jenkins/lm-eval-harness &&
15+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
16+
bash run-tests.sh -c configs/models-small.txt -t 2
1117
- name: gsm8k_small_g2_tp1
1218
flavor: g2
13-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 1
19+
command: >
20+
cd .jenkins/lm-eval-harness &&
21+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
22+
bash run-tests.sh -c configs/models-small.txt -t 1
1423
- name: gsm8k_small_g2_tp2
1524
flavor: g2.s
16-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 2
25+
command: >
26+
cd .jenkins/lm-eval-harness &&
27+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
28+
bash run-tests.sh -c configs/models-small.txt -t 2
1729
- name: test_gsm8k_large_models
1830
steps:
1931
- name: gsm8k_large_g3_tp2
2032
flavor: g3.s
21-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-large.txt -t 2
33+
command: >
34+
cd .jenkins/lm-eval-harness &&
35+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
36+
bash run-tests.sh -c configs/models-large.txt -t 2
2237
- name: gsm8k_large_g2_tp4
2338
flavor: g2.m
24-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-large.txt -t 4
39+
command: >
40+
cd .jenkins/lm-eval-harness &&
41+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
42+
bash run-tests.sh -c configs/models-large.txt -t 4
2543
- name: test_gsm8k_fp8
2644
steps:
2745
- name: gsm8k_small_g3_tp1_fp8
2846
flavor: g3
29-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-fp8.txt -t 1
47+
command: >
48+
cd .jenkins/lm-eval-harness &&
49+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
50+
bash run-tests.sh -c configs/models-fp8.txt -t 1
3051
- name: gsm8k_small_g3_tp2_fp8
3152
flavor: g3.s
32-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-fp8.txt -t 2
53+
command: >
54+
cd .jenkins/lm-eval-harness &&
55+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
56+
bash run-tests.sh -c configs/models-fp8.txt -t 2
3357
- name: test_gsm8k_mss
3458
steps:
3559
- name: gsm8k_small_g3_tp1_mss
3660
flavor: g3
37-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-mss.txt -t 1
61+
command: >
62+
cd .jenkins/lm-eval-harness &&
63+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
64+
bash run-tests.sh -c configs/models-mss.txt -t 1
3865
- name: gsm8k_small_g2_tp1_mss
3966
flavor: g2
40-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-mss.txt -t 1
67+
command: >
68+
cd .jenkins/lm-eval-harness &&
69+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
70+
bash run-tests.sh -c configs/models-mss.txt -t 1
4171
- name: gsm8k_small_g3_tp2_mss
4272
flavor: g3.s
43-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-mss.txt -t 2
73+
command: >
74+
cd .jenkins/lm-eval-harness &&
75+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
76+
bash run-tests.sh -c configs/models-mss.txt -t 2
4477
- name: gsm8k_small_g2_tp2_mss
4578
flavor: g2.s
46-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-mss.txt -t 2
79+
command: >
80+
cd .jenkins/lm-eval-harness &&
81+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
82+
bash run-tests.sh -c configs/models-mss.txt -t 2
4783
- name: gsm8k_small_g2_tp1_spec_decode
4884
flavor: g2
49-
command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-mss.txt -t 1
85+
command: >
86+
cd .jenkins/lm-eval-harness &&
87+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
88+
bash run-tests.sh -c configs/models-mss.txt -t 1
5089
- name: test_gsm8k_spec_decode
5190
steps:
5291
- name: gsm8k_small_g2_tp1_mlp_spec_decode
5392
flavor: g2
54-
command: PT_HPU_LAZY_MODE=0 TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_mlp_correctness.py::test_mlp_e2e_greedy_correctness
93+
command: >
94+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0 TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True
95+
pytest -v tests/spec_decode/e2e/test_mlp_correctness.py::test_mlp_e2e_greedy_correctness
5596
- name: gsm8k_small_g2_tp1_medusa_spec_decode
5697
flavor: g2
57-
command: PT_HPU_LAZY_MODE=0 TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_medusa_correctness.py::test_medusa_e2e_greedy_correctness
98+
command: >
99+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0 TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True
100+
pytest -v tests/spec_decode/e2e/test_medusa_correctness.py::test_medusa_e2e_greedy_correctness
58101
- name: gsm8k_small_g2_tp1_eagle_spec_decode
59102
flavor: g2
60-
command: PT_HPU_LAZY_MODE=0 VLLM_COS_SIN_RECOMPUTE=true TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_eagle_correctness.py::test_eagle_e2e_greedy_correctness
103+
command: >
104+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0 VLLM_COS_SIN_RECOMPUTE=true TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True
105+
pytest -v tests/spec_decode/e2e/test_eagle_correctness.py::test_eagle_e2e_greedy_correctness
61106
- name: tests_lora
62107
steps:
63108
- name: test_llama_lora
64109
flavor: g2
65-
command: PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true pytest -v tests/lora/test_llama_hpu.py::test_llama_lora_1x
110+
command: >
111+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true
112+
pytest -v tests/lora/test_llama_hpu.py::test_llama_lora_1x
66113
- name: test_multilora
67114
flavor: g2
68-
command: PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true pytest -v tests/lora/test_multilora_hpu.py::test_llama_multilora_1x
115+
command: >
116+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true
117+
pytest -v tests/lora/test_multilora_hpu.py::test_llama_multilora_1x
69118
# - name: test_long_context
70119
# flavor: g2
71-
# command: PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true pytest -v tests/lora/test_long_context_hpu.py::test_quality
120+
# command: VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true pytest -v tests/lora/test_long_context_hpu.py::test_quality
72121
- name: tests_multimodal
73122
steps:
74123
- name: multimodal_small_g3_tp1
75124
flavor: g3
76-
command: cd .jenkins/vision && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 1
125+
command: >
126+
cd .jenkins/vision &&
127+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
128+
bash run-tests.sh -c configs/models-small.txt -t 1
77129
- name: multimodal_small_g3_tp2
78130
flavor: g3.s
79-
command: cd .jenkins/vision && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 2
131+
command: >
132+
cd .jenkins/vision &&
133+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
134+
bash run-tests.sh -c configs/models-small.txt -t 2
80135
- name: multimodal_small_g3_tp1_mss
81136
flavor: g3
82-
command: cd .jenkins/vision && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-mss.txt -t 1
137+
command: >
138+
cd .jenkins/vision && VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
139+
bash run-tests.sh -c configs/models-mss.txt -t 1
83140
- name: multimodal_small_g3_tp2_mss
84141
flavor: g3.s
85-
command: cd .jenkins/vision && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-mss.txt -t 2
142+
command: >
143+
cd .jenkins/vision &&
144+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
145+
bash run-tests.sh -c configs/models-mss.txt -t 2
86146
- name: tests_int4_quantization
87147
steps:
88148
- name: test_awq
89149
flavor: g2
90-
command: PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true pytest -v tests/quantization/test_awq.py::test_awq
150+
command: >
151+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true
152+
pytest -v tests/quantization/test_awq.py::test_awq
91153
- name: test_gptq
92154
flavor: g2
93-
command: PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true pytest -v tests/quantization/test_gptq.py::test_gptq
155+
command: >
156+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0 VLLM_SKIP_WARMUP=true
157+
pytest -v tests/quantization/test_gptq.py::test_gptq
94158
- name: tests_guided_decode
95159
steps:
96160
- name: test_lazy_outlines
97161
flavor: g2
98-
command: export VLLM_SKIP_WARMUP=true && pip install -e tests/vllm_test_utils && PT_HPU_LAZY_MODE=0 pytest -v tests/entrypoints/llm/test_lazy_outlines.py -s -vvv --log-cli-level=INFO
162+
command: >
163+
export VLLM_SKIP_WARMUP=true && pip install -e tests/vllm_test_utils &&
164+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
165+
pytest -v tests/entrypoints/llm/test_lazy_outlines.py -s -vvv --log-cli-level=INFO
99166
- name: test_guided_generate
100167
flavor: g2
101-
command: export VLLM_SKIP_WARMUP=true && pip install -e tests/vllm_test_utils && PT_HPU_LAZY_MODE=0 pytest -v tests/entrypoints/llm/test_guided_generate.py -s -vvv --log-cli-level=INFO
168+
command: >
169+
export VLLM_SKIP_WARMUP=true && pip install -e tests/vllm_test_utils &&
170+
VLLM_T_COMPILE_FULLGRAPH=True PT_HPU_LAZY_MODE=0
171+
pytest -v tests/entrypoints/llm/test_guided_generate.py -s -vvv --log-cli-level=INFO

README_GAUDI.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
334334
- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true` - logs graph compilations for every vLLM engine step, even if no compilation occurs. Disabled by default.
335335
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true` - logs CPU fallbacks for each vLLM engine step, but only if any fallback occurs. Disabled by default.
336336
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true` - logs CPU fallbacks for each vLLM engine step, even if no fallback occurs. Disabled by default.
337+
- `VLLM_T_COMPILE_FULLGRAPH`: if `true` - the PyTorch compile function raises an error if any graph break happens during compilation. This makes it easy to catch existing graph breaks, which usually reduce performance. Disabled by default.
337338

338339
**Performance Tuning Knobs:**
339340

vllm/worker/hpu_model_runner.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -245,35 +245,54 @@ def __init__(self, model, vllm_config, layer_names):
245245
self.set_causal_option(self.model)
246246
if not is_fake_hpu() and not htorch.utils.internal.is_lazy(
247247
) and not enforce_eager:
248+
fullgraph = os.getenv('VLLM_T_COMPILE_FULLGRAPH',
249+
'false').strip().lower() in ("1", "true")
248250
if os.getenv('VLLM_REGIONAL_COMPILATION',
249251
'true').lower() == 'true':
250252
self.regional_compilation_layers_list = [
251253
RMSNorm, VocabParallelEmbedding
252254
]
253-
self._regional_compilation(self.model)
255+
self._regional_compilation(self.model, fullgraph)
254256
else:
255257
self.model = torch.compile(self.model,
256258
backend='hpu_backend',
259+
fullgraph=fullgraph,
257260
dynamic=False)
258261

259262
def _regional_compilation(self,
260263
module,
264+
fullgraph,
261265
parent_module=None,
262266
module_name=None):
263267
if isinstance(module, torch.nn.ModuleList):
264268
for children_name, children_module in module.named_children():
265-
self._compile_region(module, children_name, children_module)
269+
self._compile_region(module, fullgraph, children_name,
270+
children_module)
266271
elif any(
267272
isinstance(module, layer)
268273
for layer in self.regional_compilation_layers_list):
269-
self._compile_region(parent_module, module_name, module)
274+
self._compile_region(
275+
parent_module,
276+
fullgraph,
277+
module_name,
278+
module,
279+
)
270280
else:
271281
for children_name, children_module in module.named_children():
272-
self._regional_compilation(children_module, module,
282+
self._regional_compilation(children_module, fullgraph, module,
273283
children_name)
274284

275-
def _compile_region(self, model, name, module):
276-
module = torch.compile(module, backend='hpu_backend', dynamic=False)
285+
def _compile_region(
286+
self,
287+
model,
288+
fullgraph,
289+
name,
290+
module,
291+
):
292+
module = torch.compile(module,
293+
backend='hpu_backend',
294+
fullgraph=fullgraph,
295+
dynamic=False)
277296
setattr(model, name, module)
278297

279298
def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device,

0 commit comments

Comments
 (0)