tkestack · tlipoca9 · Jun 6, 2025 · Jun 6, 2025 · Jun 7, 2025 · Jun 12, 2025
diff --git a/helm-charts/deepseek-r1-lws/.helmignore b/helm-charts/deepseek-r1-lws/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/helm-charts/deepseek-r1-lws/Chart.yaml b/helm-charts/deepseek-r1-lws/Chart.yaml
@@ -0,0 +1,34 @@
+apiVersion: v2
+name: lws
+description: A Helm chart for Kubernetes
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "1.16.0"
+
+dependencies:
+- name: etcd
+  condition: etcd.enable
+  version: "~11.2.1"
+  repository: https://charts.bitnami.com/bitnami
+- name: nats
+  condition: nats.enable
+  version: "~1.3.1"
+  repository: https://nats-io.github.io/k8s/helm/charts/
diff --git a/helm-charts/deepseek-r1-lws/charts/etcd-11.2.1.tgz b/helm-charts/deepseek-r1-lws/charts/etcd-11.2.1.tgz
diff --git a/helm-charts/deepseek-r1-lws/charts/nats-1.3.1.tgz b/helm-charts/deepseek-r1-lws/charts/nats-1.3.1.tgz
diff --git a/helm-charts/deepseek-r1-lws/perf_deepseek-r1.md b/helm-charts/deepseek-r1-lws/perf_deepseek-r1.md
@@ -0,0 +1,148 @@
+# Dynamo 1P1D Benchmark
+
+4 * 8 * H20 96GB
+
+## Config
+
+```yaml
+Common:
+  model: /data/models/deepseek-ai/DeepSeek-R1
+  block-size: 128
+  max-model-len: 4096
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+  tensor-parallel-size: 16
+  max-num-batched-tokens: 4096
+
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: false
+  compilation-config: '{"cudagraph_capture_sizes": [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256]}'
+  enable-reasoning: true
+  reasoning-parser: deepseek_r1
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '16'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config, tensor-parallel-size]
+
+PrefillWorker:
+  enforce-eager: true
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '16'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config, tensor-parallel-size, max-num-batched-tokens, enforce-eager]
+```
+
+
+
+## Benchmark Result (ISL:OSL=3000:150)
+
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms) | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.13 | 18.93 | 7922.63 | 7996.39 | 1360.33 | 1393.85 | 44.04 | 44.32 |
+| 2 | 0.24 | 35.64 | 8409.71 | 8517.96 | 2205.08 | 2270.39 | 41.64 | 42.3 |
+| 4 | 0.39 | 58.32 | 10284.39 | 10434.39 | 3943.15 | 4050.12 | 42.56 | 43.28 |
+| 8 | 0.51 | 76.43 | 15161.7 | 28652.96 | 8085.13 | 21478.65 | 47.49 | 48.75 |
+| 16 | 0.63 | 94.86 | 24621.11 | 29305.45 | 16449.11 | 22884.79 | 54.85 | 56 |
+| 32 | 0.78 | 117.22 | 40234.58 | 71022.54 | 30692.74 | 61249.9 | 64.04 | 67.49 |
+| 64 | 0.91 | 136.39 | 68418.96 | 128655.75 | 56918.99 | 117056.13 | 77.31 | 81.88 |
+| 128 | 1 | 148.59 | 127672.86 | 136909.7 | 111751.54 | 122149.31 | 107.46 | 112.6 |
+
+## Benchmark Result (ISL:OSL=1000:1000) 
+
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms) | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.02 | 23.42 | 42706.66 | 43007.17 | 654.31 | 824.1 | 42.09 | 42.38 |
+| 2 | 0.05 | 47.61 | 40991.73 | 42171.98 | 1128.78 | 1353.75 | 40.39 | 40.93 |
+| 4 | 0.09 | 85.51 | 44475.7 | 46562.75 | 1957.36 | 3158.4 | 43.48 | 44.43 |
+| 8 | 0.15 | 144.25 | 52616.46 | 67798.97 | 4181.03 | 19999.62 | 48.88 | 51.45 |
+| 16 | 0.24 | 238.13 | 63480.23 | 69032.23 | 9562.51 | 13258.1 | 55.27 | 58.46 |
+| 32 | 0.43 | 425.53 | 71995.77 | 88377.43 | 16720.02 | 31021.77 | 55.86 | 57.46 |
+| 64 | 0.61 | 594.35 | 101217.94 | 120527.49 | 37248.22 | 53620.66 | 66.1 | 69.15 |
+| 128 | 0.93 | 869.79 | 132903.79 | 169650.72 | 53761.13 | 94068.37 | 85.24 | 93.62 |
+
+## Benchmark Result (ISL:OSL=500:1000) 
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms) | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.02 | 23.67 | 42250.43 | 42343.32 | 494.09 | 660.57 | 41.8 | 41.87 |
+| 2 | 0.06 | 45.76 | 30870.14 | 42075.51 | 847.16 | 971.93 | 40.07 | 41.38 |
+| 4 | 0.11 | 91.17 | 34452.23 | 43497.29 | 937.09 | 2076.48 | 41.56 | 42.68 |
+| 8 | 0.16 | 151.54 | 48637.4 | 51613.52 | 2380.23 | 3625.33 | 48.09 | 48.68 |
+| 16 | 0.27 | 254.98 | 57453.77 | 63437.44 | 4869.1 | 6965.22 | 54.92 | 57.45 |
+| 32 | 0.49 | 422.52 | 62772.4 | 76217.87 | 11445.05 | 16105.9 | 59.04 | 61.24 |
+| 64 | 0.75 | 637.14 | 80681.09 | 103841.12 | 23147.03 | 34647.87 | 68.12 | 74.13 |
+| 128 | 0.98 | 914.33 | 124675.48 | 142365.73 | 42858.29 | 53433.67 | 88.17 | 92.8 |
+
+
+# vLLM Benchmark Baseline
+
+2 * 8 * H20 96GB
+
+## Image
+
+```Dockerfile
+FROM vllm/vllm-openai:v0.9.0.1
+RUN pip3 install pyarrow pandas
+```
+
+## Command
+```bash
+vllm serve /data/models/deepseek-ai/DeepSeek-R1 \
+  --served-model-name deepseek-ai/DeepSeek-R1 \
+  --block-size 128 \
+  --max-model-len 4096 \
+  --max-num-batched-tokens 4096 \
+  --no-enable-prefix-caching \
+  --enable-chunked-prefill \
+  --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256]}' \
+  -tp 16 \
+  --disable-log-requests \
+  --enable-reasoning \
+  --reasoning-parser deepseek_r1
+```
+
+## Benchmark Result (ISL:OSL=3000:150)
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms)  | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.21 | 31.28 | 4794.41 | 4841.15 | 315.73 | 318.74 | 30.06 | 30.39 |
+| 2 | 0.35 | 52.1 | 5756.44 | 5870.18 | 459.21 | 612.92 | 35.55 | 37.18 |
+| 4 | 0.61 | 91.37 | 6561.53 | 7169.04 | 561.26 | 1171.5 | 40.27 | 45.18 |
+| 8 | 1 | 150.35 | 7968.12 | 9226.66 | 675.11 | 2264.11 | 48.95 | 51.44 |
+| 16 | 1.45 | 217.66 | 10997.11 | 14324.16 | 875.6 | 4363.43 | 67.93 | 71.48 |
+| 32 | 1.94 | 290.69 | 16448.69 | 23940.38 | 1271.29 | 8635.51 | 101.86 | 107.46 |
+| 64 | 2.39 | 358.77 | 26629.59 | 42937.18 | 1872.85 | 17588.34 | 166.15 | 175.1 |
+| 128 | 2.77 | 416.2 | 45830.93 | 79375.69 | 2976.7 | 35364.6 | 287.61 | 302.65 |
+
+## Benchmark Result (ISL:OSL=1000:1000)
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms)  | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.03 | 33.32 | 30010.56 | 30202.48 | 158.64 | 167.93 | 29.88 | 30.07 |
+| 2 | 0.06 | 57.96 | 34507.17 | 34743.53 | 233.92 | 305.74 | 34.31 | 34.59 |
+| 4 | 0.11 | 110 | 36362.91 | 37012.77 | 370.24 | 453.67 | 36.03 | 36.76 |
+| 8 | 0.2 | 204.05 | 39201.31 | 39719.73 | 486.31 | 846.82 | 38.75 | 39.37 |
+| 16 | 0.35 | 352.11 | 45430.89 | 46466 | 615.43 | 1620.85 | 44.86 | 45.54 |
+| 32 | 0.59 | 593.36 | 53908.9 | 56004.3 | 788.2 | 3114.6 | 53.17 | 54.67 |
+| 64 | 0.93 | 932.26 | 68607.6 | 73112.96 | 1035.6 | 5904.52 | 67.64 | 68.83 |
+| 128 | 1.44 | 1441.79 | 88679.14 | 98840.42 | 1426 | 11664.79 | 87.34 | 89.41 |
+
+## Benchmark Result (ISL:OSL=500:1000)
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms)  | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.03 | 33.3 | 30031.68 | 30295.86 | 151.6 | 158.35 | 29.91 | 30.18 |
+| 2 | 0.06 | 58.08 | 34431.96 | 34660.79 | 239.67 | 462.21 | 34.23 | 34.52 |
+| 4 | 0.11 | 111.58 | 35846.46 | 36235.7 | 235.6 | 314.28 | 35.65 | 36.09 |
+| 8 | 0.21 | 207.91 | 38476.3 | 39121.09 | 370.98 | 470.8 | 38.14 | 38.98 |
+| 16 | 0.36 | 363.93 | 43958.6 | 44262.81 | 550.2 | 886.11 | 43.45 | 43.9 |
+| 32 | 0.62 | 620.9 | 51526.32 | 52619.08 | 644.86 | 1675 | 50.93 | 52.19 |
+| 64 | 1.01 | 1008.83 | 63417.48 | 65812.21 | 818.42 | 3342.4 | 62.66 | 63.16 |
+| 128 | 1.62 | 1620.67 | 78928.23 | 83811.97 | 1052.66 | 6221.44 | 77.95 | 80.24 |
diff --git a/helm-charts/deepseek-r1-lws/perf_deepseek-r1_pr1333.md b/helm-charts/deepseek-r1-lws/perf_deepseek-r1_pr1333.md
@@ -0,0 +1,94 @@
+# Dynamo 1P1D
+
+4 * 8 * H20 96GB
+
+## Image
+
+```Dockerfile
+# git clone https://github.com/ai-dynamo/dynamo.git
+# git checkout v0.3.0
+# git cherry-pick f7b3ccfcb2f1afa5a0b3a0d0489ed3bff3b1789c
+# ./container/build.sh --make-efa --release-build
+FROM ccr.ccs.tencentyun.com/tke-ai-playbook/dynamo:v0.3.0-pr1333
+COPY --from=vllm/vllm-openai:v0.9.0.1 /vllm-workspace/examples/online_serving/multi-node-serving.sh /workspace/multi-node-serving.sh
+```
+
+## Config
+
+```yaml
+Common:
+  model: /data/models/deepseek-ai/DeepSeek-R1
+  block-size: 128
+  max-model-len: 4096
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+  tensor-parallel-size: 16
+  max-num-batched-tokens: 4096
+
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: false
+  compilation-config: '{"cudagraph_capture_sizes": [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256]}'
+  enable-reasoning: true
+  reasoning-parser: deepseek_r1
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '16'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config, tensor-parallel-size]
+
+PrefillWorker:
+  enforce-eager: true
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '16'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config, tensor-parallel-size, max-num-batched-tokens, enforce-eager]
+```
+
+## Benchmark Result (ISL:OSL=3000:150)
+
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms)  | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.15 | 22.06 | 6798.05 | 7706.13 | 596.85 | 1490.01 | 41.62 | 41.73 |
+| 2 | 0.33 | 43.49 | 5833.42 | 6811.13 | 605.16 | 971.57 | 40.47 | 41.01 |
+| 4 | 0.58 | 85.85 | 6886.52 | 7730.57 | 666.96 | 1547.9 | 41.99 | 42.43 |
+| 8 | 0.96 | 143.8 | 8082.22 | 10174.44 | 839.25 | 2907.49 | 48.76 | 49.69 |
+| 16 | 1.6 | 239.94 | 9634.47 | 14303.2 | 955.28 | 5576.56 | 58.25 | 59.41 |
+| 32 | 2.53 | 378.92 | 12177.73 | 20423.12 | 1978.06 | 10643.02 | 68.66 | 72.41 |
+| 64 | 2.6 | 390.39 | 23603.91 | 31825.9 | 12804.34 | 21492.83 | 72.48 | 82.57 |
+| 128 | 2.71 | 405.99 | 45051.74 | 52616.73 | 34351.46 | 41339.98 | 71.93 | 77.02 |
+
+## Benchmark Result (ISL:OSL=1000:1000)
+
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms)  | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.02 | 23.71 | 42170.47 | 42532.82 | 346.16 | 540.49 | 41.87 | 42.25 |
+| 2 | 0.05 | 47.13 | 38409.82 | 41789.88 | 346.84 | 540.01 | 40.01 | 41.49 |
+| 4 | 0.1 | 95.12 | 42020.87 | 42699.99 | 357.34 | 892.35 | 41.71 | 41.86 |
+| 8 | 0.17 | 167.58 | 46592.75 | 48272.2 | 420.44 | 1655.74 | 46.76 | 47.19 |
+| 16 | 0.28 | 283.87 | 56210.15 | 58758.89 | 535.07 | 3186.66 | 55.73 | 58.4 |
+| 32 | 0.53 | 528.38 | 59547.24 | 64284.12 | 723.85 | 6226.31 | 59.28 | 59.92 |
+| 64 | 0.87 | 858.62 | 72790.39 | 84330.61 | 1082.1 | 12513.69 | 72.42 | 73.82 |
+| 128 | 1.36 | 1345.37 | 92156.25 | 114674.5 | 1828.51 | 24682.93 | 91.57 | 96.94 |
+
+## Benchmark Result (ISL:OSL=500:1000)
+
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms)  | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.02 | 23.74 | 42124.64 | 42227.86 | 300.86 | 305.41 | 41.87 | 41.97 |
+| 2 | 0.06 | 49.83 | 36081.04 | 40183.97 | 311.43 | 496.72 | 39.82 | 40.78 |
+| 4 | 0.11 | 90.2 | 34319.53 | 41391.73 | 340 | 872.66 | 40.8 | 41.09 |
+| 8 | 0.17 | 157.53 | 45864.78 | 49424.88 | 417.54 | 1664.12 | 47.99 | 49.29 |
+| 16 | 0.29 | 276.96 | 53183.63 | 60674.81 | 554.4 | 3281.8 | 54.98 | 58.96 |
+| 32 | 0.54 | 500.75 | 56291.53 | 66380.54 | 744.87 | 6446.54 | 59.65 | 61.68 |
+| 64 | 0.91 | 873.08 | 68062.95 | 81116.36 | 1152.33 | 12504.93 | 69.55 | 71.23 |
+| 128 | 1.47 | 1441.31 | 84769.19 | 110952.12 | 1905.81 | 25269.48 | 84.59 | 86.33 |
diff --git a/helm-charts/deepseek-r1-lws/perf_deepseek-r1_vllm-baseline_tp16_dp1.md b/helm-charts/deepseek-r1-lws/perf_deepseek-r1_vllm-baseline_tp16_dp1.md
@@ -0,0 +1,62 @@
+# vLLM Benchmark Baseline
+
+1 nodes, use 2 * 8 * H20 96GB
+
+## Image
+
+```Dockerfile
+FROM vllm/vllm-openai:v0.9.0.1
+RUN pip3 install pyarrow pandas
+```
+
+## Command
+```bash
+vllm serve /data/models/deepseek-ai/DeepSeek-R1 \
+  --served-model-name deepseek-ai/DeepSeek-R1 \
+  --block-size 128 \
+  --max-model-len 4096 \
+  --max-num-batched-tokens 4096 \
+  --no-enable-prefix-caching \
+  --enable-chunked-prefill \
+  --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256]}' \
+  -tp 16 \
+  --disable-log-requests \
+  --enable-reasoning \
+  --reasoning-parser deepseek_r1
+```
+
+## Benchmark Result (ISL:OSL=3000:150)
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms)  | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.21 | 31.28 | 4794.41 | 4841.15 | 315.73 | 318.74 | 30.06 | 30.39 |
+| 2 | 0.35 | 52.1 | 5756.44 | 5870.18 | 459.21 | 612.92 | 35.55 | 37.18 |
+| 4 | 0.61 | 91.37 | 6561.53 | 7169.04 | 561.26 | 1171.5 | 40.27 | 45.18 |
+| 8 | 1 | 150.35 | 7968.12 | 9226.66 | 675.11 | 2264.11 | 48.95 | 51.44 |
+| 16 | 1.45 | 217.66 | 10997.11 | 14324.16 | 875.6 | 4363.43 | 67.93 | 71.48 |
+| 32 | 1.94 | 290.69 | 16448.69 | 23940.38 | 1271.29 | 8635.51 | 101.86 | 107.46 |
+| 64 | 2.39 | 358.77 | 26629.59 | 42937.18 | 1872.85 | 17588.34 | 166.15 | 175.1 |
+| 128 | 2.77 | 416.2 | 45830.93 | 79375.69 | 2976.7 | 35364.6 | 287.61 | 302.65 |
+
+## Benchmark Result (ISL:OSL=1000:1000)
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms)  | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.03 | 33.32 | 30010.56 | 30202.48 | 158.64 | 167.93 | 29.88 | 30.07 |
+| 2 | 0.06 | 57.96 | 34507.17 | 34743.53 | 233.92 | 305.74 | 34.31 | 34.59 |
+| 4 | 0.11 | 110 | 36362.91 | 37012.77 | 370.24 | 453.67 | 36.03 | 36.76 |
+| 8 | 0.2 | 204.05 | 39201.31 | 39719.73 | 486.31 | 846.82 | 38.75 | 39.37 |
+| 16 | 0.35 | 352.11 | 45430.89 | 46466 | 615.43 | 1620.85 | 44.86 | 45.54 |
+| 32 | 0.59 | 593.36 | 53908.9 | 56004.3 | 788.2 | 3114.6 | 53.17 | 54.67 |
+| 64 | 0.93 | 932.26 | 68607.6 | 73112.96 | 1035.6 | 5904.52 | 67.64 | 68.83 |
+| 128 | 1.44 | 1441.79 | 88679.14 | 98840.42 | 1426 | 11664.79 | 87.34 | 89.41 |
+
+## Benchmark Result (ISL:OSL=500:1000)
+| Concurrency | Request Throughput (req/s) | Output Token Throughput (tok/s) | Mean E2E Latency (ms)  | P99 E2E Latency (ms) | Mean TTFT (ms) | P99 TTFT (ms) | Mean TPOT (ms) | P99 TPOT (ms) |
+|:-----------:|:--------------------------:|:-------------------------------:|:-----------------------:|:---------------------:|:--------------:|:-------------:|:-------------:|:------------:|
+| 1 | 0.03 | 33.3 | 30031.68 | 30295.86 | 151.6 | 158.35 | 29.91 | 30.18 |
+| 2 | 0.06 | 58.08 | 34431.96 | 34660.79 | 239.67 | 462.21 | 34.23 | 34.52 |
+| 4 | 0.11 | 111.58 | 35846.46 | 36235.7 | 235.6 | 314.28 | 35.65 | 36.09 |
+| 8 | 0.21 | 207.91 | 38476.3 | 39121.09 | 370.98 | 470.8 | 38.14 | 38.98 |
+| 16 | 0.36 | 363.93 | 43958.6 | 44262.81 | 550.2 | 886.11 | 43.45 | 43.9 |
+| 32 | 0.62 | 620.9 | 51526.32 | 52619.08 | 644.86 | 1675 | 50.93 | 52.19 |
+| 64 | 1.01 | 1008.83 | 63417.48 | 65812.21 | 818.42 | 3342.4 | 62.66 | 63.16 |
+| 128 | 1.62 | 1620.67 | 78928.23 | 83811.97 | 1052.66 | 6221.44 | 77.95 | 80.24 |