Add vLLM (#17756)

--------- Co-authored-by: Kyle-Neale <[email protected]> Co-authored-by: Steven Yuen <[email protected]>
DataDog · Jun 26, 2024 · be9ce36 · be9ce36
1 parent fb9fe43
commit be9ce36
Show file tree

Hide file tree

Showing 32 changed files with 1,807 additions and 0 deletions.
diff --git a/.codecov.yml b/.codecov.yml
@@ -678,6 +678,10 @@ coverage:
         target: 75
         flags:
         - fluxcd
+      vLLM:
+        target: 75
+        flags:
+        - vllm
       vSphere:
         target: 75
         flags:
@@ -1479,6 +1483,11 @@ flags:
     paths:
     - vertica/datadog_checks/vertica
     - vertica/tests
+  vllm:
+    carryforward: true
+    paths:
+    - vllm/datadog_checks/vllm
+    - vllm/tests
   voltdb:
     carryforward: true
     paths:

diff --git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml
@@ -483,6 +483,8 @@ integration/vault:
 - vault/**/*
 integration/vertica:
 - vertica/**/*
+integration/vllm:
+- vllm/**/*
 integration/voltdb:
 - voltdb/**/*
 integration/vsphere:

diff --git a/.github/workflows/test-all.yml b/.github/workflows/test-all.yml
@@ -3488,6 +3488,25 @@ jobs:
       test-py3: ${{ inputs.test-py3 }}
       minimum-base-package: ${{ inputs.minimum-base-package }}
     secrets: inherit
+  jeae0788:
+    uses: ./.github/workflows/test-target.yml
+    with:
+      job-name: vLLM
+      target: vllm
+      platform: linux
+      runner: '["ubuntu-22.04"]'
+      repo: "${{ inputs.repo }}"
+      python-version: "${{ inputs.python-version }}"
+      standard: ${{ inputs.standard }}
+      latest: ${{ inputs.latest }}
+      agent-image: "${{ inputs.agent-image }}"
+      agent-image-py2: "${{ inputs.agent-image-py2 }}"
+      agent-image-windows: "${{ inputs.agent-image-windows }}"
+      agent-image-windows-py2: "${{ inputs.agent-image-windows-py2 }}"
+      test-py2: ${{ inputs.test-py2 }}
+      test-py3: ${{ inputs.test-py3 }}
+      minimum-base-package: ${{ inputs.minimum-base-package }}
+    secrets: inherit
   jb120af1:
     uses: ./.github/workflows/test-target.yml
     with:

diff --git a/vllm/CHANGELOG.md b/vllm/CHANGELOG.md
@@ -0,0 +1,4 @@
+# CHANGELOG - vLLM
+
+<!-- towncrier release notes start -->
+
diff --git a/vllm/README.md b/vllm/README.md
@@ -0,0 +1,54 @@
+# Agent Check: vLLM
+
+## Overview
+
+This check monitors [vLLM][1] through the Datadog Agent.
+
+## Setup
+
+Follow the instructions below to install and configure this check for an Agent running on a host. 
+
+### Installation
+
+The vLLM check is included in the [Datadog Agent][2] package.
+No additional installation is needed on your server.
+
+### Configuration
+
+1. Edit the `vllm.d/conf.yaml` file, in the `conf.d/` folder at the root of your Agent's configuration directory, to start collecting your vLLM performance data. See the [sample vllm.d/conf.yaml][3] for all available configuration options.
+
+2. [Restart the Agent][4].
+
+### Validation
+
+[Run the Agent's status subcommand][5] and look for `vllm` under the Checks section.
+
+## Data Collected
+
+### Metrics
+
+See [metadata.csv][6] for a list of metrics provided by this integration.
+
+### Events
+
+The vLLM integration does not include any events.
+
+### Service Checks
+
+The vLLM integration includes service checks for the health of the vLLM server and its OpenMetrics endpoint.
+
+See [service_checks.json][7] for a list of service checks provided by this integration.
+
+## Troubleshooting
+
+Need help? Contact [Datadog support][9].
+
+
+[1]: https://docs.vllm.ai/en/stable/
+[2]: https://app.datadoghq.com/account/settings/agent/latest
+[3]: https://github.com/DataDog/integrations-core/blob/master/vllm/datadog_checks/vllm/data/conf.yaml.example
+[4]: https://docs.datadoghq.com/agent/guide/agent-commands/#start-stop-and-restart-the-agent
+[5]: https://docs.datadoghq.com/agent/guide/agent-commands/#agent-status-and-information
+[6]: https://github.com/DataDog/integrations-core/blob/master/vllm/metadata.csv
+[7]: https://github.com/DataDog/integrations-core/blob/master/vllm/assets/service_checks.json
+[9]: https://docs.datadoghq.com/help/
diff --git a/vllm/assets/configuration/spec.yaml b/vllm/assets/configuration/spec.yaml
@@ -0,0 +1,16 @@
+name: vLLM
+files:
+- name: vllm.yaml
+  options:
+  - template: init_config
+    options:
+    - template: init_config/openmetrics
+  - template: instances
+    options:
+    - template: instances/openmetrics
+      overrides:
+        openmetrics_endpoint.required: true
+        openmetrics_endpoint.value.example: http://localhost:8000/metrics
+        openmetrics_endpoint.description: |
+          Endpoint exposing the vLLM's Prometheus metrics. For more information refer to:
+          https://docs.vllm.ai/en/stable/serving/metrics.html
diff --git a/vllm/assets/service_checks.json b/vllm/assets/service_checks.json
@@ -0,0 +1,32 @@
+[
+    {
+        "agent_version": "7.56.0",
+        "integration": "vLLM",
+        "check": "vllm.openmetrics.health",
+        "statuses": [
+            "ok",
+            "critical"
+        ],
+        "groups": [
+            "host",
+            "endpoint"
+        ],
+        "name": "vLLM OpenMetrics endpoint health",
+        "description": "Returns `CRITICAL` if the Agent is unable to connect to the vLLM OpenMetrics endpoint, otherwise returns `OK`."
+    },
+    {
+        "agent_version": "7.56.0",
+        "integration": "vLLM",
+        "check": "vllm.health.status",
+        "statuses": [
+            "ok",
+            "warning",
+            "critical"
+        ],
+        "groups": [
+            "host"
+        ],
+        "name": "Health Status of vLLM",
+        "description": "Returns `CRITICAL` if the server responds with a 4xx or 5xx status code, `OK` if the response is 200, and `UNKNOWN` for everything else."
+    }
+]
diff --git a/vllm/changelog.d/17756.added b/vllm/changelog.d/17756.added
@@ -0,0 +1 @@
+Initial Release
diff --git a/vllm/datadog_checks/__init__.py b/vllm/datadog_checks/__init__.py
@@ -0,0 +1,4 @@
+# (C) Datadog, Inc. 2024-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+__path__ = __import__('pkgutil').extend_path(__path__, __name__)  # type: ignore
diff --git a/vllm/datadog_checks/vllm/__about__.py b/vllm/datadog_checks/vllm/__about__.py
@@ -0,0 +1,4 @@
+# (C) Datadog, Inc. 2024-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+__version__ = '0.0.1'
diff --git a/vllm/datadog_checks/vllm/__init__.py b/vllm/datadog_checks/vllm/__init__.py
@@ -0,0 +1,7 @@
+# (C) Datadog, Inc. 2024-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+from .__about__ import __version__
+from .check import vLLMCheck
+
+__all__ = ['__version__', 'vLLMCheck']
diff --git a/vllm/datadog_checks/vllm/check.py b/vllm/datadog_checks/vllm/check.py
@@ -0,0 +1,49 @@
+# (C) Datadog, Inc. 2024-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+from datadog_checks.base import AgentCheck, OpenMetricsBaseCheckV2  # noqa: F401
+
+from .metrics import METRIC_MAP, RENAME_LABELS_MAP
+
+
+class vLLMCheck(OpenMetricsBaseCheckV2):
+
+    DEFAULT_METRIC_LIMIT = 0
+    # This will be the prefix of every metric and service check the integration sends
+    __NAMESPACE__ = 'vllm'
+
+    def get_default_config(self):
+        return {
+            'metrics': [METRIC_MAP],
+            "rename_labels": RENAME_LABELS_MAP,
+        }
+
+    @AgentCheck.metadata_entrypoint
+    def _submit_version_metadata(self):
+
+        endpoint = self.instance["openmetrics_endpoint"].replace("/metrics", "/version")
+        response = self.http.get(endpoint)
+        response.raise_for_status()
+
+        data = response.json()
+        version = data.get("version", "")
+        version_split = version.split(".")
+        if len(version_split) >= 3:
+            major = version_split[0]
+            minor = version_split[1]
+            patch = version_split[2]
+
+            version_raw = f'{major}.{minor}.{patch}'
+
+            version_parts = {
+                'major': major,
+                'minor': minor,
+                'patch': patch,
+            }
+            self.set_metadata('version', version_raw, scheme='semver', part_map=version_parts)
+        else:
+            self.log.debug("Invalid vLLM version format: %s", version)
+
+    def check(self, instance):
+        super().check(instance)
+        self._submit_version_metadata()
diff --git a/vllm/datadog_checks/vllm/config_models/__init__.py b/vllm/datadog_checks/vllm/config_models/__init__.py
@@ -0,0 +1,24 @@
+# (C) Datadog, Inc. 2024-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+
+# This file is autogenerated.
+# To change this file you should edit assets/configuration/spec.yaml and then run the following commands:
+#     ddev -x validate config -s <INTEGRATION_NAME>
+#     ddev -x validate models -s <INTEGRATION_NAME>
+
+from .instance import InstanceConfig
+from .shared import SharedConfig
+
+
+class ConfigMixin:
+    """Mixin exposing the instance and shared configuration models as read-only properties.
+
+    The two backing attributes are only annotated here, not assigned; they must
+    be set elsewhere before the properties are read.
+    """
+
+    _config_model_instance: InstanceConfig
+    _config_model_shared: SharedConfig
+
+    @property
+    def config(self) -> InstanceConfig:
+        """Return the instance-level configuration model."""
+        return self._config_model_instance
+
+    @property
+    def shared_config(self) -> SharedConfig:
+        """Return the shared configuration model."""
+        return self._config_model_shared
diff --git a/vllm/datadog_checks/vllm/config_models/defaults.py b/vllm/datadog_checks/vllm/config_models/defaults.py
@@ -0,0 +1,132 @@
+# (C) Datadog, Inc. 2024-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+
+# This file is autogenerated.
+# To change this file you should edit assets/configuration/spec.yaml and then run the following commands:
+#     ddev -x validate config -s <INTEGRATION_NAME>
+#     ddev -x validate models -s <INTEGRATION_NAME>
+
+
+# Each function below returns the constant default for one configuration option.
+# NOTE(review): the `shared_` / `instance_` prefixes presumably correspond to the
+# `init_config` and `instances` sections of the configuration spec -- confirm
+# against the generated SharedConfig / InstanceConfig models.
+def shared_skip_proxy():
+    return False
+
+
+def shared_timeout():
+    return 10
+
+
+def instance_allow_redirects():
+    return True
+
+
+def instance_auth_type():
+    return 'basic'
+
+
+def instance_cache_metric_wildcards():
+    return True
+
+
+def instance_cache_shared_labels():
+    return True
+
+
+def instance_collect_counters_with_distributions():
+    return False
+
+
+def instance_collect_histogram_buckets():
+    return True
+
+
+def instance_disable_generic_tags():
+    return False
+
+
+def instance_empty_default_hostname():
+    return False
+
+
+def instance_enable_health_service_check():
+    return True
+
+
+def instance_histogram_buckets_as_distributions():
+    return False
+
+
+def instance_ignore_connection_errors():
+    return False
+
+
+def instance_kerberos_auth():
+    return 'disabled'
+
+
+def instance_kerberos_delegate():
+    return False
+
+
+def instance_kerberos_force_initiate():
+    return False
+
+
+def instance_log_requests():
+    return False
+
+
+def instance_min_collection_interval():
+    return 15
+
+
+def instance_non_cumulative_histogram_buckets():
+    return False
+
+
+def instance_persist_connections():
+    return False
+
+
+def instance_request_size():
+    return 16
+
+
+def instance_skip_proxy():
+    return False
+
+
+def instance_tag_by_endpoint():
+    return True
+
+
+def instance_telemetry():
+    return False
+
+
+def instance_timeout():
+    return 10
+
+
+def instance_tls_ignore_warning():
+    return False
+
+
+def instance_tls_use_host_header():
+    return False
+
+
+def instance_tls_verify():
+    return True
+
+
+def instance_use_latest_spec():
+    return False
+
+
+def instance_use_legacy_auth_encoding():
+    return True
+
+
+def instance_use_process_start_time():
+    return False
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		# CHANGELOG - vLLM

		<!-- towncrier release notes start -->