Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add query to get pod labels and generate report by class labels #106

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions openshift_metrics/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,13 +141,22 @@ def main():
condensed_metrics_dict = processor.condense_metrics(
["cpu_request", "memory_request", "gpu_request", "gpu_type"]
)

utils.write_metrics_by_namespace(
condensed_metrics_dict=condensed_metrics_dict,
file_name=invoice_file,
report_month=report_month,
rates=rates,
ignore_hours=ignore_hours,
)
utils.write_metrics_by_classes(
condensed_metrics_dict=condensed_metrics_dict,
file_name=invoice_file,
report_month=report_month,
rates=rates,
namespaces_with_classes=["rhods-notebooks"],
ignore_hours=ignore_hours,
)
utils.write_metrics_by_pod(condensed_metrics_dict, pod_report_file, ignore_hours)

if args.upload_to_s3:
Expand Down
24 changes: 23 additions & 1 deletion openshift_metrics/metrics_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def __init__(

def merge_metrics(self, metric_name, metric_list):
"""Merge metrics (cpu, memory, gpu) by pod"""

for metric in metric_list:
pod = metric["metric"]["pod"]
namespace = metric["metric"]["namespace"]
Expand All @@ -34,6 +33,11 @@ def merge_metrics(self, metric_name, metric_list):
self.merged_data.setdefault(namespace, {})
self.merged_data[namespace].setdefault(pod, {"metrics": {}})

if metric_name == "cpu_request":
class_name = metric["metric"].get("label_nerc_mghpcc_org_class")
if class_name is not None:
self.merged_data[namespace][pod]["label_nerc_mghpcc_org_class"] = class_name

gpu_type, gpu_resource, node_model = self._extract_gpu_info(
metric_name, metric
)
Expand Down Expand Up @@ -193,3 +197,21 @@ def insert_node_labels(node_labels: list, resource_request_metrics: list) -> lis
"machine"
)
return resource_request_metrics

@staticmethod
def insert_pod_labels(pod_labels: list, resource_request_metrics: list) -> list:
    """Insert the `label_nerc_mghpcc_org_class` label into resource_request_metrics.

    Args:
        pod_labels: Prometheus series whose ``"metric"`` dict carries a
            ``"pod"`` name and, optionally, ``"namespace"`` and
            ``"label_nerc_mghpcc_org_class"``.
        resource_request_metrics: Prometheus series whose ``"metric"`` dict
            carries a ``"pod"`` name and, optionally, ``"namespace"``.

    Returns:
        The same ``resource_request_metrics`` list, mutated in place: every
        series with a matching pod label series gets the class label copied
        into its ``"metric"`` dict. Series without a match are left untouched.
    """
    label_key = "label_nerc_mghpcc_org_class"

    # Pod names are only unique within a namespace, so key on
    # (namespace, pod) to avoid leaking a class label onto a same-named pod
    # in another namespace. A label series without a namespace is stored
    # under (None, pod) and acts as a name-only fallback.
    class_by_pod = {}
    for pod_label in pod_labels:
        labels = pod_label["metric"]
        class_by_pod[(labels.get("namespace"), labels["pod"])] = labels.get(label_key)

    for series in resource_request_metrics:
        labels = series["metric"]
        pod_name = labels["pod"]
        # Prefer the exact namespace match; fall back to a namespace-less entry.
        for key in ((labels.get("namespace"), pod_name), (None, pod_name)):
            if key in class_by_pod:
                labels[label_key] = class_by_pod[key]
                break
    return resource_request_metrics
10 changes: 9 additions & 1 deletion openshift_metrics/openshift_prometheus_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
# Resource requests reported in bytes for pods assigned to a node (node!=""),
# dropping any (pod, namespace) that also appears in kube_pod_status_unschedulable.
MEMORY_REQUEST = 'kube_pod_resource_request{unit="bytes", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable'
# Same filter as above, but for resources whose name matches the regex nvidia.com.*
GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable'
# Node label series that carry a non-empty label_nvidia_com_gpu_product label.
KUBE_NODE_LABELS = 'kube_node_labels{label_nvidia_com_gpu_product!=""}'
# Pod label series that carry a non-empty label_nerc_mghpcc_org_class label.
KUBE_POD_LABELS = 'kube_pod_labels{label_nerc_mghpcc_org_class!=""}'

def main():
"""This method kick starts the process of collecting and saving the metrics"""
Expand Down Expand Up @@ -87,10 +88,17 @@ def main():
cpu_request_metrics = prom_client.query_metric(
CPU_REQUEST, report_start_date, report_end_date
)

try:
pod_labels = prom_client.query_metric(KUBE_POD_LABELS, report_start_date, report_end_date)
metrics_dict["cpu_metrics"] = MetricsProcessor.insert_pod_labels(pod_labels, cpu_request_metrics)
except utils.EmptyResultError:
logger.info(f"No pod labels found for the period {report_start_date} to {report_end_date}")
metrics_dict["cpu_metrics"] = cpu_request_metrics

memory_request_metrics = prom_client.query_metric(
MEMORY_REQUEST, report_start_date, report_end_date
)
metrics_dict["cpu_metrics"] = cpu_request_metrics
metrics_dict["memory_metrics"] = memory_request_metrics

# because if nobody requests a GPU then we will get an empty set
Expand Down
74 changes: 74 additions & 0 deletions openshift_metrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,3 +227,77 @@ def write_metrics_by_pod(condensed_metrics_dict, file_name, ignore_hours=None):
rows.append(pod_obj.generate_pod_row(ignore_hours))

csv_writer(rows, file_name)

def write_metrics_by_classes(condensed_metrics_dict, file_name, report_month, rates, namespaces_with_classes, ignore_hours=None):
    """
    Process metrics dictionary to aggregate usage by the class label.

    Only namespaces listed in `namespaces_with_classes` are reported. If a pod
    has a class label, then the project name is composed of namespace:class_name
    otherwise it's namespace:noclass.

    The report is written with `csv_writer` to a file named like `file_name`
    with its base name prefixed by "By-class-"; any directory component of
    `file_name` is preserved so the report lands next to the invoice file.
    """
    # Local import so this function does not depend on the module's import block.
    import os

    headers = [
        "Invoice Month",
        "Project - Allocation",
        "Project - Allocation ID",
        "Manager (PI)",
        "Invoice Email",
        "Invoice Address",
        "Institution",
        "Institution - Specific Code",
        "SU Hours (GBhr or SUhr)",
        "SU Type",
        "Rate",
        "Cost",
    ]
    rows = [headers]
    invoices = {}

    for namespace, pods in condensed_metrics_dict.items():
        if namespace not in namespaces_with_classes:
            continue

        for pod, pod_dict in pods.items():
            class_name = pod_dict.get("label_nerc_mghpcc_org_class")
            # Pods without a (non-empty) class label are pooled under "noclass".
            if class_name:
                project_name = f"{namespace}:{class_name}"
            else:
                project_name = f"{namespace}:noclass"

            # One invoice per namespace:class pair, created on first use.
            if project_name not in invoices:
                invoices[project_name] = invoice.ProjectInvoce(
                    invoice_month=report_month,
                    project=project_name,
                    project_id=project_name,
                    pi="",
                    invoice_email="",
                    invoice_address="",
                    intitution="",
                    institution_specific_code="",
                    rates=rates,
                    ignore_hours=ignore_hours,
                )
            project_invoice = invoices[project_name]

            for epoch_time, pod_metric_dict in pod_dict["metrics"].items():
                pod_obj = invoice.Pod(
                    pod_name=pod,
                    namespace=project_name,
                    start_time=epoch_time,
                    duration=pod_metric_dict["duration"],
                    cpu_request=Decimal(pod_metric_dict.get("cpu_request", 0)),
                    gpu_request=Decimal(pod_metric_dict.get("gpu_request", 0)),
                    # The raw memory request is divided by 2**30 before invoicing.
                    memory_request=Decimal(pod_metric_dict.get("memory_request", 0)) / 2**30,
                    gpu_type=pod_metric_dict.get("gpu_type"),
                    gpu_resource=pod_metric_dict.get("gpu_resource"),
                    node_hostname=pod_metric_dict.get("node"),
                    node_model=pod_metric_dict.get("node_model"),
                )
                project_invoice.add_pod(pod_obj)

    for project_invoice in invoices.values():
        rows.extend(project_invoice.generate_invoice_rows(report_month))

    # Prefix only the base name: prefixing the whole string would turn
    # "out/invoice.csv" into the bogus path "By-class-out/invoice.csv".
    directory, base_name = os.path.split(file_name)
    csv_writer(rows, os.path.join(directory, f"By-class-{base_name}"))