Skip to content

Commit

Permalink
Use pySMART
Browse files Browse the repository at this point in the history
  • Loading branch information
technowhizz committed Jan 21, 2025
1 parent 915c0ab commit b5b72ba
Showing 1 changed file with 168 additions and 111 deletions.
279 changes: 168 additions & 111 deletions etc/kayobe/ansible/scripts/smartmon.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,155 +2,212 @@

import subprocess
import json
import re
from datetime import datetime

from pySMART import DeviceList

SMARTCTL_PATH = "/usr/sbin/smartctl"

# Allow-list of SMART attribute names that may be exported as metrics.
# Names are stored in lower-case snake_case; parse_if_attributes() converts each
# attribute name to that form and only emits a metric when the result is a
# member of this set.
SMARTMON_ATTRS = {
"airflow_temperature_cel",
"command_timeout",
"current_pending_sector",
"end_to_end_error",
"erase_fail_count",
"g_sense_error_rate",
"hardware_ecc_recovered",
"host_reads_32mib",
"host_reads_mib",
"host_writes_32mib",
"host_writes_mib",
"load_cycle_count",
"media_wearout_indicator",
"nand_writes_1gib",
"offline_uncorrectable",
"power_cycle_count",
"power_on_hours",
"program_fail_cnt_total",
"program_fail_count",
"raw_read_error_rate",
"reallocated_event_count",
"reallocated_sector_ct",
"reported_uncorrect",
"runtime_bad_block",
"sata_downshift_count",
"seek_error_rate",
"spin_retry_count",
"spin_up_time",
"start_stop_count",
"temperature_case",
"temperature_celsius",
"temperature_internal",
"total_lbas_read",
"total_lbas_written",
"udma_crc_error_count",
"unsafe_shutdown_count",
"unused_rsvd_blk_cnt_tot",
"wear_leveling_count",
"workld_host_reads_perc",
"workld_media_wear_indic",
"workload_minutes",
"critical_warning",
"temperature",
"available_spare",
"available_spare_threshold",
"percentage_used",
"data_units_read",
"data_units_written",
"host_reads",
"host_writes",
"controller_busy_time",
"power_cycles",
"unsafe_shutdowns",
"media_errors",
"num_err_log_entries",
"warning_temp_time",
"critical_comp_time",
}

def run_command(command, parse_json=False):
    """
    Execute ``command`` and hand back what it wrote to standard output.

    The command's exit status is not inspected. With ``parse_json`` set the
    captured output is decoded as JSON (``json.JSONDecodeError`` propagates
    to the caller); otherwise the output is returned as text with leading
    and trailing whitespace removed.
    """
    completed = subprocess.run(command, capture_output=True, text=True)
    captured = completed.stdout
    return json.loads(captured) if parse_json else captured.strip()

def parse_smartctl_attributes(disk, disk_type, serial, json_data, allowed_attrs=None):
    """
    Turn the JSON produced by ``smartctl -A -j`` into metric lines.

    Exactly one section of ``json_data`` is consulted, in this order of
    precedence: ``nvme_smart_health_information_log``,
    ``scsi_grown_defect_list``, then ``ata_smart_attributes.table``.

    Args:
        disk: Device name used for the ``disk`` label.
        disk_type: Device type used for the ``type`` label.
        serial: Serial number used for the ``serial_number`` label.
        json_data: Decoded smartctl JSON document.
        allowed_attrs: Optional collection of snake_case attribute names to
            export. Defaults to the module-level ``SMARTMON_ATTRS``.

    Returns:
        A list of strings of the form ``name{labels} value``. For ATA
        attributes up to four lines are produced per attribute
        (``_value``, ``_worst``, ``_threshold``, ``_raw_value``); a field
        that is absent from the JSON is skipped instead of being written
        with an empty value.
    """
    if allowed_attrs is None:
        allowed_attrs = SMARTMON_ATTRS

    labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial}"'
    metrics = []

    # NVMe and SCSI sections are handled as flat "name -> value" mappings.
    # The first section present wins, mirroring an if/elif chain.
    for section_key in ('nvme_smart_health_information_log', 'scsi_grown_defect_list'):
        if section_key not in json_data:
            continue
        section = json_data[section_key]
        # Guard against a section that is not a mapping (e.g. a bare number),
        # which would otherwise fail on .items().
        if isinstance(section, dict):
            for raw_name, value in section.items():
                attr_name = raw_name.replace(' ', '_').lower()
                if attr_name in allowed_attrs:
                    metrics.append(f"{attr_name}{{{labels}}} {value}")
        return metrics

    ata_section = json_data.get('ata_smart_attributes')
    if not isinstance(ata_section, dict):
        return metrics

    for attr in ata_section.get('table') or []:
        raw_name = attr.get('name')
        if not raw_name:
            continue  # an entry without a name cannot be mapped to a metric
        attr_name = raw_name.replace('-', '_').lower()
        if attr_name not in allowed_attrs:
            continue

        attr_id = attr.get('id', '')
        id_labels = f'{labels},smart_id="{attr_id}"'
        fields = (
            ('value', attr.get('value')),
            ('worst', attr.get('worst')),
            ('threshold', attr.get('thresh')),
            ('raw_value', (attr.get('raw') or {}).get('value')),
        )
        for suffix, field_value in fields:
            # A missing field would render as "name{...} " with no value,
            # which is not a valid sample line, so it is left out.
            if field_value is None:
                continue
            metrics.append(f"{attr_name}_{suffix}{{{id_labels}}} {field_value}")

    return metrics

# NOTE(review): this looks like the leftover header of the pre-pySMART
# implementation, captured from a diff view; the rest of its body appears to be
# interleaved with parse_device_info() further down, and the final return refers
# to a name (`result`) that is not assigned in this function — confirm against
# the real file before relying on it.
def parse_smartctl_info(disk, disk_type, json_data):
info = json_data.get('device', {})
smart_status = json_data.get('smart_status', {})
return result.stdout.strip()

def parse_device_info(device):
    """
    Produce Prometheus lines describing the device's identity and SMART status:
    - device_info
    - device_smart_available
    - device_smart_enabled   (only when the device is SMART capable)
    - device_smart_healthy   (only when SMART capable and an assessment exists)

    Args:
        device: Object exposing ``name``, ``interface``, ``serial``,
            ``vendor``, ``family``, ``model``, ``firmware``,
            ``smart_capable``, ``smart_enabled`` and ``assessment``.

    Returns:
        A list of strings of the form ``name{labels} value``. Attributes
        that are unset (falsy) are rendered as empty label values, and the
        same ``type`` label value is used on every line.
    """
    disk = device.name
    # Normalise once so every metric carries the same label value; formatting
    # device.interface directly would render a missing interface as "None".
    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()

    labels = {
        "disk": disk,
        "type": disk_type,
        "vendor": device.vendor or "",
        "model_family": device.family or "",
        "device_model": device.model or "",
        "serial_number": serial_number,
        "firmware_version": device.firmware or "",
    }
    label_str = ",".join(f'{k}="{v}"' for k, v in labels.items())
    # Reduced label set shared by the device_smart_* status metrics.
    status_labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial_number}"'

    metrics = [
        f'device_info{{{label_str}}} 1',
        f'device_smart_available{{{status_labels}}} {1 if device.smart_capable else 0}',
    ]

    if device.smart_capable:
        metrics.append(
            f'device_smart_enabled{{{status_labels}}} {1 if device.smart_enabled else 0}'
        )
        if device.assessment:
            is_healthy = 1 if device.assessment.upper() == "PASS" else 0
            metrics.append(f'device_smart_healthy{{{status_labels}}} {is_healthy}')

    return metrics

# Zero-width match in front of every capital letter except one at the very
# start of the string; used to turn camelCase / PascalCase into snake_case.
_CAMEL_BOUNDARY = re.compile(r'(?<!^)(?=[A-Z])')


def parse_if_attributes(device, allowed_attrs=None):
    """
    Export the numeric fields of ``device.if_attributes`` as metric lines.

    Every public, non-callable attribute found on ``device.if_attributes``
    is converted to snake_case (e.g. ``dataUnitsRead`` -> ``data_units_read``)
    and emitted when its name is allowed and its value is numeric.

    Args:
        device: Object exposing ``name``, ``interface``, ``serial`` and
            ``if_attributes``.
        allowed_attrs: Optional collection of snake_case names to export.
            Defaults to the module-level ``SMARTMON_ATTRS``.

    Returns:
        A list of strings of the form ``name{labels} value``; empty when
        ``device.if_attributes`` is unset. Boolean values are written as
        ``1``/``0`` rather than ``True``/``False``.
    """
    if allowed_attrs is None:
        allowed_attrs = SMARTMON_ATTRS

    if_attributes = device.if_attributes
    if not if_attributes:
        return []

    disk = device.name
    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()
    labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial_number}"'

    metrics = []
    # Inspect all public attributes on device.if_attributes
    for attr_name in dir(if_attributes):
        if attr_name.startswith("_"):
            continue  # skip private / special members
        val = getattr(if_attributes, attr_name, None)
        if callable(val):
            continue  # skip methods

        snake_name = _CAMEL_BOUNDARY.sub("_", attr_name).lower()
        if snake_name not in allowed_attrs:
            continue

        if isinstance(val, bool):
            # bool is a subclass of int; formatting it directly would write
            # "True"/"False", which is not a numeric sample value.
            val = int(val)
        elif not isinstance(val, (int, float)):
            continue

        metrics.append(f"{snake_name}{{{labels}}} {val}")

    return metrics

def format_output(metrics):
    """
    Convert a list of lines like "some_metric{...} value"
    into a Prometheus text output with # HELP / # TYPE lines.

    Lines are sorted, every line is prefixed with ``smartmon_``, and a
    ``# HELP`` / ``# TYPE ... gauge`` pair is written each time the metric
    name changes from the previous line. The metric name is the text before
    the label block, or before the first whitespace for a line without
    labels. Blank entries are ignored.

    Args:
        metrics: Iterable of metric line strings.

    Returns:
        The formatted text, lines joined by ``"\\n"``; an empty string when
        there is nothing to report.
    """
    output = []
    last_metric = None
    lines = sorted(line.strip() for line in metrics if line and line.strip())
    for metric in lines:
        # Cut at the label block first, then at whitespace so that an
        # unlabelled sample such as "up 1" still yields the bare name "up".
        metric_name = metric.split("{", 1)[0].split(None, 1)[0]
        if metric_name != last_metric:
            output.append(f"# HELP smartmon_{metric_name} SMART metric {metric_name}")
            output.append(f"# TYPE smartmon_{metric_name} gauge")
            last_metric = metric_name
        output.append(f"smartmon_{metric}")
    return "\n".join(output)

def _smartctl_version():
    """Return the version token from ``smartctl --version``, or "unknown"."""
    try:
        version_output = run_command([SMARTCTL_PATH, "--version"])
        if version_output.startswith("smartctl"):
            first_line = version_output.splitlines()[0]
            return first_line.split()[1]
    except Exception:
        # Best effort: the version is informational only, so any failure to
        # run or parse smartctl is reported as "unknown" rather than aborting.
        pass
    return "unknown"


def _device_active(disk_name, disk_type):
    """Return 0 if smartctl reports the device in standby or the query fails, else 1."""
    cmd = [SMARTCTL_PATH, "-n", "standby", "-d", disk_type, "-j", disk_name]
    try:
        standby_json = run_command(cmd, parse_json=True)
        if standby_json.get("power_mode", "") == "standby":
            return 0
    except Exception:
        # Unparseable output or a failure to run smartctl: assume the device
        # is inactive so it is not queried further.
        return 0
    return 1


def main():
    """
    Collect SMART metrics for every device found by pySMART and print them
    in Prometheus text format on standard output.

    For each device a ``smartctl_run`` timestamp and a ``device_active`` flag
    are always emitted; identity, status and attribute metrics are added only
    for devices that are considered active.
    """
    # Local import keeps this function self-contained.
    import time

    all_metrics = [f'smartctl_version{{version="{_smartctl_version()}"}} 1']

    dev_list = DeviceList()

    for dev in dev_list.devices:
        # NOTE(review): dev.name is handed to smartctl as the device argument
        # below — confirm pySMART yields a value smartctl accepts there.
        disk_name = dev.name
        disk_type = dev.interface or ""
        serial_number = (dev.serial or "").lower()

        # time.time() is the Unix epoch regardless of the host time zone;
        # calling .timestamp() on a naive utcnow() value would be interpreted
        # as local time and give a shifted result on non-UTC hosts.
        run_timestamp = int(time.time())
        all_metrics.append(f'smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}')

        active = _device_active(disk_name, disk_type)
        all_metrics.append(
            f'device_active{{disk="{disk_name}",type="{disk_type}",serial_number="{serial_number}"}} {active}'
        )
        if active == 0:
            continue

        all_metrics.extend(parse_device_info(dev))
        all_metrics.extend(parse_if_attributes(dev))

    print(format_output(all_metrics))


if __name__ == "__main__":
    main()

0 comments on commit b5b72ba

Please sign in to comment.