You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Please add a general-purpose Windows performance counter plugin. The following is the code we are using on thousands of Windows machines; it is very robust. Importantly, it uses the win32pdh module, which encapsulates the Windows Performance Data Helper (PDH) API.
#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
"""Windows Performance Counter monitoring plugin with multithreading support and native SQLite threading."""
import argparse
import sys
from typing import Dict, List, Tuple
import time
import threading
from queue import Queue
from concurrent.futures import ThreadPoolExecutor, as_completed
import sqlite3
import tempfile
import os
import lib.base
import lib.db_sqlite
from lib.globals import STATE_OK, STATE_UNKNOWN, STATE_WARN, STATE_CRIT
class CounterError(Exception):
    """Root of the counter exception hierarchy.

    Besides the message, an instance carries the counter path and a numeric
    error code; both default to ``None``.
    """

    def __init__(self, message: str, counter_path: str = None, error_code: int = None):
        super().__init__(message)
        self.error_code = error_code
        self.counter_path = counter_path
class CounterNotFoundError(CounterError):
    """Signals that a counter path could not be found."""
class CounterAccessError(CounterError):
    """Signals that a counter exists but could not be accessed."""
class CounterTypeError(CounterError):
    """Signals a problem while detecting a counter's type."""
class DatabaseError(Exception):
    """Error raised for database-related failures.

    ``sql_error`` optionally keeps the text of the underlying SQL error.
    """

    def __init__(self, message: str, sql_error: str = None):
        super().__init__(message)
        self.sql_error = sql_error
# The PDH bindings come from pywin32; report a missing installation through
# the plugin's own error helper instead of an import traceback.
try:
    import win32pdh
    import pywintypes
except ImportError:
    lib.base.cu('Python module "pywin32" is not installed.')
__author__ = 'xxxxx'
__version__ = '2024033103'

DESCRIPTION = """Monitors specified Windows Performance Counters using multithreading and native SQLite threading."""

DEFAULT_COUNT = 5
DEFAULT_WARN = 80
DEFAULT_CRIT = 90
MAX_WORKERS = 4  # Maximum number of worker threads
DEFAULT_PDH_TIMEOUT = 30  # seconds

# Windows performance counter type codes as defined in winperf.h.
# Each value is the OR of the size, type, sub-type, timer, delta and display
# fields of the corresponding PERF_* macro. Several of these previously shared
# one value, which made distinct counter types indistinguishable.
PERF_COUNTER_COUNTER = 0x10410400
PERF_COUNTER_BULK_COUNT = 0x10410500
PERF_100NSEC_TIMER = 0x20510500
PERF_COUNTER_TIMER = 0x20410500
PERF_PRECISION_100NS_TIMER = 0x20570500
PERF_PRECISION_SYSTEM_TIMER = 0x20470500
PERF_SAMPLE_COUNTER = 0x00410400
PERF_SAMPLE_FRACTION = 0x20C20400
PERF_COUNTER_NODATA = 0x40000200
class DatabaseManager:
    """Lock-guarded access to the SQLite file that keeps recent counter samples.

    A single connection, opened with ``check_same_thread=False``, is shared by
    all threads. Writers hold ``_write_lock`` directly; readers go through
    ``_acquire_read`` / ``_release_read`` (also exposed via ``with manager:``).
    """

    MAX_DB_SIZE = 60 * 1024 * 1024  # 60MB; larger files are vacuumed

    def __init__(self, db_path: str):
        """Open ``db_path`` and configure WAL journaling and a 5 s busy timeout."""
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.conn.execute('PRAGMA journal_mode=WAL')
        self.conn.execute('PRAGMA busy_timeout=5000')
        self._write_lock = threading.Lock()
        self._read_lock = threading.RLock()
        self._reader_count = 0
        self._reader_count_lock = threading.Lock()

    def __enter__(self):
        self._acquire_read()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._release_read()

    def initialize_table(self):
        """Create the performance data table and its index if they don't exist."""
        with self._write_lock:
            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS perfdata (
                    counter TEXT NOT NULL,
                    value REAL NOT NULL,
                    timestamp INTEGER DEFAULT (strftime('%s', 'now'))
                )
            ''')
            self.conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_counter_timestamp
                ON perfdata(counter, timestamp)
            ''')
            self.conn.commit()

    def _acquire_read(self):
        """Register a reader; the first reader also takes the write lock.

        Every reader additionally acquires ``_read_lock``; because that is an
        ``RLock``, readers on different threads still run one at a time.
        """
        with self._reader_count_lock:
            self._reader_count += 1
            if self._reader_count == 1:
                self._write_lock.acquire()
        try:
            self._read_lock.acquire()
        except BaseException:
            # Undo the bookkeeping so a failed acquire cannot block writers forever.
            with self._reader_count_lock:
                self._reader_count -= 1
                if self._reader_count == 0:
                    self._write_lock.release()
            raise

    def _release_read(self):
        """Unregister a reader; the last reader releases the write lock."""
        self._read_lock.release()
        with self._reader_count_lock:
            self._reader_count -= 1
            if self._reader_count == 0:
                self._write_lock.release()

    def insert_value(self, counter: str, value: float):
        """Insert a single counter value and commit."""
        with self._write_lock:
            self.conn.execute(
                'INSERT INTO perfdata (counter, value) VALUES (?, ?)',
                (counter, value)
            )
            self.conn.commit()

    def check_db_size(self):
        """Vacuum the database when the file exceeds ``MAX_DB_SIZE``.

        An ``OSError`` while inspecting the file is reported as a printed
        warning and otherwise ignored.
        """
        try:
            if os.path.exists(self.db_path):
                if os.path.getsize(self.db_path) > self.MAX_DB_SIZE:
                    self.vacuum()
        except OSError as e:
            print(f"Warning: Unable to check database size: {str(e)}")

    def bulk_insert_values(self, values: List[Tuple[str, float]]):
        """Insert several ``(counter, value)`` rows in one transaction.

        The database size is checked first. On failure the transaction is
        rolled back.

        Raises:
            DatabaseError: If the insert fails; the original exception is
                chained as ``__cause__``.
        """
        self.check_db_size()  # Check size before bulk insert
        with self._write_lock:
            try:
                self.conn.execute('BEGIN IMMEDIATE')
                self.conn.executemany(
                    'INSERT INTO perfdata (counter, value) VALUES (?, ?)',
                    values
                )
                self.conn.commit()
            except sqlite3.Error as e:
                self.conn.rollback()
                raise DatabaseError(
                    f"Database error during bulk insert: {str(e)}", sql_error=str(e)
                ) from e
            except Exception as e:
                self.conn.rollback()
                raise DatabaseError(
                    f"Unexpected error during bulk insert: {str(e)}"
                ) from e

    def get_mean_value(self, counter: str, limit: int) -> Tuple[float, int]:
        """Return the linearly weighted mean of the newest samples of a counter.

        The newest ``limit`` rows are selected, then ranked oldest-to-newest so
        that the oldest gets weight 1 and the newest the highest weight.
        ``rowid`` breaks ties between rows sharing a one-second timestamp.

        Args:
            counter (str): Counter path to query
            limit (int): Maximum number of samples to consider

        Returns:
            Tuple[float, int]: (weighted mean value, number of samples used);
            ``(0.0, 0)`` when no samples exist.

        Raises:
            DatabaseError: If the query fails.
        """
        try:
            with self:
                result = self.conn.execute('''
                    WITH recent AS (
                        SELECT value, timestamp, rowid AS rid
                        FROM perfdata
                        WHERE counter = ?
                        ORDER BY timestamp DESC, rowid DESC
                        LIMIT ?
                    ),
                    weights AS (
                        SELECT
                            value,
                            CAST(ROW_NUMBER() OVER (ORDER BY timestamp ASC, rid ASC) AS REAL) AS weight
                        FROM recent
                    )
                    SELECT
                        SUM(value * weight) / SUM(weight) AS weighted_mean,
                        COUNT(*) AS sample_count
                    FROM weights
                ''', (counter, limit)).fetchone()
            if not result:
                return 0.0, 0
            weighted_mean = result[0] if result[0] is not None else 0.0
            sample_count = result[1] if result[1] is not None else 0
            return weighted_mean, sample_count
        except sqlite3.Error as e:
            raise DatabaseError(
                f"Database error getting weighted mean for counter {counter}: {str(e)}",
                sql_error=str(e),
            ) from e

    def clean_old_data(self, counter: str, limit: int):
        """Delete all but the newest ``limit`` rows of ``counter``.

        ``rowid`` breaks timestamp ties so the rows kept here are the same ones
        ``get_mean_value`` would select.
        """
        with self._write_lock:
            try:
                self.conn.execute('''
                    DELETE FROM perfdata
                    WHERE counter = ?
                    AND rowid NOT IN (
                        SELECT rowid FROM perfdata
                        WHERE counter = ?
                        ORDER BY timestamp DESC, rowid DESC
                        LIMIT ?
                    )
                ''', (counter, counter, limit))
                self.conn.commit()
            except Exception:
                self.conn.rollback()
                raise

    def vacuum(self):
        """Run ``VACUUM`` on the database."""
        with self._write_lock:
            self.conn.execute('VACUUM')
            self.conn.commit()

    def close(self):
        """Close the database connection; calling it again is a no-op."""
        with self._write_lock:
            if self.conn:
                self.conn.close()
                self.conn = None
class CounterCache:
    """Dictionary-backed store for counter metadata, guarded by one lock."""

    def __init__(self):
        self._lock = threading.Lock()
        self._cache = {}

    def get(self, key, default=None):
        """Return the entry stored under ``key``, or ``default`` if absent."""
        with self._lock:
            found = self._cache.get(key, default)
        return found

    def set(self, key, value):
        """Store ``value`` under ``key``, replacing any previous entry."""
        with self._lock:
            self._cache[key] = value
class PDHCounter:
    """One PDH query holding exactly one counter, usable as a context manager.

    Construction opens the query, adds the counter, reads its info tuple and
    performs a priming collection. ``close()`` (or leaving the ``with`` block)
    removes the counter and closes the query.
    """

    def __init__(self, counter_path: str):
        """Open a query for ``counter_path``.

        Raises:
            CounterAccessError: On a ``win32pdh`` / ``pywintypes`` error.
            CounterError: On any other failure.
            Both carry ``counter_path`` and chain the original exception.
        """
        self.path = counter_path
        self.hquery = None
        self.hcounter = None
        self.counter_info = None
        self.counter_type = None
        self.base_sampling_interval = 0.5  # Base sampling interval
        self.max_sampling_interval = 2.0  # Maximum sampling interval
        try:
            self.hquery = win32pdh.OpenQuery()
            self.hcounter = win32pdh.AddCounter(self.hquery, counter_path)
            self.counter_info = win32pdh.GetCounterInfo(self.hcounter, True)
            # NOTE(review): index 2 is used as the PDH counter type here; the
            # pywin32 documentation appears to list dwType as the FIRST tuple
            # element - confirm which index is intended before relying on it.
            self.counter_type = self.counter_info[2]
            # Initial collection to prepare the counter
            win32pdh.CollectQueryData(self.hquery)
        except win32pdh.error as e:
            self.close()
            raise CounterAccessError(
                f"Failed to add counter {counter_path}: {str(e)}",
                counter_path=counter_path,
            ) from e
        except pywintypes.error as e:
            self.close()
            raise CounterAccessError(
                f"Windows error initializing counter {counter_path}: {str(e)}",
                counter_path=counter_path,
            ) from e
        except Exception as e:
            self.close()
            raise CounterError(
                f"Unexpected error initializing counter {counter_path}: {str(e)}",
                counter_path=counter_path,
            ) from e

    def __enter__(self):
        """Support for context manager protocol"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Cleanup when exiting context manager"""
        self.close()
        return False  # Re-raise any exceptions

    def _needs_multiple_samples(self) -> bool:
        """Return True when ``counter_type`` is one of the listed PERF_* codes."""
        try:
            multiple_sample_types = {
                PERF_COUNTER_COUNTER,
                PERF_COUNTER_BULK_COUNT,
                PERF_100NSEC_TIMER,
                PERF_COUNTER_TIMER,
                PERF_PRECISION_100NS_TIMER,
                PERF_PRECISION_SYSTEM_TIMER,
                PERF_SAMPLE_COUNTER,
                PERF_SAMPLE_FRACTION,
                PERF_COUNTER_NODATA,
            }
            return self.counter_type in multiple_sample_types
        except Exception as e:
            raise CounterError(
                f"Error determining sample type for counter {self.path}: {str(e)}",
                counter_path=self.path,
            ) from e

    def _get_initial_sampling_interval(self) -> float:
        """Pick the sleep (seconds) between the two collections in ``get_value``.

        The decision is made from two masked slices of ``counter_type``.
        NOTE(review): the masks and constants below are heuristics; they do not
        map one-to-one onto the field definitions in winperf.h - confirm before
        tuning. Any failure (e.g. ``counter_type`` is None) is printed and
        answered with ``base_sampling_interval``.
        """
        try:
            base_type = self.counter_type & 0x00F00000
            format_type = self.counter_type & 0x0000FFFF
            # Treated as timer-like: use the longest interval.
            if format_type in {0x0300, 0x0400, 0x0200}:
                return 1.0
            # Treated as rate/fraction-like.
            if base_type in {0x00200000, 0x00400000}:
                return 0.5
            # Treated as bulk counters.
            if format_type in {0x0500, 0x0600}:
                return 0.5
            # Treated as delta counters.
            if format_type in {0x0700, 0x0800}:
                return 0.5
            # Treated as instantaneous values: shortest interval.
            # (0x0200 is listed again but is already handled by the first test.)
            if format_type in {0x0100, 0x0200}:
                return 0.1
            # Default sampling interval for unknown types
            return self.base_sampling_interval
        except Exception as e:
            print(f"Error determining sampling interval for counter {self.path}: {str(e)}")
            return self.base_sampling_interval

    def get_value(self, retries=3) -> float:
        """Collect two samples and return the formatted counter value.

        Each attempt collects, sleeps for the sampling interval, collects
        again and formats the value as a double with ``PDH_FMT_NOCAP100``.
        The result is clamped to 0..100 only when the masked counter type is
        in the set below AND the path contains ``%``. On a ``win32pdh`` error
        the attempt is retried after a growing delay.

        Raises:
            TimeoutError: When an attempt starts more than
                ``DEFAULT_PDH_TIMEOUT`` seconds after the first one.
            CounterAccessError: When the last attempt fails, or when
                ``retries`` allows no attempt at all.
        """
        started = time.time()
        sampling_interval = self._get_initial_sampling_interval()
        for attempt in range(retries):
            try:
                if time.time() - started > DEFAULT_PDH_TIMEOUT:
                    raise TimeoutError(f"Counter collection timed out after {DEFAULT_PDH_TIMEOUT} seconds")
                # First collection
                win32pdh.CollectQueryData(self.hquery)
                time.sleep(sampling_interval)
                # Second collection
                win32pdh.CollectQueryData(self.hquery)
                # Always use NOCAP100 to get raw values
                _type_flag, val = win32pdh.GetFormattedCounterValue(
                    self.hcounter,
                    win32pdh.PDH_FMT_DOUBLE | win32pdh.PDH_FMT_NOCAP100
                )
                masked_type = self.counter_type & 0x00F00000
                is_percentage_counter = masked_type in {
                    0x00200000,
                    0x00400000,
                    0x00800000,
                } and '%' in self.path
                if is_percentage_counter:
                    # Only cap actual percentage counters
                    return min(100.0, max(0.0, float(val)))
                # For all other counters, return the raw value
                return float(val)
            except win32pdh.error as e:
                # The exception may expose either naming scheme; reading a
                # missing attribute here would mask the real PDH failure.
                code = getattr(e, 'error_code', getattr(e, 'winerror', None))
                text = getattr(e, 'error_message', getattr(e, 'strerror', str(e)))
                last_error = f"PDH Error: {text} (code: {code})"
                if attempt == retries - 1:
                    raise CounterAccessError(
                        f"Failed to collect counter {self.path}: {last_error}",
                        counter_path=self.path,
                        error_code=code,
                    ) from e
                time.sleep(0.5 * (attempt + 1))  # Progressive delay between retries
        # Only reachable when retries <= 0; fail loudly instead of returning None.
        raise CounterAccessError(
            f"Failed to collect counter {self.path}: no collection attempt was made",
            counter_path=self.path,
        )

    def close(self):
        """Remove the counter and close the query; failures are only printed."""
        if hasattr(self, 'hcounter') and self.hcounter:
            try:
                win32pdh.RemoveCounter(self.hcounter)
            except Exception as e:
                print(f"Warning: Failed to remove counter {self.path}: {str(e)}")
            finally:
                self.hcounter = None
        if hasattr(self, 'hquery') and self.hquery:
            try:
                win32pdh.CloseQuery(self.hquery)
            except Exception as e:
                print(f"Warning: Failed to close query for counter {self.path}: {str(e)}")
            finally:
                self.hquery = None
def handle_counter_error(e: Exception, counter_path: str = None) -> Tuple[str, int]:
    """Translate an exception into a ``(message, state)`` pair.

    Every branch returns ``STATE_UNKNOWN``; only the message differs.

    Args:
        e: The exception to describe.
        counter_path: Counter the error relates to. Used for PDH errors and as
            a fallback when a ``CounterError`` carries no path of its own.

    Returns:
        Tuple[str, int]: Human-readable message and plugin state.
    """
    if isinstance(e, win32pdh.error):
        # Accept either attribute name, and normalise to unsigned 32-bit so a
        # negative signed status can still match the hexadecimal literals.
        error_code = getattr(e, 'error_code', None)
        if error_code is None:
            error_code = getattr(e, 'winerror', None)
        if isinstance(error_code, int):
            error_code &= 0xFFFFFFFF
        # NOTE(review): the code-to-message pairing is kept as found; the
        # symbolic names it was originally annotated with do not appear to
        # match pdhmsg.h - verify each pairing against that header.
        pdh_messages = {
            0xC0000BB8: f"Counter not found: {counter_path}",
            0x800007D5: f"Counter not found: {counter_path}",
            0xC0000BBA: f"No data available for counter: {counter_path}",
            0xC0000BC6: f"Invalid counter path: {counter_path}",
            0xC0000BBD: f"Counter returned negative value: {counter_path}",
        }
        if error_code in pdh_messages:
            return pdh_messages[error_code], STATE_UNKNOWN
        return f"PDH error {error_code}: {str(e)}", STATE_UNKNOWN
    if isinstance(e, sqlite3.Error):
        return f"Database error: {str(e)}", STATE_UNKNOWN
    if isinstance(e, CounterError):
        # Prefer the path stored on the exception, then the caller's; with
        # neither, the exception text is more useful than "...: None".
        path = e.counter_path or counter_path
        if path is None:
            return str(e), STATE_UNKNOWN
        if isinstance(e, CounterNotFoundError):
            return f"Counter not found: {path}", STATE_UNKNOWN
        if isinstance(e, CounterAccessError):
            return f"Cannot access counter: {path}", STATE_UNKNOWN
        if isinstance(e, CounterTypeError):
            return f"Counter type error: {path}", STATE_UNKNOWN
        return str(e), STATE_UNKNOWN
    return f"Unexpected error: {str(e)}", STATE_UNKNOWN
def get_counter_type_worker(path: str, cache: CounterCache) -> str:
    """Classify a counter as ``'TEXT'``, ``'PERCENT'`` or ``'NUMBER'``.

    A cached answer is returned when present. Otherwise the counter is opened
    and classified from its info tuple and from textual hints in its name and
    path; the result is cached. Any failure is printed and answered with
    ``'NUMBER'`` (which is not cached).

    Args:
        path: Counter path to classify.
        cache: Store providing ``get(key)`` and ``set(key, value)``.
    """
    cached = cache.get(path)
    if cached:
        return cached
    try:
        with PDHCounter(path) as counter:
            counter_info = win32pdh.GetCounterInfo(counter.hcounter, True)
        # NOTE(review): indices 2 and 1 are read as "type" and "display name";
        # the pywin32 documentation appears to put dwType at index 0 - confirm.
        counter_type = counter_info[2]
        display_name = str(counter_info[1] or '')
        haystack = display_name.lower() + str(path).lower()
        percent_hints = ('%', 'percent', 'pct', ' ratio ', '/100')
        if counter_type == 0x00000B00:  # PERF_COUNTER_TEXT (winperf.h)
            result = 'TEXT'
        elif (
            counter_type & 0x20C20400 == 0x20C20400  # PERF_SAMPLE_FRACTION bits
            or any(hint in haystack for hint in percent_hints)
        ):
            result = 'PERCENT'
        else:
            result = 'NUMBER'
        cache.set(path, result)
        return result
    except Exception as e:
        print(f"Error in get_counter_type for {path}: {str(e)}")
        return 'NUMBER'
def get_wildcard_counter_path(partial_path: str) -> str:
    r"""Turn a partial counter path into a wildcard path for expansion.

    Handled shapes (after stripping leading/trailing backslashes):

    * ``% Processor Time``              -> ``\*\% Processor Time``
    * ``Processor\% Processor Time``    -> ``\Processor(*)\% Processor Time``
    * ``\Processor(_Total)\% Processor Time`` and anything with three or more
      components are returned unchanged.

    Args:
        partial_path: Counter name, object\counter pair, or full path.

    Returns:
        str: The wildcard path, or ``partial_path`` itself when it is already
        specific.
    """
    parts = partial_path.strip('\\').split('\\')
    if len(parts) == 1:
        # Just a counter name: wildcard the object.
        return f"\\*\\{parts[0]}"
    if len(parts) == 2:
        obj_name, counter_name = parts
        # An object that already names an instance must not get a second one.
        if obj_name.endswith(')') and '(' in obj_name:
            return partial_path
        # The separator before the counter name is required for a valid path.
        return f"\\{obj_name}(*)\\{counter_name}"
    return partial_path
def expand_counter_path(wildcard_path: str) -> List[str]:
    """Expand a wildcard counter path to all matching paths.

    Returns:
        List[str]: Whatever ``win32pdh.ExpandCounterPath`` yields, or an empty
        list when it yields nothing or fails. Failures are deliberately
        swallowed (best effort), but interpreter-exit signals such as
        ``KeyboardInterrupt`` are no longer caught.
    """
    try:
        paths = win32pdh.ExpandCounterPath(wildcard_path)
    except Exception:
        return []
    return paths if paths else []
def get_counter_object_names() -> Dict[str, str]:
    """Map lower-cased counter object names to their original spelling.

    The names come from ``win32pdh.EnumObjects``. The mapping enables
    case-insensitive lookups; no translation between languages is performed.

    Returns:
        Dict[str, str]: ``{name.lower(): name}``, or an empty dict when the
        enumeration fails (best effort; ``KeyboardInterrupt`` and friends are
        not swallowed).
    """
    try:
        objects = win32pdh.EnumObjects(None, None, 0)
    except Exception:
        return {}
    return {obj.lower(): obj for obj in objects}
def get_counter_names_for_object(object_name: str) -> Dict[str, Tuple[str, List[str]]]:
    """List the counters and instances of a performance object.

    Args:
        object_name: Name of the performance object to enumerate.

    Returns:
        Dict[str, Tuple[str, List[str]]]: ``{counter.lower(): (counter,
        instances)}``. For an object without instances the list is ``['']``.
        An empty dict is returned (after printing the reason) on any failure.
    """
    try:
        counters, instances = win32pdh.EnumObjectItems(
            None, None, object_name, win32pdh.PERF_DETAIL_WIZARD)
        # Objects without instances yield an empty list; use one placeholder.
        if not instances:
            instances = ['']
        return {counter.lower(): (counter, instances) for counter in counters}
    except win32pdh.error as e:
        # Do not unpack e.args into a fixed number of names: if the tuple has a
        # different length, the unpack itself would raise inside this handler.
        error_code = getattr(e, 'winerror', e.args[0] if e.args else None)
        error_message = getattr(e, 'strerror', e.args[-1] if len(e.args) > 1 else str(e))
        print(f"Error getting counter names for object '{object_name}': ({error_code}, '{error_message}')")
        return {}
    except Exception as e:
        print(f"Unexpected error getting counter names for object '{object_name}': {str(e)}")
        return {}
def _candidate_counter_paths(obj_name: str, counter_name: str, instances: List[str]) -> List[str]:
    """Build the counter paths to probe for one object/counter pair, best first."""
    named = [inst for inst in instances if inst]  # drop the '' placeholder
    candidates = []
    if named:
        if '_Total' in named:
            candidates.append(f"\\{obj_name}(_Total)\\{counter_name}")
        candidates.append(f"\\{obj_name}(*)\\{counter_name}")
        candidates.append(f"\\{obj_name}({named[0]})\\{counter_name}")
    # Always try the instance-less form as a last resort.
    candidates.append(f"\\{obj_name}\\{counter_name}")
    return candidates


def _first_existing_path(candidates: List[str]):
    """Return the first candidate accepted by check_counter_exists, else None."""
    for candidate in candidates:
        if check_counter_exists(candidate):
            return candidate
    return None


def _matching_counters(counters: Dict[str, Tuple[str, List[str]]], wanted: str):
    """Yield (counter, instances) entries matching ``wanted``: exact key first, then substring hits."""
    if wanted in counters:
        yield counters[wanted]
    for key, entry in counters.items():
        if key != wanted and wanted in key:
            yield entry


def resolve_counter_path(counter_path: str) -> str:
    r"""Resolve a loosely specified counter path to one that exists locally.

    A path that starts with a backslash and already exists is returned as is.
    Otherwise the path is split on backslashes and handled by component count:

    1. counter only: every object is searched for a matching counter;
    2. object and counter: the object is matched by name (exact match
       preferred over substring), then its counters are searched;
    3. object, instance and counter: a fixed set of combinations is probed.

    Args:
        counter_path: Full or partial counter path.

    Returns:
        The first candidate path that ``check_counter_exists`` accepts, or
        ``None`` when nothing matches or an error occurs (the error is
        printed).
    """
    try:
        if counter_path.startswith('\\') and check_counter_exists(counter_path):
            return counter_path
        parts = counter_path.strip('\\').split('\\')
        objects = get_counter_object_names()

        if len(parts) == 1:
            wanted_counter = parts[0].lower()
            for obj_name in objects.values():
                try:
                    counters = get_counter_names_for_object(obj_name)
                    for counter_orig, instances in _matching_counters(counters, wanted_counter):
                        found = _first_existing_path(
                            _candidate_counter_paths(obj_name, counter_orig, instances))
                        if found:
                            return found
                except Exception:
                    # Best effort: one unreadable object must not stop the search.
                    continue

        elif len(parts) == 2:
            obj_name, counter_name = parts
            wanted_obj = obj_name.lower()
            # Prefer the exact object; a substring match could otherwise pick
            # a longer, unrelated object name that merely contains it.
            matched_obj = objects.get(wanted_obj)
            if matched_obj is None:
                matched_obj = next(
                    (orig for key, orig in objects.items() if wanted_obj in key), None)
            if matched_obj:
                try:
                    counters = get_counter_names_for_object(matched_obj)
                    for counter_orig, instances in _matching_counters(counters, counter_name.lower()):
                        found = _first_existing_path(
                            _candidate_counter_paths(matched_obj, counter_orig, instances))
                        if found:
                            return found
                except Exception:
                    pass

        elif len(parts) == 3:
            obj_name, instance, counter_name = parts
            found = _first_existing_path([
                f"\\{obj_name}({instance})\\{counter_name}",
                f"\\{obj_name}(*)\\{counter_name}",
                f"\\{obj_name}\\_Total\\{counter_name}",
                f"\\{obj_name}\\{counter_name}",
            ])
            if found:
                return found
        return None
    except Exception as e:
        print(f"Error resolving path '{counter_path}': {str(e)}")
        return None
def validate_counter_paths_parallel(counter_paths: List[str]) -> Dict[str, str]:
    """Validate counter paths concurrently.

    Each requested path is tried as given, then via wildcard expansion
    (preferring an expansion containing ``_Total``, else the first one), then
    via ``resolve_counter_path``.

    Args:
        counter_paths: Requested counter paths.

    Returns:
        Dict[str, str]: Requested path -> usable path, or ``None`` for paths
        that could not be validated. Empty input gives an empty dict.
    """
    def validate_single_path(path: str) -> Tuple[str, str]:
        try:
            # First check if the path is already valid
            if check_counter_exists(path):
                return path, path
            # Try wildcard expansion first
            expanded_paths = expand_counter_path(get_wildcard_counter_path(path))
            if expanded_paths:
                # Prefer _Total or first available instance
                for expanded in expanded_paths:
                    if '_Total' in expanded and check_counter_exists(expanded):
                        return path, expanded
                return path, expanded_paths[0]
            # If no wildcard match, try resolution
            resolved_path = resolve_counter_path(path)
            if resolved_path and check_counter_exists(resolved_path):
                return path, resolved_path
            return path, None
        except Exception:
            # Best effort: an unexpected failure marks only this path invalid.
            return path, None

    if not counter_paths:
        return {}
    # The executor replaces a hand-written queue/thread loop that could block
    # forever in Queue.join() if a worker died before calling task_done().
    worker_count = min(MAX_WORKERS, len(counter_paths))
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        return dict(pool.map(validate_single_path, counter_paths))
def check_counter_exists(path: str) -> bool:
    """Report whether a counter path can be added to a query and collected.

    A temporary query is opened, the counter is added, one collection is
    made, and the counter is removed; the query is always closed.

    Returns:
        bool: True when all steps succeed, False on any error (best effort;
        ``KeyboardInterrupt`` and ``SystemExit`` are not swallowed).
    """
    try:
        query = win32pdh.OpenQuery()
        try:
            counter = win32pdh.AddCounter(query, path)
            win32pdh.CollectQueryData(query)
            win32pdh.RemoveCounter(counter)
            return True
        finally:
            win32pdh.CloseQuery(query)
    except Exception:
        return False
def collect_single_counter(path: str, type_cache: CounterCache) -> Tuple[str, float, str, str]:
    """Sample one counter and classify it.

    Returns:
        ``(path, value, counter_type, None)`` on success, or
        ``(path, None, None, message)`` when anything goes wrong, the message
        coming from ``handle_counter_error``.
    """
    try:
        with PDHCounter(path) as counter:
            lowered = path.lower()
            # Paths mentioning both "processor" and "time" get an extra second
            # before sampling so the CPU counter can initialize.
            if "processor" in lowered and "time" in lowered:
                time.sleep(1.0)
            value = counter.get_value()
            counter_type = get_counter_type_worker(path, type_cache)
            return path, value, counter_type, None
    except Exception as e:
        # PDH errors, CounterError subclasses and everything else are all
        # reported the same way.
        msg, _state = handle_counter_error(e, path)
        return path, None, None, msg
def collect_and_store_counter_data(counter_paths: List[str], db_manager: DatabaseManager) -> Tuple[Dict[str, float], List[str], Dict[str, str], Dict[str, str]]:
    """Validate, sample and persist the requested counters.

    Args:
        counter_paths: Counter paths as requested by the user.
        db_manager: Store that receives the sampled values in one bulk insert.

    Returns:
        A 4-tuple, all keyed by the REQUESTED path:
        ``values`` (sampled value), ``missing_counters`` (paths that could not
        be validated or sampled), ``adjusted_counters`` (requested path ->
        path actually used, when different) and ``counter_types``.

    Raises:
        ValueError: When ``counter_paths`` is empty.
        CounterError: When a ``CounterError`` escapes collection.
        Exception: For any other failure, including storage failures.
    """
    values = {}
    missing_counters = []
    adjusted_counters = {}
    counter_types = {}
    if not counter_paths:
        msg, _state = handle_counter_error(ValueError("No counter paths provided"))
        raise ValueError(msg)
    try:
        valid_paths = validate_counter_paths_parallel(counter_paths)
        # Several requested paths may resolve to the same counter; remember
        # all of them so none is dropped and the counter is sampled once.
        requested_by_valid = {}
        for requested, resolved in valid_paths.items():
            if resolved:
                requested_by_valid.setdefault(resolved, []).append(requested)
            else:
                missing_counters.append(requested)
        if not requested_by_valid:
            return {}, missing_counters, {}, {}
        type_cache = CounterCache()
        batch_values = []
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # Map each future to the path it was submitted for, so a failing
            # future is attributed to the right counter.
            submitted = {
                executor.submit(collect_single_counter, valid_path, type_cache): valid_path
                for valid_path in requested_by_valid
            }
            for future in as_completed(submitted):
                valid_path = submitted[future]
                requested_paths = requested_by_valid[valid_path]
                try:
                    _path, value, ctype, error = future.result()
                except Exception:
                    missing_counters.extend(requested_paths)
                    continue
                if error:
                    missing_counters.extend(requested_paths)
                elif value is not None:
                    for orig_path in requested_paths:
                        values[orig_path] = value
                        counter_types[orig_path] = ctype
                        if valid_path != orig_path:
                            adjusted_counters[orig_path] = valid_path
                        batch_values.append((orig_path, value))
        if batch_values:
            try:
                db_manager.bulk_insert_values(batch_values)
            except Exception as e:
                msg, _state = handle_counter_error(DatabaseError(f"Failed to store counter values: {str(e)}"))
                raise DatabaseError(msg) from e
        return values, missing_counters, adjusted_counters, counter_types
    except CounterError as e:
        msg, _state = handle_counter_error(e)
        raise CounterError(msg) from e
    except Exception as e:
        msg, _state = handle_counter_error(e)
        raise Exception(msg) from e
def parse_args():
    """Parse and validate the command line.

    Besides parsing, this checks that ``--count`` is at least 1 and that each
    warning/critical pair is consistent. Violations are reported through
    ``lib.base.cu``.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-V', '--version',
                        action='version',
                        version='{0}: v{1} by {2}'.format('%(prog)s', __version__, __author__))
    parser.add_argument('--always-ok',
                        help='Always returns OK.',
                        dest='ALWAYS_OK',
                        action='store_true',
                        default=False)
    parser.add_argument('--count',
                        help='Number of measurements to calculate mean value from. Default: %(default)s',
                        dest='COUNT',
                        type=int,
                        default=DEFAULT_COUNT)
    # Default thresholds for percentage-type counters
    parser.add_argument('--percent-critical',
                        help='Critical threshold for percentage counters. Default: %(default)s',
                        dest='PERCENT_CRIT',
                        type=float,
                        default=DEFAULT_CRIT)
    parser.add_argument('--percent-warning',
                        help='Warning threshold for percentage counters. Default: %(default)s',
                        dest='PERCENT_WARN',
                        type=float,
                        default=DEFAULT_WARN)
    # Default thresholds for numeric-type counters
    parser.add_argument('--number-critical',
                        help='Critical threshold for numeric counters. Default: %(default)s',
                        dest='NUMBER_CRIT',
                        type=float,
                        default=None)
    parser.add_argument('--number-warning',
                        help='Warning threshold for numeric counters. Default: %(default)s',
                        dest='NUMBER_WARN',
                        type=float,
                        default=None)
    parser.add_argument('--perfcounter',
                        help='Comma-separated list of performance counter paths to monitor',
                        dest='PERFCOUNTER',
                        type=str,
                        required=True)
    args = parser.parse_args()
    # COUNT is used as an SQL LIMIT when pruning history: 0 would delete every
    # stored sample and a negative value disables the limit altogether.
    if args.COUNT < 1:
        lib.base.cu(f"Invalid --count value {args.COUNT}: must be at least 1")
    # Validate percentage thresholds
    try:
        validate_thresholds(args.PERCENT_WARN, args.PERCENT_CRIT)
    except ValueError as e:
        lib.base.cu(f"Invalid percentage thresholds: {str(e)}")
    # Validate numeric thresholds if provided
    if args.NUMBER_WARN is not None or args.NUMBER_CRIT is not None:
        try:
            validate_thresholds(args.NUMBER_WARN, args.NUMBER_CRIT)
        except ValueError as e:
            lib.base.cu(f"Invalid numeric thresholds: {str(e)}")
    return args
def validate_thresholds(warn: float, crit: float, counter_type: str = None):
    """Check a warning/critical pair for consistency.

    Nothing is checked unless BOTH thresholds are given. For ``'PERCENT'``
    counters each threshold must lie in 0..100; for every other type neither
    may be negative. In all cases warning must not exceed critical.

    Raises:
        ValueError: When one of the rules above is violated.
    """
    if warn is None or crit is None:
        return
    pair = (warn, crit)
    if counter_type == 'PERCENT':
        if any(not (0 <= threshold <= 100) for threshold in pair):
            raise ValueError("Percentage thresholds must be between 0 and 100")
    elif any(threshold < 0 for threshold in pair):
        raise ValueError("Thresholds cannot be negative")
    if warn > crit:
        raise ValueError("Warning threshold cannot be greater than critical")
def get_threshold_state(value: float, warn: float, crit: float, counter_type: str = None) -> Tuple[int, str]:
    """Compare a value with its thresholds.

    With no thresholds at all the result is ``(STATE_OK, "")``. Values of
    ``'PERCENT'`` counters are clamped to 0..100 before comparison. The
    critical threshold is tested before the warning threshold; a threshold is
    reached when the value is greater than or equal to it.

    Returns:
        Tuple[int, str]: State and status tag (``"[CRITICAL]"``,
        ``"[WARNING]"`` or ``""``).
    """
    if warn is None and crit is None:
        return STATE_OK, ""
    if counter_type == 'PERCENT':
        value = min(100, max(0, value))
    checks = (
        (crit, STATE_CRIT, "[CRITICAL]"),
        (warn, STATE_WARN, "[WARNING]"),
    )
    for threshold, state, tag in checks:
        if threshold is not None and value >= threshold:
            return state, tag
    return STATE_OK, ""
def format_perfdata(name: str, value: float, counter_type: str, warn: float = None, crit: float = None) -> str:
    """Build the perfdata string for one counter via ``lib.base.get_perfdata``.

    ``'PERCENT'`` counters are reported with unit ``%`` and a maximum of 100;
    all other counters have no unit and no maximum. The minimum is always 0.
    """
    is_percent = counter_type == 'PERCENT'
    unit = '%' if is_percent else ''
    max_val = 100 if is_percent else None
    return lib.base.get_perfdata(name, value, unit, warn, crit, 0, max_val)
def format_counter_value(name: str, value: float) -> str:
    """Render a counter value with a precision chosen from its name.

    Rules, first match wins (name matching is case-insensitive except ``%``):

    * contains ``sec/``: scientific notation below 0.001, else 4 decimals
    * contains ``bytes``: no decimals
    * contains ``/sec``: 2 decimals
    * contains ``%``: 1 decimal
    * otherwise by magnitude: >=100 none, >=10 one, else two decimals
    """
    lowered = name.lower()
    if 'sec/' in lowered:
        return f"{value:.2e}" if value < 0.001 else f"{value:.4f}"
    if 'bytes' in lowered:
        return f"{value:.0f}"
    if '/sec' in lowered:
        return f"{value:.2f}"
    if '%' in name:
        return f"{value:.1f}"
    if value >= 100:
        decimals = 0
    elif value >= 10:
        decimals = 1
    else:
        decimals = 2
    return f"{value:.{decimals}f}"
def get_threshold_summary(counter_types: Dict[str, str], args) -> str:
    """Describe which thresholds apply to the collected counters.

    A counter family (percentage / numeric) is mentioned only when at least
    one counter of that type was collected AND both its warning and critical
    thresholds are set on ``args``.
    """
    seen_types = set(counter_types.values())
    summaries = []
    percent_set = args.PERCENT_WARN is not None and args.PERCENT_CRIT is not None
    if 'PERCENT' in seen_types and percent_set:
        summaries.append(f"percentage counters (warn={args.PERCENT_WARN}% crit={args.PERCENT_CRIT}%)")
    number_set = args.NUMBER_WARN is not None and args.NUMBER_CRIT is not None
    if 'NUMBER' in seen_types and number_set:
        summaries.append(f"numeric counters (warn={args.NUMBER_WARN} crit={args.NUMBER_CRIT})")
    if summaries:
        return "Performance Metrics - thresholds for " + ", ".join(summaries)
    return "Performance Metrics (no thresholds defined)"
def main():
    """Collect the requested counters, evaluate thresholds and print the result.

    Steps:
      1. Parse the command line and split --perfcounter into counter paths.
      2. Open the SQLite database in the system temp directory.
      3. Sample all counters and store the values.
      4. Trim old samples per counter and vacuum the database if it is large.
      5. Compute the mean of the last `args.COUNT` samples per counter, apply
         the percent/number thresholds and build the grouped message and
         perfdata.

    Missing or invalid counters are listed in the output but do not change
    the state. Results and errors are reported through `lib.base.oao` and
    `lib.base.cu` (presumably both terminate the process - confirm in lib.base).
    """
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)
    except Exception as e:
        lib.base.cu(f'Error parsing arguments: {str(e)}')

    # Drop blank entries (e.g. from a trailing comma or an empty argument).
    # Without the filter ''.split(',') yields [''] and the emptiness check
    # below could never trigger.
    counter_paths = [
        path for path in (x.strip() for x in args.PERFCOUNTER.split(',')) if path
    ]
    if not counter_paths:
        lib.base.cu('No counter paths provided')

    msg = ''
    perfdata = ''
    state = STATE_OK
    db_manager = None

    try:
        # Get the temporary directory
        temp_dir = tempfile.gettempdir()
        if not os.path.exists(temp_dir):
            raise Exception(f"Temporary directory {temp_dir} does not exist")

        db_filename = 'linuxfabrik-monitoring-plugins-perfcounter.db'
        db_path = os.path.join(temp_dir, db_filename)

        try:
            db_manager = DatabaseManager(db_path)
            db_manager.initialize_table()
        except sqlite3.Error as e:
            raise DatabaseError(f"Failed to initialize database: {str(e)}") from e

        try:
            # Use size-based check:
            db_manager.check_db_size()

            counter_values, missing_counters, adjusted_counters, counter_types = collect_and_store_counter_data(
                counter_paths, db_manager)

            # If all counters are missing, keep OK state but provide clear message
            if len(missing_counters) == len(counter_paths):
                msg = "All specified performance counters are invalid or inaccessible:\n"
                msg += "\n".join(f" {counter}" for counter in missing_counters)
                lib.base.oao(msg, STATE_OK, perfdata, always_ok=args.ALWAYS_OK)
                return
        except (DatabaseError, CounterError):
            # Keep the specific type so the dedicated handlers below are used
            # instead of the generic "Unexpected error" path.
            raise
        except Exception as e:
            raise Exception(f"Failed to collect counter data: {str(e)}") from e

        # Clean old data for each counter that was successfully collected
        for counter in counter_values:
            db_manager.clean_old_data(counter, args.COUNT)

        # Vacuum based on file size instead of random chance
        try:
            db_size = os.path.getsize(db_path)
            if db_size > 100 * 1024 * 1024:  # 100MB
                db_manager.vacuum()
        except OSError:
            # Best effort: an unreadable file size only skips the vacuum.
            pass

        msg_parts = []

        # Add threshold summary as first line
        msg_parts.append(get_threshold_summary(counter_types, args))

        # Show missing counters first if there are any, but don't change state
        if missing_counters:
            msg_parts.append("\nMissing or invalid counters:")
            msg_parts.extend(f" {counter}" for counter in missing_counters)

        if adjusted_counters:
            msg_parts.append("\nAdjusted counter paths:")
            msg_parts.extend(f" \"{orig}\" -> \"{adj}\"" for orig, adj in adjusted_counters.items())

        # Group counters by type
        percent_counters = []
        number_counters = []
        text_counters = []
        insufficient_data_warnings = []

        for counter in counter_paths:
            if counter in missing_counters:
                continue

            # NOTE(review): only the last path component is used as label, so
            # two instances of the same counter (e.g. Processor(0) and
            # Processor(1)) share one perfdata label - confirm this is intended.
            counter_name = counter.split('\\')[-1]
            mean_value, count = db_manager.get_mean_value(counter, args.COUNT)

            if count < args.COUNT:
                insufficient_data_warnings.append(
                    f" {counter_name}: using {count}/{args.COUNT} values for mean calculation"
                )

            # Counters without a detected type are treated as plain numbers.
            counter_type = counter_types.get(counter, 'NUMBER')

            if counter_type == 'PERCENT':
                current_state, status = get_threshold_state(
                    mean_value, args.PERCENT_WARN, args.PERCENT_CRIT)
                formatted_value = format_counter_value(counter_name, mean_value)
                percent_counters.append((counter_name, formatted_value, status))
                state = max(state, current_state)
                perfdata += lib.base.get_perfdata(
                    counter_name,
                    mean_value,
                    '%',
                    args.PERCENT_WARN,
                    args.PERCENT_CRIT,
                    0,
                    100
                )
            elif counter_type == 'NUMBER':
                current_state, status = get_threshold_state(
                    mean_value, args.NUMBER_WARN, args.NUMBER_CRIT)
                formatted_value = format_counter_value(counter_name, mean_value)
                number_counters.append((counter_name, formatted_value, status))
                state = max(state, current_state)
                perfdata += lib.base.get_perfdata(
                    counter_name,
                    mean_value,
                    '',
                    args.NUMBER_WARN,
                    args.NUMBER_CRIT,
                    0,
                    None
                )
            else:  # TEXT type: shown as-is, no thresholds and no perfdata
                formatted_value = str(mean_value)
                text_counters.append((counter_name, formatted_value, ""))

        # Format grouped output
        if insufficient_data_warnings:
            msg_parts.append("\nInsufficient data for some counters:")
            msg_parts.extend(insufficient_data_warnings)

        if percent_counters:
            msg_parts.append("\nPercentage Counters:")
            for name, value, status in percent_counters:
                msg_parts.append(f" {name}: {value}% {status}")

        if number_counters:
            msg_parts.append("\nNumeric Counters:")
            for name, value, status in number_counters:
                msg_parts.append(f" {name}: {value} {status}")

        if text_counters:
            msg_parts.append("\nText Counters:")
            for name, value, _ in text_counters:
                msg_parts.append(f" {name}: {value}")

        msg = '\n'.join(msg_parts)

    except DatabaseError as e:
        lib.base.cu(f"Database error: {str(e)}")
    except CounterError as e:
        lib.base.cu(f"Counter error: {str(e)}")
    except Exception as e:
        lib.base.cu(f"Unexpected error: {str(e)}")
    finally:
        # Always release the database connection, including on early exits.
        if db_manager:
            try:
                db_manager.close()
            except Exception as e:
                print(f"Warning: Failed to close database connection: {str(e)}")

    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)
# Script entry point: run the check and hand any exception that escapes
# main() to lib.base.cu.
if __name__ == '__main__':
    try:
        main()
    except Exception as exc:
        error_text = str(exc)
        lib.base.cu(f'Unexpected error: {error_text}')
```
Also the director json:
```
{
"Command": {
"cmd-check-perfcounter-windows": {
"arguments": {
"--always-ok": {
"set_if": "$perfcounter_windows_always_ok$"
},
"--count": {
"value": "$perfcounter_windows_count$"
},
"--percent-critical": {
"value": "$perfcounter_windows_percent_critical$"
},
"--percent-warning": {
"value": "$perfcounter_windows_percent_warning$"
},
"--number-critical": {
"value": "$perfcounter_windows_number_critical$"
},
"--number-warning": {
"value": "$perfcounter_windows_number_warning$"
},
"--perfcounter": {
"value": "$perfcounter_windows_perfcounter$"
}
},
"command": "C:\\ProgramData\\icinga2\\usr\\lib64\\nagios\\plugins\\perfcounter.exe",
"disabled": false,
"fields": [
{
"datafield_id": 1,
"is_required": "n",
"var_filter": null
},
{
"datafield_id": 2,
"is_required": "n",
"var_filter": null
},
{
"datafield_id": 3,
"is_required": "n",
"var_filter": null
},
{
"datafield_id": 4,
"is_required": "n",
"var_filter": null
},
{
"datafield_id": 5,
"is_required": "n",
"var_filter": null
},
{
"datafield_id": 6,
"is_required": "n",
"var_filter": null
},
{
"datafield_id": 7,
"is_required": "y",
"var_filter": null
}
],
"imports": [],
"is_string": null,
"methods_execute": "PluginCheck",
"object_name": "cmd-check-perfcounter-windows",
"object_type": "object",
"timeout": "10",
"vars": {},
"zone": null,
"uuid": "c830d8ca-20a4-4fd8-bfdf-3ae25f947f96"
}
},
"ServiceTemplate": {
"tpl-service-perfcounter-windows": {
"action_url": null,
"apply_for": null,
"assign_filter": null,
"check_command": "cmd-check-perfcounter-windows",
"check_interval": 60,
"check_period": null,
"check_timeout": null,
"command_endpoint": null,
"disabled": false,
"display_name": null,
"enable_active_checks": null,
"enable_event_handler": null,
"enable_flapping": null,
"enable_notifications": true,
"enable_passive_checks": null,
"enable_perfdata": null,
"event_command": null,
"fields": [],
"flapping_threshold_high": null,
"flapping_threshold_low": null,
"groups": [],
"host": null,
"icon_image": "perfcounter.png",
"icon_image_alt": null,
"imports": [
"tpl-service-generic"
],
"max_check_attempts": 5,
"notes": "Checks Windows performance counters. Supports monitoring multiple counter paths and calculates mean values from specified number of measurements.",
"notes_url": "https://github.com/Linuxfabrik/monitoring-plugins/tree/main/check-plugins/perfcounter",
"object_name": "tpl-service-perfcounter-windows",
"object_type": "template",
"retry_interval": 15,
"service_set": null,
"template_choice": null,
"use_agent": null,
"use_var_overrides": null,
"vars": {
"criticality": "C",
"perfcounter_windows_always_ok": false,
"perfcounter_windows_count": 1,
"perfcounter_windows_percent_warning": 80,
"perfcounter_windows_percent_critical": 90,
"perfcounter_windows_number_warning": null,
"perfcounter_windows_number_critical": null
},
"volatile": null,
"zone": null,
"uuid": "abcd3b45-fff0-49f8-8065-19d824ed0d7f"
}
},
"Datafield": {
"1": {
"varname": "perfcounter_windows_always_ok",
"caption": "Perfcounter: Always OK?",
"description": "Always returns OK.",
"datatype": "Icinga\\Module\\Director\\DataType\\DataTypeBoolean",
"format": null,
"settings": {},
"uuid": "3f4cd078-e8c6-4c2b-b4fe-449ebc9d0b01"
},
"2": {
"varname": "perfcounter_windows_count",
"caption": "Perfcounter: Count",
"description": "Number of measurements to calculate mean value from.",
"datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
"format": null,
"settings": {
"visibility": "visible"
},
"uuid": "582ad4b0-9399-47e1-82a4-2db938828528"
},
"3": {
"varname": "perfcounter_windows_percent_critical",
"caption": "Perfcounter: Percent Critical",
"description": "Critical threshold for percentage counters.",
"datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
"format": null,
"settings": {
"visibility": "visible"
},
"uuid": "146363e7-2655-49c7-96e0-42f5449da430"
},
"4": {
"varname": "perfcounter_windows_percent_warning",
"caption": "Perfcounter: Percent Warning",
"description": "Warning threshold for percentage counters.",
"datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
"format": null,
"settings": {
"visibility": "visible"
},
"uuid": "afe764dd-a4de-4e61-b8d6-40849a9bebc5"
},
"5": {
"varname": "perfcounter_windows_number_critical",
"caption": "Perfcounter: Number Critical",
"description": "Critical threshold for numeric counters.",
"datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
"format": null,
"settings": {
"visibility": "visible"
},
"uuid": "bfedc075-96bb-4911-97cb-efaa7022193c"
},
"6": {
"varname": "perfcounter_windows_number_warning",
"caption": "Perfcounter: Number Warning",
"description": "Warning threshold for numeric counters.",
"datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
"format": null,
"settings": {
"visibility": "visible"
},
"uuid": "cfedc075-96bb-4911-97cb-efaa7022193d"
},
"7": {
"varname": "perfcounter_windows_perfcounter",
"caption": "Perfcounter: Paths",
"description": "Comma-separated list of performance counter paths to monitor",
"datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
"format": null,
"settings": {
"visibility": "visible"
},
"uuid": "dfedc075-96bb-4911-97cb-efaa7022193e"
}
}
}
```
### Additional context
_No response_
The text was updated successfully, but these errors were encountered:
Describe the solution you'd like
Please add a general-purpose Windows performance counter plugin. The following is the code we are using on thousands of Windows machines; it is very robust. Importantly, it uses the win32pdh module, which encapsulates the Windows Performance Data Helper (PDH) API.
The text was updated successfully, but these errors were encountered: