Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[perfcounter: ]A general purpose windows only performance counter plugin. #797

Open
drapiti opened this issue Jan 3, 2025 · 0 comments
Open
Assignees
Labels
enhancement New feature or request

Comments

@drapiti
Copy link

drapiti commented Jan 3, 2025

Describe the solution you'd like

Please add a general-purpose Windows performance counter plugin. The following is the code we are using on thousands of Windows machines; it has proven very robust. Importantly, it uses the win32pdh module, which encapsulates the Windows Performance Data Helper (PDH) API.

#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-

"""Windows Performance Counter monitoring plugin with multithreading support and native SQLite threading."""

import argparse
import os
import sqlite3
import sys
import tempfile
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from typing import Dict, List, Optional, Tuple

import lib.base
import lib.db_sqlite
from lib.globals import STATE_OK, STATE_UNKNOWN, STATE_WARN, STATE_CRIT


class CounterError(Exception):
    """Root of the counter exception hierarchy.

    Besides the human-readable message, an instance remembers which counter
    path was involved and the numeric error code, when the raiser supplied
    them (both default to ``None``).
    """

    def __init__(self, message: str, counter_path: str = None, error_code: int = None):
        # Initialise the Exception machinery first, then attach the extras.
        super().__init__(message)
        self.error_code = error_code
        self.counter_path = counter_path

class CounterNotFoundError(CounterError):
    """Signals that no counter matches the requested path."""

class CounterAccessError(CounterError):
    """Signals that a counter is present but could not be opened or read."""

class CounterTypeError(CounterError):
    """Signals a failure while working out what kind of counter this is."""

class DatabaseError(Exception):
    """Failure in the persistence layer.

    ``sql_error`` optionally holds the text of the underlying SQL error;
    it is ``None`` when the raiser did not provide one.
    """

    def __init__(self, message: str, sql_error: str = None):
        super().__init__(message)
        self.sql_error = sql_error

try:
    import win32pdh
    import pywintypes
except ImportError:
    lib.base.cu('Python module "pywin32" is not installed.')

# Plugin metadata; both values are interpolated into the --version output.
__author__ = 'xxxxx'
__version__ = '2024033103'

# Text shown by argparse as the program description.
DESCRIPTION = """Monitors specified Windows Performance Counters using multithreading and native SQLite threading."""

# Default for --count: number of stored measurements used for the mean.
DEFAULT_COUNT = 5
# NOTE(review): DEFAULT_WARN / DEFAULT_CRIT are not referenced in the visible
# part of the file (the percent thresholds use literal 80/90) - confirm usage.
DEFAULT_WARN = 80
DEFAULT_CRIT = 90
MAX_WORKERS = 4  # Maximum number of worker threads
# Upper bound checked by PDHCounter.get_value before each collection attempt.
DEFAULT_PDH_TIMEOUT = 30  # seconds

# Performance counter type codes as defined in the Windows SDK header
# winperf.h. The previous values were copy-paste duplicates of each other
# (e.g. PERF_COUNTER_TIMER had the value of PERF_COUNTER_COUNTER), so a
# membership test against them could never tell these types apart.
PERF_COUNTER_COUNTER = 0x10410400         # 32-bit rate, per second
PERF_COUNTER_BULK_COUNT = 0x10410500      # 64-bit rate, per second
PERF_100NSEC_TIMER = 0x20510500           # 100 ns timer, percent
PERF_COUNTER_TIMER = 0x20410500           # system timer, percent
PERF_PRECISION_100NS_TIMER = 0x20570500   # precision 100 ns timer
PERF_PRECISION_SYSTEM_TIMER = 0x20470500  # precision system timer
PERF_SAMPLE_COUNTER = 0x00410400          # sample counter
PERF_SAMPLE_FRACTION = 0x20C20400         # sample fraction, percent
PERF_COUNTER_NODATA = 0x40000200          # counter carries no data

class DatabaseManager:
    """SQLite-backed store for sampled counter values, shared between threads.

    One connection (opened with ``check_same_thread=False``) is shared by all
    threads. Writers serialise on ``_write_lock``. Readers enter through the
    context-manager protocol (``with manager:``): the first reader takes the
    write lock and the last one releases it, so no write can run while a read
    is in progress.
    """

    MAX_DB_SIZE = 60 * 1024 * 1024  # 60MB

    def __init__(self, db_path: str):
        """Open (or create) the database at *db_path* and set up the locks."""
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.conn.execute('PRAGMA journal_mode=WAL')
        self.conn.execute('PRAGMA busy_timeout=5000')
        self._write_lock = threading.Lock()
        self._read_lock = threading.RLock()
        self._reader_count = 0
        self._reader_count_lock = threading.Lock()

    def __enter__(self):
        """Enter a read section (see ``_acquire_read``)."""
        self._acquire_read()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the read section; exceptions are not suppressed."""
        self._release_read()

    def initialize_table(self):
        """Create the performance data table and its index if missing."""
        with self._write_lock:
            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS perfdata (
                    counter TEXT NOT NULL,
                    value REAL NOT NULL,
                    timestamp INTEGER DEFAULT (strftime('%s', 'now'))
                )
            ''')
            self.conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_counter_timestamp 
                ON perfdata(counter, timestamp)
            ''')
            self.conn.commit()

    def _acquire_read(self):
        """Register a reader; the first reader blocks writers."""
        with self._reader_count_lock:
            self._reader_count += 1
            if self._reader_count == 1:
                self._write_lock.acquire()
        try:
            self._read_lock.acquire()
        except BaseException:
            # Undo the registration so a failed acquire cannot leave the
            # write lock held forever.
            with self._reader_count_lock:
                self._reader_count -= 1
                if self._reader_count == 0:
                    self._write_lock.release()
            raise

    def _release_read(self):
        """Unregister a reader; the last reader unblocks writers."""
        self._read_lock.release()
        with self._reader_count_lock:
            self._reader_count -= 1
            if self._reader_count == 0:
                self._write_lock.release()

    def insert_value(self, counter: str, value: float):
        """Insert a single counter value and commit."""
        with self._write_lock:
            self.conn.execute(
                'INSERT INTO perfdata (counter, value) VALUES (?, ?)',
                (counter, value)
            )
            self.conn.commit()

    def check_db_size(self):
        """Vacuum the database when its file exceeds ``MAX_DB_SIZE``.

        Problems reading the file size are reported on stdout and otherwise
        ignored (best effort).
        """
        try:
            if os.path.exists(self.db_path):
                if os.path.getsize(self.db_path) > self.MAX_DB_SIZE:
                    self.vacuum()
        except OSError as e:
            print(f"Warning: Unable to check database size: {str(e)}")

    def bulk_insert_values(self, values: List[Tuple[str, float]]):
        """Insert many ``(counter, value)`` rows in one transaction.

        Raises:
            Exception: if the insert fails; the transaction is rolled back
                and the original error is chained as ``__cause__``.
        """
        self.check_db_size()  # Check size before bulk insert
        with self._write_lock:
            try:
                self.conn.execute('BEGIN IMMEDIATE')
                self.conn.executemany(
                    'INSERT INTO perfdata (counter, value) VALUES (?, ?)',
                    values
                )
                self.conn.commit()
            except sqlite3.Error as e:
                self.conn.rollback()
                raise Exception(f"Database error during bulk insert: {str(e)}") from e
            except Exception as e:
                self.conn.rollback()
                raise Exception(f"Unexpected error during bulk insert: {str(e)}") from e

    def get_mean_value(self, counter: str, limit: int) -> Tuple[float, int]:
        """Return the recency-weighted mean of the latest samples of a counter.

        The newest *limit* samples are selected, then weighted linearly by
        age rank: the oldest selected sample gets weight 1 and the newest
        gets the highest weight. ``rowid`` breaks ties because timestamps
        only have one-second resolution.

        Args:
            counter: Counter path to query.
            limit: Maximum number of samples to consider.

        Returns:
            ``(weighted mean, number of samples used)``; ``(0.0, 0)`` when
            no samples are stored for the counter.

        Raises:
            Exception: if the query fails (original error chained).
        """
        try:
            with self:
                result = self.conn.execute('''
                    WITH recent AS (
                        SELECT value, timestamp, rowid AS rid
                        FROM perfdata
                        WHERE counter = ?
                        ORDER BY timestamp DESC, rowid DESC
                        LIMIT ?
                    ),
                    weights AS (
                        SELECT
                            value,
                            CAST(ROW_NUMBER() OVER (ORDER BY timestamp ASC, rid ASC) AS REAL) AS weight
                        FROM recent
                    )
                    SELECT
                        SUM(value * weight) / SUM(weight) AS weighted_mean,
                        COUNT(*) AS sample_count
                    FROM weights
                ''', (counter, limit)).fetchone()

                if not result:
                    return 0.0, 0

                # With no matching rows the aggregates are NULL / 0.
                weighted_mean = result[0] if result[0] is not None else 0.0
                sample_count = result[1] if result[1] is not None else 0

                return weighted_mean, sample_count

        except sqlite3.Error as e:
            raise Exception(
                f"Database error getting weighted mean for counter {counter}: {str(e)}"
            ) from e

    def clean_old_data(self, counter: str, limit: int):
        """Delete all but the newest *limit* rows stored for *counter*."""
        with self._write_lock:
            try:
                self.conn.execute('''
                    DELETE FROM perfdata 
                    WHERE counter = ? 
                    AND rowid NOT IN (
                        SELECT rowid FROM perfdata 
                        WHERE counter = ? 
                        ORDER BY timestamp DESC, rowid DESC 
                        LIMIT ?
                    )
                ''', (counter, counter, limit))
                self.conn.commit()
            except BaseException:
                self.conn.rollback()
                raise

    def vacuum(self):
        """Run SQLite ``VACUUM`` to rebuild and compact the database file."""
        with self._write_lock:
            self.conn.execute('VACUUM')
            self.conn.commit()

    def close(self):
        """Close the database connection; safe to call more than once."""
        with self._write_lock:
            if self.conn:
                self.conn.close()
                self.conn = None

                
class CounterCache:
    """Mutex-guarded key/value store for counter metadata."""

    def __init__(self):
        self._lock = threading.Lock()
        self._cache = dict()

    def get(self, key, default=None):
        """Return the entry stored under *key*, or *default* if there is none."""
        self._lock.acquire()
        try:
            found = self._cache.get(key, default)
        finally:
            self._lock.release()
        return found

    def set(self, key, value):
        """Store *value* under *key*, replacing any previous entry."""
        self._lock.acquire()
        try:
            self._cache[key] = value
        finally:
            self._lock.release()

class PDHCounter:
    """One PDH query holding exactly one counter.

    Opens a query, adds the counter and performs a priming collection on
    construction. Usable as a context manager; ``close()`` releases the
    counter and query handles.
    """

    def __init__(self, counter_path: str):
        """Open a query for *counter_path* and prime it with one collection.

        Raises:
            CounterAccessError: if PDH rejects the counter.
            CounterError: on any other failure.
        """
        self.path = counter_path
        self.hquery = None
        self.hcounter = None
        self.counter_info = None
        self.counter_type = None
        self.base_sampling_interval = 0.5  # Base sampling interval
        self.max_sampling_interval = 2.0  # Maximum sampling interval

        try:
            self.hquery = win32pdh.OpenQuery()
            self.hcounter = win32pdh.AddCounter(self.hquery, counter_path)
            self.counter_info = win32pdh.GetCounterInfo(self.hcounter, True)
            # pywin32 returns (dwType, CVersion, lScale, ...): the counter
            # type is element 0. Element 2, used previously, is the scale.
            self.counter_type = self.counter_info[0]

            # Initial collection to prepare the counter
            win32pdh.CollectQueryData(self.hquery)

        except win32pdh.error as e:
            self.close()
            raise CounterAccessError(
                f"Failed to add counter {counter_path}: {str(e)}",
                counter_path=counter_path,
                error_code=getattr(e, 'winerror', None),
            ) from e
        except pywintypes.error as e:
            self.close()
            raise CounterAccessError(
                f"Windows error initializing counter {counter_path}: {str(e)}",
                counter_path=counter_path,
                error_code=getattr(e, 'winerror', None),
            ) from e
        except Exception as e:
            self.close()
            raise CounterError(
                f"Unexpected error initializing counter {counter_path}: {str(e)}",
                counter_path=counter_path,
            ) from e

    def __enter__(self):
        """Support for context manager protocol"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Cleanup when exiting context manager"""
        self.close()
        return False  # Re-raise any exceptions

    @staticmethod
    def _format_pdh_error(exc) -> str:
        """Describe a PDH exception without assuming which attributes it has.

        pywin32 errors expose ``winerror`` / ``strerror``; anything missing
        falls back to ``str(exc)``. The code is shown as unsigned 32-bit hex
        because pywin32 may report HRESULT-style codes as negative integers.
        """
        code = getattr(exc, 'winerror', None)
        text = getattr(exc, 'strerror', None) or str(exc)
        if isinstance(code, int):
            return f"PDH Error: {text} (code: 0x{code & 0xFFFFFFFF:08X})"
        return f"PDH Error: {text}"

    def _needs_multiple_samples(self) -> bool:
        """Return True if the counter type is one of the rate/timer types
        listed in the module-level PERF_* constants."""
        multiple_sample_types = {
            PERF_COUNTER_COUNTER,
            PERF_COUNTER_BULK_COUNT,
            PERF_100NSEC_TIMER,
            PERF_COUNTER_TIMER,
            PERF_PRECISION_100NS_TIMER,
            PERF_PRECISION_SYSTEM_TIMER,
            PERF_SAMPLE_COUNTER,
            PERF_SAMPLE_FRACTION,
            PERF_COUNTER_NODATA,
        }
        return self.counter_type in multiple_sample_types

    def _get_initial_sampling_interval(self) -> float:
        """Pick the pause between the two collections in ``get_value``.

        The choice is a heuristic on bit fields of ``counter_type``; on any
        error (e.g. ``counter_type`` is None) the base interval is returned.

        NOTE(review): the bit-field values below do not obviously correspond
        to the field layout in winperf.h - confirm the intended mapping.
        """
        try:
            base_type = self.counter_type & 0x00F00000
            format_type = self.counter_type & 0x0000FFFF

            # Low-word patterns treated as timer-like: longer interval.
            if format_type in {0x0300, 0x0400, 0x0200}:
                return 1.0

            # Patterns treated as rate / fraction counters.
            if base_type in {0x00200000, 0x00400000}:
                return 0.5

            # Patterns treated as bulk counters.
            if format_type in {0x0500, 0x0600}:
                return 0.5

            # Patterns treated as delta counters.
            if format_type in {0x0700, 0x0800}:
                return 0.5

            # Treated as instantaneous values: short interval. (0x0200 was
            # listed here too but is already consumed by the first check.)
            if format_type == 0x0100:
                return 0.1

            # Default sampling interval for unknown types
            return self.base_sampling_interval

        except Exception as e:
            print(f"Error determining sampling interval for counter {self.path}: {str(e)}")
            return self.base_sampling_interval

    def get_value(self, retries=3) -> float:
        """Collect two samples and return the formatted counter value.

        Args:
            retries: Number of attempts; values below 1 are treated as 1 so
                the method never falls through and returns ``None``.

        Returns:
            The value as float. It is clamped to 0..100 only when the
            counter type matches the percentage heuristic and the path
            contains ``%``.

        Raises:
            TimeoutError: if ``DEFAULT_PDH_TIMEOUT`` seconds have elapsed
                before an attempt starts.
            CounterAccessError: if the last attempt fails with a PDH error.
        """
        attempts = max(1, retries)
        start_time = time.time()
        sampling_interval = self._get_initial_sampling_interval()

        for attempt in range(attempts):
            if time.time() - start_time > DEFAULT_PDH_TIMEOUT:
                raise TimeoutError(f"Counter collection timed out after {DEFAULT_PDH_TIMEOUT} seconds")

            try:
                # Rate counters need two samples separated in time.
                win32pdh.CollectQueryData(self.hquery)
                time.sleep(sampling_interval)
                win32pdh.CollectQueryData(self.hquery)

                # Always use NOCAP100 to get raw values
                _, val = win32pdh.GetFormattedCounterValue(
                    self.hcounter,
                    win32pdh.PDH_FMT_DOUBLE | win32pdh.PDH_FMT_NOCAP100
                )

                counter_type = self.counter_type & 0x00F00000
                is_percentage_counter = counter_type in {
                    0x00200000,
                    0x00400000,
                    0x00800000,
                } and '%' in self.path

                if is_percentage_counter:
                    # Only cap actual percentage counters
                    return min(100.0, max(0.0, float(val)))

                # For all other counters, return the raw value
                return float(val)

            except win32pdh.error as e:
                # The previous code read e.error_message / e.error_code,
                # which pywin32 errors do not define, so the handler itself
                # raised AttributeError and hid the real failure.
                last_error = self._format_pdh_error(e)
                if attempt == attempts - 1:
                    raise CounterAccessError(
                        f"Failed to collect counter {self.path}: {last_error}",
                        counter_path=self.path,
                        error_code=getattr(e, 'winerror', None),
                    ) from e
                time.sleep(0.5 * (attempt + 1))  # Progressive delay between retries

    def close(self):
        """Release the counter and query handles; failures are only printed."""
        if hasattr(self, 'hcounter') and self.hcounter:
            try:
                win32pdh.RemoveCounter(self.hcounter)
            except Exception as e:
                print(f"Warning: Failed to remove counter {self.path}: {str(e)}")
            finally:
                self.hcounter = None

        if hasattr(self, 'hquery') and self.hquery:
            try:
                win32pdh.CloseQuery(self.hquery)
            except Exception as e:
                print(f"Warning: Failed to close query for counter {self.path}: {str(e)}")
            finally:
                self.hquery = None

def handle_counter_error(e: Exception, counter_path: Optional[str] = None) -> Tuple[str, int]:
    """Translate an exception into a ``(message, plugin state)`` pair.

    Every branch yields ``STATE_UNKNOWN``; only the message differs.

    Args:
        e: The exception to describe.
        counter_path: Path of the counter being processed, used in the
            message when the exception itself carries no path.

    Returns:
        ``(error_message, state)``.
    """
    if isinstance(e, win32pdh.error):
        # pywin32 errors expose the status as ``winerror`` (the previously
        # read ``error_code`` attribute does not exist, so every PDH error
        # fell through to the generic branch). The value may be a negative
        # signed HRESULT, hence the 32-bit mask.
        raw_code = getattr(e, 'winerror', None)
        if raw_code is None:
            raw_code = getattr(e, 'error_code', None)
        error_code = raw_code & 0xFFFFFFFF if isinstance(raw_code, int) else None

        # Status values per pdhmsg.h.
        if error_code in (0xC0000BB8, 0xC0000BB9):  # PDH_CSTATUS_NO_OBJECT, PDH_CSTATUS_NO_COUNTER
            return f"Counter not found: {counter_path}", STATE_UNKNOWN
        elif error_code in (0x800007D5, 0xC0000BBA, 0xC0000BC6):  # PDH_NO_DATA, PDH_CSTATUS_INVALID_DATA, PDH_INVALID_DATA
            return f"No data available for counter: {counter_path}", STATE_UNKNOWN
        elif error_code == 0xC0000BC4:  # PDH_INVALID_PATH
            return f"Invalid counter path: {counter_path}", STATE_UNKNOWN
        elif error_code == 0x800007D8:  # PDH_CALC_NEGATIVE_VALUE
            return f"Counter returned negative value: {counter_path}", STATE_UNKNOWN
        else:
            return f"PDH error {error_code}: {str(e)}", STATE_UNKNOWN
    elif isinstance(e, sqlite3.Error):
        return f"Database error: {str(e)}", STATE_UNKNOWN
    elif isinstance(e, CounterError):
        # Raisers often omit counter_path; fall back to the caller's path
        # instead of printing "None".
        path = e.counter_path or counter_path
        if isinstance(e, CounterNotFoundError):
            return f"Counter not found: {path}", STATE_UNKNOWN
        elif isinstance(e, CounterAccessError):
            return f"Cannot access counter: {path}", STATE_UNKNOWN
        elif isinstance(e, CounterTypeError):
            return f"Counter type error: {path}", STATE_UNKNOWN
        else:
            return str(e), STATE_UNKNOWN
    else:
        return f"Unexpected error: {str(e)}", STATE_UNKNOWN


def get_counter_type_worker(path: str, cache: CounterCache) -> str:
    """Classify a counter as ``'TEXT'``, ``'PERCENT'`` or ``'NUMBER'``.

    A truthy cached result is returned immediately. Otherwise the counter
    is opened, classified from its PDH type and from percentage hints in
    the path, and the result is cached.

    Args:
        path: Counter path to classify.
        cache: Cache shared between workers.

    Returns:
        The classification; ``'NUMBER'`` (uncached) if anything fails.
    """
    cached = cache.get(path)
    if cached:
        return cached

    try:
        with PDHCounter(path) as counter:
            counter_info = win32pdh.GetCounterInfo(counter.hcounter, True)
            # Element 0 of the pywin32 tuple is the counter type. The code
            # used to read element 2 (scale) as the type and element 1 (a
            # version number) as a display name, so neither test below
            # could ever match on real data.
            counter_type = counter_info[0]
            searchable = str(path).lower()

            if counter_type == 0x00000B00:  # PERF_COUNTER_TEXT
                result = 'TEXT'
            elif (
                counter_type & 0x20C20400 == 0x20C20400  # PERF_SAMPLE_FRACTION bits
                or any(indicator in searchable
                       for indicator in ('%', 'percent', 'pct', ' ratio ', '/100'))
            ):
                result = 'PERCENT'
            else:
                result = 'NUMBER'

            cache.set(path, result)
            return result

    except Exception as e:
        # Best effort: classification failure must not abort collection.
        print(f"Error in get_counter_type for {path}: {str(e)}")
        return 'NUMBER'

    
def get_wildcard_counter_path(partial_path: str) -> str:
    """Turn a partial counter path into a path usable for wildcard expansion.

    Handled shapes (leading/trailing backslashes are ignored when counting):

    * ``Counter``                 -> ``\\*\\Counter``
    * ``Object\\Counter``          -> ``\\Object(*)\\Counter``
    * ``Object(Inst)\\Counter``    -> ``\\Object(Inst)\\Counter`` (instance kept)
    * three or more components    -> returned unchanged

    Args:
        partial_path: Counter name, object\\counter pair or full path.

    Returns:
        The wildcard (or normalised) counter path.
    """
    parts = partial_path.strip('\\').split('\\')

    if len(parts) == 1:
        # Just a counter name: search every object.
        return f"\\*\\{parts[0]}"

    if len(parts) == 2:
        obj, counter = parts
        if '(' in obj and obj.endswith(')'):
            # The instance is already given; appending "(*)" would produce
            # an invalid path such as "\Processor(_Total)(*)\...".
            return f"\\{obj}\\{counter}"
        # The separator before the counter name was previously missing.
        return f"\\{obj}(*)\\{counter}"

    # Longer paths (e.g. with a machine name) are passed through.
    return partial_path

def expand_counter_path(wildcard_path: str) -> List[str]:
    """Expand a wildcard counter path to all matching paths.

    Args:
        wildcard_path: Counter path that may contain ``*`` wildcards.

    Returns:
        The matching paths, or an empty list if there are none or the
        expansion fails (best effort).
    """
    try:
        paths = win32pdh.ExpandCounterPath(wildcard_path)
    except Exception:
        # Deliberately swallowed, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return []
    return paths if paths else []

def get_counter_object_names() -> Dict[str, str]:
    """Return the counter objects reported by PDH, keyed by lower-cased name.

    Returns:
        ``{name.lower(): name}`` for every object from
        ``win32pdh.EnumObjects``; an empty dict if enumeration fails
        (best effort).
    """
    try:
        objects = win32pdh.EnumObjects(None, None, 0)
    except Exception:
        # Deliberately swallowed, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return {}
    return {obj.lower(): obj for obj in objects}


def get_counter_names_for_object(object_name: str) -> Dict[str, Tuple[str, List[str]]]:
    """List the counters of a PDH object together with the object's instances.

    Args:
        object_name: Name of the performance object to enumerate.

    Returns:
        ``{counter.lower(): (counter, instances)}``. For an object without
        instances the instance list is ``['']``. An empty dict is returned
        (and a message printed) if enumeration fails.
    """
    try:
        counters, instances = win32pdh.EnumObjectItems(
            None, None, object_name, win32pdh.PERF_DETAIL_WIZARD
        )

        # Objects without instances yield an empty list; keep the existing
        # convention of a single empty-string placeholder.
        if not instances:
            instances = ['']

        return {counter.lower(): (counter, instances) for counter in counters}
    except win32pdh.error as e:
        # pywin32 errors carry three args (code, function, message), so the
        # former two-name unpacking of e.args raised ValueError from inside
        # this handler.
        error_code = getattr(e, 'winerror', None)
        error_message = getattr(e, 'strerror', None) or str(e)
        print(f"Error getting counter names for object '{object_name}': ({error_code}, '{error_message}')")
        return {}
    except Exception as e:
        print(f"Unexpected error getting counter names for object '{object_name}': {str(e)}")
        return {}

def _candidate_counter_paths(obj_name: str, counter_name: str, instances: List[str]) -> List[str]:
    """Build the counter paths to probe for one object/counter pair.

    Empty instance names are ignored, so an object without instances gets
    the plain ``\\Object\\Counter`` form.
    """
    named = [inst for inst in instances if inst]
    if not named:
        return [f"\\{obj_name}\\{counter_name}"]

    candidates = []
    if '_Total' in named:
        candidates.append(f"\\{obj_name}(_Total)\\{counter_name}")
    candidates.append(f"\\{obj_name}(*)\\{counter_name}")
    candidates.append(f"\\{obj_name}({named[0]})\\{counter_name}")
    return candidates


def _resolve_in_object(obj_name: str, counter_name_lower: str) -> Optional[str]:
    """Return the first existing path for a counter name within one object.

    Counters whose lower-cased name contains *counter_name_lower* are
    considered; an exact name match is tried before substring matches.
    """
    counters = get_counter_names_for_object(obj_name)
    matching_keys = sorted(
        (key for key in counters if counter_name_lower in key),
        key=lambda key: key != counter_name_lower,  # stable: exact first
    )
    for key in matching_keys:
        counter_orig, instances = counters[key]
        for candidate in _candidate_counter_paths(obj_name, counter_orig, instances):
            if check_counter_exists(candidate):
                return candidate
    return None


def resolve_counter_path(counter_path: str) -> Optional[str]:
    """Resolve a (possibly partial) counter path to one that exists here.

    Accepted forms, after stripping surrounding backslashes:

    * ``Counter``: every object is searched for a matching counter.
    * ``Object\\Counter``: the object is matched by name (exact match
      preferred, then substring), then its counters are searched.
    * ``Object\\Instance\\Counter``: a fixed list of candidates is probed.

    Args:
        counter_path: Path or fragment supplied by the user.

    Returns:
        A path for which ``check_counter_exists`` succeeded, or ``None`` if
        nothing matched or an error occurred (the error is printed).
    """
    try:
        # If it's already a valid path, return it
        if counter_path.startswith('\\') and check_counter_exists(counter_path):
            return counter_path

        parts = counter_path.strip('\\').split('\\')
        objects = get_counter_object_names()

        if len(parts) == 1:
            wanted = parts[0].lower()
            for obj_name in objects.values():
                try:
                    found = _resolve_in_object(obj_name, wanted)
                except Exception:
                    # One unreadable object must not stop the search.
                    continue
                if found:
                    return found

        elif len(parts) == 2:
            obj_part, counter_part = parts
            obj_key = obj_part.lower()

            # Prefer the exact object so that e.g. "Processor" is not
            # resolved to another object whose name merely contains it.
            matched_obj = objects.get(obj_key)
            if matched_obj is None:
                matched_obj = next(
                    (orig for key, orig in objects.items() if obj_key in key),
                    None,
                )

            if matched_obj:
                try:
                    found = _resolve_in_object(matched_obj, counter_part.lower())
                except Exception:
                    found = None
                if found:
                    return found

        elif len(parts) == 3:
            obj_name, instance, counter_name = parts
            candidates = [
                f"\\{obj_name}({instance})\\{counter_name}",
                f"\\{obj_name}(*)\\{counter_name}",
                # Was "\obj\_Total\counter", which is not a counter path.
                f"\\{obj_name}(_Total)\\{counter_name}",
                f"\\{obj_name}\\{counter_name}",
            ]
            for candidate in candidates:
                if check_counter_exists(candidate):
                    return candidate

        return None

    except Exception as e:
        print(f"Error resolving path '{counter_path}': {str(e)}")
        return None

def validate_counter_paths_parallel(counter_paths: List[str]) -> Dict[str, Optional[str]]:
    """Validate counter paths concurrently and map each to a usable path.

    For every input path the following is tried in order: the path as
    given, wildcard expansion (preferring an existing ``_Total`` instance,
    otherwise the first expansion), and finally ``resolve_counter_path``.

    Args:
        counter_paths: Paths as supplied by the user.

    Returns:
        ``{original_path: valid_path_or_None}``; ``None`` marks a path that
        could not be validated. An empty input yields an empty dict.
    """
    if not counter_paths:
        return {}

    def validate_single_path(path: str) -> Tuple[str, Optional[str]]:
        try:
            # First check if the path is already valid
            if check_counter_exists(path):
                return path, path

            # Try wildcard expansion first
            expanded_paths = expand_counter_path(get_wildcard_counter_path(path))
            if expanded_paths:
                # Prefer _Total or first available instance
                for expanded in expanded_paths:
                    if '_Total' in expanded and check_counter_exists(expanded):
                        return path, expanded
                return path, expanded_paths[0]

            # If no wildcard match, try resolution
            resolved_path = resolve_counter_path(path)
            if resolved_path and check_counter_exists(resolved_path):
                return path, resolved_path

            return path, None
        except Exception:
            # A path that cannot be validated is reported as missing.
            return path, None

    # The executor replaces the former hand-rolled queue + thread workers;
    # results are collected in the calling thread rather than written to a
    # shared dict from several threads.
    worker_count = min(MAX_WORKERS, len(counter_paths))
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        return dict(executor.map(validate_single_path, counter_paths))
    

def check_counter_exists(path: str) -> bool:
    """Check whether a counter path can be added to a query and collected.

    A throw-away query is opened, the counter is added, one collection is
    performed and everything is released again. The collected value itself
    is not inspected.

    Args:
        path: Counter path to probe.

    Returns:
        True if every step succeeded, False if any step (including closing
        the query) raised.
    """
    try:
        query = win32pdh.OpenQuery()
        try:
            counter = win32pdh.AddCounter(query, path)
            win32pdh.CollectQueryData(query)
            win32pdh.RemoveCounter(counter)
            return True
        finally:
            win32pdh.CloseQuery(query)
    except Exception:
        # Any failure means "not usable"; unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit through.
        return False
    

def collect_single_counter(path: str, type_cache: CounterCache) -> Tuple[str, float, str, str]:
    """Sample one counter and classify it.

    Returns a 4-tuple ``(path, value, counter_type, error)``. On success
    ``error`` is None; on any failure ``value`` and ``counter_type`` are
    None and ``error`` holds the message from ``handle_counter_error``.
    """
    try:
        with PDHCounter(path) as pdh_counter:
            # Processor time counters get an extra second before sampling.
            wants_warmup = "processor" in path.lower() and "time" in path.lower()
            if wants_warmup:
                time.sleep(1.0)  # Give CPU counter time to initialize
            sampled = pdh_counter.get_value()
            kind = get_counter_type_worker(path, type_cache)
            return path, sampled, kind, None
    except Exception as exc:
        # All failure classes were handled identically, so one handler
        # is enough.
        message, _state = handle_counter_error(exc, path)
        return path, None, None, message

def collect_and_store_counter_data(counter_paths: List[str], db_manager: DatabaseManager) -> Tuple[Dict[str, float], List[str], Dict[str, str], Dict[str, str]]:
    """Validate, sample and persist a list of counters.

    Args:
        counter_paths: Counter paths as supplied by the user.
        db_manager: Store that receives the sampled values.

    Returns:
        ``(values, missing_counters, adjusted_counters, counter_types)``:

        * values: original path -> sampled value
        * missing_counters: original paths that could not be validated or
          sampled
        * adjusted_counters: original path -> path actually used, for paths
          that had to be rewritten
        * counter_types: original path -> type from the collection worker

    Raises:
        ValueError: if *counter_paths* is empty.
        CounterError: if a counter error escapes collection.
        Exception: for any other failure, including storage errors; the
            original exception is chained as ``__cause__``.
    """
    values = {}
    missing_counters = []
    adjusted_counters = {}
    counter_types = {}

    if not counter_paths:
        msg, _ = handle_counter_error(ValueError("No counter paths provided"))
        raise ValueError(msg)

    try:
        valid_paths = validate_counter_paths_parallel(counter_paths)
        # Keep (original, resolved) pairs instead of a reverse dict, so two
        # originals that resolve to the same counter are both reported.
        resolved_pairs = [(orig, valid) for orig, valid in valid_paths.items() if valid]
        missing_counters.extend(orig for orig, valid in valid_paths.items() if not valid)

        if not resolved_pairs:
            return {}, missing_counters, {}, {}

        type_cache = CounterCache()
        batch_values = []

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # Remember which paths belong to each future. The old error
            # handler used a stale ``path`` from the submit loop and could
            # blame the wrong counter.
            future_paths = {
                executor.submit(collect_single_counter, valid, type_cache): (orig, valid)
                for orig, valid in resolved_pairs
            }

            for future in as_completed(future_paths):
                orig_path, used_path = future_paths[future]
                try:
                    _, value, ctype, error = future.result()
                except Exception:
                    missing_counters.append(orig_path)
                    continue

                if error or value is None:
                    # No usable sample: report the counter as missing
                    # rather than dropping it silently.
                    missing_counters.append(orig_path)
                    continue

                values[orig_path] = value
                counter_types[orig_path] = ctype
                if used_path != orig_path:
                    adjusted_counters[orig_path] = used_path
                batch_values.append((orig_path, value))

        if batch_values:
            try:
                db_manager.bulk_insert_values(batch_values)
            except Exception as e:
                msg, _ = handle_counter_error(DatabaseError(f"Failed to store counter values: {str(e)}"))
                raise DatabaseError(msg) from e

        return values, missing_counters, adjusted_counters, counter_types

    except CounterError as e:
        msg, _ = handle_counter_error(e)
        raise CounterError(msg) from e
    except Exception as e:
        msg, _ = handle_counter_error(e)
        raise Exception(msg) from e
    

def parse_args():
    """Parse and validate the command line arguments.

    Defines the plugin's options (``--always-ok``, ``--count``, the
    percentage/numeric warning and critical thresholds, and the required
    ``--perfcounter`` list), parses ``sys.argv`` and validates the result.

    Returns:
        The ``argparse.Namespace`` with the attributes ``ALWAYS_OK``,
        ``COUNT``, ``PERCENT_WARN``, ``PERCENT_CRIT``, ``NUMBER_WARN``,
        ``NUMBER_CRIT`` and ``PERFCOUNTER``.

    Invalid values are reported through ``lib.base.cu()``
    (NOTE(review): presumably prints the message and exits with UNKNOWN -
    confirm against the lib).
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)

    parser.add_argument(
        '-V', '--version',
        action='version',
        version='{0}: v{1} by {2}'.format('%(prog)s', __version__, __author__),
    )

    parser.add_argument(
        '--always-ok',
        help='Always returns OK.',
        dest='ALWAYS_OK',
        action='store_true',
        default=False,
    )

    parser.add_argument(
        '--count',
        help='Number of measurements to calculate mean value from. Default: %(default)s',
        dest='COUNT',
        type=int,
        default=DEFAULT_COUNT,
    )

    # Default thresholds for percentage-type counters
    parser.add_argument(
        '--percent-critical',
        help='Critical threshold for percentage counters. Default: %(default)s',
        dest='PERCENT_CRIT',
        type=float,
        default=90,
    )

    parser.add_argument(
        '--percent-warning',
        help='Warning threshold for percentage counters. Default: %(default)s',
        dest='PERCENT_WARN',
        type=float,
        default=80,
    )

    # Default thresholds for numeric-type counters
    parser.add_argument(
        '--number-critical',
        help='Critical threshold for numeric counters. Default: %(default)s',
        dest='NUMBER_CRIT',
        type=float,
        default=None,
    )

    parser.add_argument(
        '--number-warning',
        help='Warning threshold for numeric counters. Default: %(default)s',
        dest='NUMBER_WARN',
        type=float,
        default=None,
    )

    parser.add_argument(
        '--perfcounter',
        help='Comma-separated list of performance counter paths to monitor',
        dest='PERFCOUNTER',
        type=str,
        required=True,
    )

    args = parser.parse_args()

    # A mean over fewer than one measurement is meaningless.
    if args.COUNT < 1:
        lib.base.cu('Invalid --count: must be at least 1')

    # Validate percentage thresholds. The counter type has to be passed
    # explicitly, otherwise the 0..100 range check is never applied.
    try:
        validate_thresholds(args.PERCENT_WARN, args.PERCENT_CRIT, 'PERCENT')
    except ValueError as e:
        lib.base.cu(f"Invalid percentage thresholds: {str(e)}")

    # Validate numeric thresholds if provided
    if args.NUMBER_WARN is not None or args.NUMBER_CRIT is not None:
        try:
            validate_thresholds(args.NUMBER_WARN, args.NUMBER_CRIT)
        except ValueError as e:
            lib.base.cu(f"Invalid numeric thresholds: {str(e)}")

    return args

def validate_thresholds(warn: float, crit: float, counter_type: str = None):
    """Validate warning and critical thresholds with counter type awareness.

    Each threshold that is set (not ``None``) is checked on its own, so a
    single out-of-range value is rejected even when the other threshold is
    missing. The ordering check only runs when both are set.

    Args:
        warn: Warning threshold, or ``None`` if not set.
        crit: Critical threshold, or ``None`` if not set.
        counter_type: ``'PERCENT'`` restricts thresholds to 0..100; any other
            value only forbids negative thresholds.

    Raises:
        ValueError: If a threshold is out of range, or if ``warn`` is greater
            than ``crit``.
    """
    for threshold in (warn, crit):
        if threshold is None:
            continue
        if counter_type == 'PERCENT':
            if not 0 <= threshold <= 100:
                raise ValueError("Percentage thresholds must be between 0 and 100")
        elif threshold < 0:
            raise ValueError("Thresholds cannot be negative")

    if warn is not None and crit is not None and warn > crit:
        raise ValueError("Warning threshold cannot be greater than critical")

def get_threshold_state(value: float, warn: float, crit: float, counter_type: str = None) -> Tuple[int, str]:
    """Map a measured value to a plugin state and a status tag.

    Args:
        value: The measured (mean) counter value.
        warn: Warning threshold, or ``None`` to disable it.
        crit: Critical threshold, or ``None`` to disable it.
        counter_type: When ``'PERCENT'``, the value is clamped to 0..100
            before it is compared.

    Returns:
        ``(STATE_CRIT, "[CRITICAL]")`` or ``(STATE_WARN, "[WARNING]")`` when
        the corresponding threshold is reached, otherwise ``(STATE_OK, "")``.
    """
    thresholds_defined = warn is not None or crit is not None
    if not thresholds_defined:
        return STATE_OK, ""

    measured = value
    if counter_type == 'PERCENT':
        # Keep percentage readings inside the 0..100 range.
        measured = min(100, max(0, value))

    # Most severe level first, so critical wins over warning.
    levels = (
        (crit, (STATE_CRIT, "[CRITICAL]")),
        (warn, (STATE_WARN, "[WARNING]")),
    )
    for limit, outcome in levels:
        if limit is not None and measured >= limit:
            return outcome

    return STATE_OK, ""

def format_perfdata(name: str, value: float, counter_type: str, warn: float = None, crit: float = None) -> str:
    """Build the perfdata entry for one counter via ``lib.base.get_perfdata``.

    Percentage counters are emitted with the unit ``%`` and the range 0..100;
    every other counter type gets no unit, a minimum of 0 and no maximum.
    """
    is_percent = counter_type == 'PERCENT'
    unit, upper_bound = ('%', 100) if is_percent else ('', None)
    lower_bound = 0

    return lib.base.get_perfdata(name, value, unit, warn, crit, lower_bound, upper_bound)

def format_counter_value(name: str, value: float) -> str:
    """Render a counter value with a precision chosen from its name.

    Rules are checked in this order:
      * name contains ``sec/``: scientific notation below 0.001, else 4 decimals
      * name contains ``bytes``: no decimals
      * name contains ``/sec``: 2 decimals
      * name contains ``%``: 1 decimal
      * otherwise by magnitude: >= 100 no decimals, >= 10 one decimal,
        else two decimals
    """
    lowered = name.lower()

    if 'sec/' in lowered:
        # Timing values (e.g. disk seconds) can be very small.
        spec = '.2e' if value < 0.001 else '.4f'
    elif 'bytes' in lowered:
        spec = '.0f'
    elif '/sec' in lowered:
        spec = '.2f'
    elif '%' in name:
        spec = '.1f'
    elif value >= 100:
        spec = '.0f'
    elif value >= 10:
        spec = '.1f'
    else:
        spec = '.2f'

    return format(value, spec)

def _describe_thresholds(label: str, warn, crit, unit: str = ''):
    """Return ``"<label> (warn=.. crit=..)"`` for the thresholds that are set, or None."""
    parts = []
    if warn is not None:
        parts.append(f"warn={warn}{unit}")
    if crit is not None:
        parts.append(f"crit={crit}{unit}")
    if not parts:
        return None
    return f"{label} ({' '.join(parts)})"


def get_threshold_summary(counter_types: Dict[str, str], args) -> str:
    """Generate a one-line summary of the thresholds that are in effect.

    A threshold group is listed when at least one collected counter has that
    type and at least one of its warn/crit values is set. A single threshold
    (for example only ``--number-critical``) is still evaluated by the check,
    so it is reported here as well.

    Args:
        counter_types: Mapping of counter path to its type
            (``'PERCENT'``, ``'NUMBER'`` or ``'TEXT'``).
        args: Parsed arguments providing ``PERCENT_WARN``, ``PERCENT_CRIT``,
            ``NUMBER_WARN`` and ``NUMBER_CRIT``.

    Returns:
        The summary line used as the first line of the plugin output.
    """
    present_types = set(counter_types.values())
    summaries = []

    if 'PERCENT' in present_types:
        summaries.append(_describe_thresholds(
            "percentage counters", args.PERCENT_WARN, args.PERCENT_CRIT, '%'))

    if 'NUMBER' in present_types:
        summaries.append(_describe_thresholds(
            "numeric counters", args.NUMBER_WARN, args.NUMBER_CRIT))

    # Groups without any threshold set come back as None.
    summaries = [summary for summary in summaries if summary]

    if not summaries:
        return "Performance Metrics (no thresholds defined)"

    return "Performance Metrics - thresholds for " + ", ".join(summaries)

def main():
    """Run the check: collect counters, store them, evaluate and report.

    Steps:
      1. Parse arguments and split ``--perfcounter`` into counter paths.
      2. Open the SQLite database in the system temp directory.
      3. Collect the current counter values and store them.
      4. Trim stored history to ``--count`` rows per counter and vacuum the
         database file once it exceeds 100 MB.
      5. Compute the mean per counter, compare it with the thresholds and
         build the grouped message plus perfdata.

    Missing or invalid counters are listed in the output but do not change
    the state. Errors are reported through ``lib.base.cu()``
    (NOTE(review): presumably exits with UNKNOWN - confirm against the lib).
    """
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)
    except Exception as e:
        lib.base.cu(f'Error parsing arguments: {str(e)}')

    # str.split() always yields at least one element, so blank entries (empty
    # argument, doubled or trailing commas) must be dropped explicitly;
    # otherwise the emptiness check below can never fire.
    counter_paths = [
        path for path in (raw.strip() for raw in args.PERFCOUNTER.split(',')) if path
    ]
    if not counter_paths:
        lib.base.cu('No counter paths provided')

    msg = ''
    perfdata = ''
    state = STATE_OK

    db_manager = None
    try:
        # Get the temporary directory
        temp_dir = tempfile.gettempdir()
        if not os.path.exists(temp_dir):
            raise Exception(f"Temporary directory {temp_dir} does not exist")

        db_filename = 'linuxfabrik-monitoring-plugins-perfcounter.db'
        db_path = os.path.join(temp_dir, db_filename)

        try:
            db_manager = DatabaseManager(db_path)
            db_manager.initialize_table()
        except sqlite3.Error as e:
            raise DatabaseError(f"Failed to initialize database: {str(e)}")

        try:
            # Use size-based check:
            db_manager.check_db_size()

            counter_values, missing_counters, adjusted_counters, counter_types = collect_and_store_counter_data(
                counter_paths, db_manager)

            # If all counters are missing, keep OK state but provide clear message
            if len(missing_counters) == len(counter_paths):
                msg = "All specified performance counters are invalid or inaccessible:\n"
                msg += "\n".join(f"  {counter}" for counter in missing_counters)
                lib.base.oao(msg, STATE_OK, perfdata, always_ok=args.ALWAYS_OK)
                return

        except Exception as e:
            # Chain the original exception so the cause is not lost.
            raise Exception(f"Failed to collect counter data: {str(e)}") from e

        # Clean old data for each counter that was successfully collected
        for counter in counter_values:
            db_manager.clean_old_data(counter, args.COUNT)

        # Vacuum based on file size instead of random chance
        try:
            db_size = os.path.getsize(db_path)
            if db_size > 100 * 1024 * 1024:  # 100MB
                db_manager.vacuum()
        except OSError:
            # Best effort: a failed size lookup must not fail the check.
            pass

        msg_parts = []

        # Add threshold summary as first line
        msg_parts.append(get_threshold_summary(counter_types, args))

        # Show missing counters first if there are any, but don't change state
        if missing_counters:
            msg_parts.append("\nMissing or invalid counters:")
            msg_parts.extend(f"  {counter}" for counter in missing_counters)

        if adjusted_counters:
            msg_parts.append("\nAdjusted counter paths:")
            msg_parts.extend(f"  \"{orig}\" -> \"{adj}\"" for orig, adj in adjusted_counters.items())

        # Group counters by type
        percent_counters = []
        number_counters = []
        text_counters = []
        insufficient_data_warnings = []

        # Set for O(1) membership tests inside the loop.
        missing_lookup = set(missing_counters)

        for counter in counter_paths:
            if counter in missing_lookup:
                continue

            counter_name = counter.split('\\')[-1]
            mean_value, count = db_manager.get_mean_value(counter, args.COUNT)
            if count < args.COUNT:
                insufficient_data_warnings.append(
                    f"  {counter_name}: using {count}/{args.COUNT} values for mean calculation"
                )

            counter_type = counter_types.get(counter, 'NUMBER')

            if counter_type == 'PERCENT':
                current_state, status = get_threshold_state(
                    mean_value, args.PERCENT_WARN, args.PERCENT_CRIT)
                formatted_value = format_counter_value(counter_name, mean_value)
                percent_counters.append((counter_name, formatted_value, status))
                state = max(state, current_state)

                # Shared helper: emits unit '%' with range 0..100.
                perfdata += format_perfdata(
                    counter_name, mean_value, 'PERCENT',
                    args.PERCENT_WARN, args.PERCENT_CRIT)
            elif counter_type == 'NUMBER':
                current_state, status = get_threshold_state(
                    mean_value, args.NUMBER_WARN, args.NUMBER_CRIT)
                formatted_value = format_counter_value(counter_name, mean_value)
                number_counters.append((counter_name, formatted_value, status))
                state = max(state, current_state)

                # Shared helper: emits no unit, min 0, no max.
                perfdata += format_perfdata(
                    counter_name, mean_value, 'NUMBER',
                    args.NUMBER_WARN, args.NUMBER_CRIT)
            else:  # TEXT type
                formatted_value = str(mean_value)
                text_counters.append((counter_name, formatted_value, ""))

        # Format grouped output
        if insufficient_data_warnings:
            msg_parts.append("\nInsufficient data for some counters:")
            msg_parts.extend(insufficient_data_warnings)

        if percent_counters:
            msg_parts.append("\nPercentage Counters:")
            for name, value, status in percent_counters:
                msg_parts.append(f"  {name}: {value}% {status}")

        if number_counters:
            msg_parts.append("\nNumeric Counters:")
            for name, value, status in number_counters:
                msg_parts.append(f"  {name}: {value} {status}")

        if text_counters:
            msg_parts.append("\nText Counters:")
            for name, value, _ in text_counters:
                msg_parts.append(f"  {name}: {value}")

        msg = '\n'.join(msg_parts)

    except DatabaseError as e:
        lib.base.cu(f"Database error: {str(e)}")
    except CounterError as e:
        lib.base.cu(f"Counter error: {str(e)}")
    except Exception as e:
        lib.base.cu(f"Unexpected error: {str(e)}")
    finally:
        if db_manager:
            try:
                db_manager.close()
            except Exception as e:
                # This runs before the final output is printed; write to
                # stderr so the warning cannot become the first line of the
                # plugin output on stdout.
                print(f"Warning: Failed to close database connection: {str(e)}",
                      file=sys.stderr)

    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)

# Script entry point: run the check only when executed directly, not on import.
if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        # Last-resort handler: report anything main() did not handle itself
        # through lib.base.cu() instead of letting a raw traceback escape.
        lib.base.cu(f'Unexpected error: {str(e)}')
```

Also, here is the Icinga Director JSON:
```
{
    "Command": {
        "cmd-check-perfcounter-windows": {
            "arguments": {
                "--always-ok": {
                    "set_if": "$perfcounter_windows_always_ok$"
                },
                "--count": {
                    "value": "$perfcounter_windows_count$"
                },
                "--percent-critical": {
                    "value": "$perfcounter_windows_percent_critical$"
                },
                "--percent-warning": {
                    "value": "$perfcounter_windows_percent_warning$"
                },
                "--number-critical": {
                    "value": "$perfcounter_windows_number_critical$"
                },
                "--number-warning": {
                    "value": "$perfcounter_windows_number_warning$"
                },
                "--perfcounter": {
                    "value": "$perfcounter_windows_perfcounter$"
                }
            },
            "command": "C:\\ProgramData\\icinga2\\usr\\lib64\\nagios\\plugins\\perfcounter.exe",
            "disabled": false,
            "fields": [
                {
                    "datafield_id": 1,
                    "is_required": "n",
                    "var_filter": null
                },
                {
                    "datafield_id": 2,
                    "is_required": "n",
                    "var_filter": null
                },
                {
                    "datafield_id": 3,
                    "is_required": "n",
                    "var_filter": null
                },
                {
                    "datafield_id": 4,
                    "is_required": "n",
                    "var_filter": null
                },
                {
                    "datafield_id": 5,
                    "is_required": "n",
                    "var_filter": null
                },
                {
                    "datafield_id": 6,
                    "is_required": "n",
                    "var_filter": null
                },
                {
                    "datafield_id": 7,
                    "is_required": "y",
                    "var_filter": null
                }
            ],
            "imports": [],
            "is_string": null,
            "methods_execute": "PluginCheck",
            "object_name": "cmd-check-perfcounter-windows",
            "object_type": "object",
            "timeout": "10",
            "vars": {},
            "zone": null,
            "uuid": "c830d8ca-20a4-4fd8-bfdf-3ae25f947f96"
        }
    },
    "ServiceTemplate": {
        "tpl-service-perfcounter-windows": {
            "action_url": null,
            "apply_for": null,
            "assign_filter": null,
            "check_command": "cmd-check-perfcounter-windows",
            "check_interval": 60,
            "check_period": null,
            "check_timeout": null,
            "command_endpoint": null,
            "disabled": false,
            "display_name": null,
            "enable_active_checks": null,
            "enable_event_handler": null,
            "enable_flapping": null,
            "enable_notifications": true,
            "enable_passive_checks": null,
            "enable_perfdata": null,
            "event_command": null,
            "fields": [],
            "flapping_threshold_high": null,
            "flapping_threshold_low": null,
            "groups": [],
            "host": null,
            "icon_image": "perfcounter.png",
            "icon_image_alt": null,
            "imports": [
                "tpl-service-generic"
            ],
            "max_check_attempts": 5,
            "notes": "Checks Windows performance counters. Supports monitoring multiple counter paths and calculates mean values from specified number of measurements.",
            "notes_url": "https://github.com/Linuxfabrik/monitoring-plugins/tree/main/check-plugins/perfcounter",
            "object_name": "tpl-service-perfcounter-windows",
            "object_type": "template",
            "retry_interval": 15,
            "service_set": null,
            "template_choice": null,
            "use_agent": null,
            "use_var_overrides": null,
            "vars": {
                "criticality": "C",
                "perfcounter_windows_always_ok": false,
                "perfcounter_windows_count": 1,
                "perfcounter_windows_percent_warning": 80,
                "perfcounter_windows_percent_critical": 90,
                "perfcounter_windows_number_warning": null,
                "perfcounter_windows_number_critical": null
            },
            "volatile": null,
            "zone": null,
            "uuid": "abcd3b45-fff0-49f8-8065-19d824ed0d7f"
        }
    },
    "Datafield": {
        "1": {
            "varname": "perfcounter_windows_always_ok",
            "caption": "Perfcounter: Always OK?",
            "description": "Always returns OK.",
            "datatype": "Icinga\\Module\\Director\\DataType\\DataTypeBoolean",
            "format": null,
            "settings": {},
            "uuid": "3f4cd078-e8c6-4c2b-b4fe-449ebc9d0b01"
        },
        "2": {
            "varname": "perfcounter_windows_count",
            "caption": "Perfcounter: Count",
            "description": "Number of measurements to calculate mean value from.",
            "datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
            "format": null,
            "settings": {
                "visibility": "visible"
            },
            "uuid": "582ad4b0-9399-47e1-82a4-2db938828528"
        },
        "3": {
            "varname": "perfcounter_windows_percent_critical",
            "caption": "Perfcounter: Percent Critical",
            "description": "Critical threshold for percentage counters.",
            "datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
            "format": null,
            "settings": {
                "visibility": "visible"
            },
            "uuid": "146363e7-2655-49c7-96e0-42f5449da430"
        },
        "4": {
            "varname": "perfcounter_windows_percent_warning",
            "caption": "Perfcounter: Percent Warning",
            "description": "Warning threshold for percentage counters.",
            "datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
            "format": null,
            "settings": {
                "visibility": "visible"
            },
            "uuid": "afe764dd-a4de-4e61-b8d6-40849a9bebc5"
        },
        "5": {
            "varname": "perfcounter_windows_number_critical",
            "caption": "Perfcounter: Number Critical",
            "description": "Critical threshold for numeric counters.",
            "datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
            "format": null,
            "settings": {
                "visibility": "visible"
            },
            "uuid": "bfedc075-96bb-4911-97cb-efaa7022193c"
        },
        "6": {
            "varname": "perfcounter_windows_number_warning",
            "caption": "Perfcounter: Number Warning",
            "description": "Warning threshold for numeric counters.",
            "datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
            "format": null,
            "settings": {
                "visibility": "visible"
            },
            "uuid": "cfedc075-96bb-4911-97cb-efaa7022193d"
        },
        "7": {
            "varname": "perfcounter_windows_perfcounter",
            "caption": "Perfcounter: Paths",
            "description": "Comma-separated list of performance counter paths to monitor",
            "datatype": "Icinga\\Module\\Director\\DataType\\DataTypeString",
            "format": null,
            "settings": {
                "visibility": "visible"
            },
            "uuid": "dfedc075-96bb-4911-97cb-efaa7022193e"
        }
    }
}

```

### Additional context

_No response_
@drapiti drapiti added the enhancement New feature or request label Jan 3, 2025
@markuslf markuslf self-assigned this Jan 7, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Projects
None yet
Development

No branches or pull requests

2 participants