Skip to content

Commit e0d0e63

Browse files
committed
Let Linux system metrics monitor log only changing values
The Linux system metrics monitor produces a large number of metrics and logs all of them a couple of times a minute. Some of them change only slowly, and others never change at all. This change adds a configuration option, log_all_interval, that allows unchanged metrics to be logged less frequently. The default behavior is unchanged (every metric is logged on every monitor run), but you can now configure the monitor explicitly, for example:

implicit_metric_monitor: false,
monitors: [
  {
    // Collect Linux system metrics, logging the full set only once an hour
    module: "scalyr_agent.builtin_monitors.linux_system_metrics",
    log_all_interval: 3600
  }
]

With this configuration the full set of metrics is logged only once an hour. For the rest of the hour, only metrics whose values have changed are logged. This significantly reduces the size of the system metrics logs, which can add up to a lot of data as the number of hosts grows.
1 parent 6ba8421 commit e0d0e63

File tree

1 file changed

+38
-3
lines changed

1 file changed

+38
-3
lines changed

scalyr_agent/builtin_monitors/linux_system_metrics.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import os
2121
import re
2222
import scalyr_agent.third_party.tcollector.tcollector as tcollector
23+
import time
2324
from Queue import Empty
2425
from scalyr_agent import ScalyrMonitor, BadMonitorConfiguration, define_metric, define_log_field, define_config_option
2526
from scalyr_agent.third_party.tcollector.tcollector import ReaderThread
@@ -243,6 +244,12 @@
243244
'to create the full interface name when interating over network interfaces in /dev'
244245
)
245246

247+
define_config_option(__monitor__, 'log_all_interval',
248+
'Time in seconds between logging of the full set of metrics. Default is to log the full set '
249+
'of metrics every time the monitor runs. If this is set to a value greater than the interval '
250+
'between runs of the monitor, then intervening monitor runs will only log values that have '
251+
'changed since the last monitor run.')
252+
246253
class TcollectorOptions(object):
247254
"""Bare minimum implementation of an object to represent the tcollector options.
248255
@@ -283,16 +290,40 @@ def __init__(self, monitor, queue, logger, error_logger):
283290
self.__error_logger = error_logger
284291
self.__timestamp_matcher = re.compile('(\\S+)\\s+\\d+\\s+(.*)')
285292
self.__key_value_matcher = re.compile('(\\S+)=(\\S+)')
293+
self.__parts_matcher = re.compile('(\\S+)\\s+([\\d.]+)\\s*(.*)')
294+
self.__last_values = {}
286295

287296
def __rewrite_tsdb_line(self, line):
288-
"""Rewrites the TSDB line emitted by the collectors to the format used by the agent-metrics parser."""
297+
"""Rewrites the TSDB line emitted by the collectors to the format used by the agent-metrics parser.
298+
Returns None if the line shouldn't be logged."""
289299
# Strip out the timestamp that is the second token on the line.
290300
match = self.__timestamp_matcher.match(line)
291301
if match is not None:
292302
line = '%s %s' % (match.group(1), match.group(2))
293303

304+
# Lines are of the form
305+
#
306+
# metric value [key=value ...]
307+
#
308+
# The identity of a metric is the metric name plus all its key/value pairs; we'll
309+
# need that to figure out if the value has changed.
310+
if self.__monitor.log_all_interval > 0:
311+
parts_match = self.__parts_matcher.match(line)
312+
if parts_match is not None:
313+
identity = parts_match.group(1) + ' ' + parts_match.group(3)
314+
value = parts_match.group(2)
315+
316+
if identity in self.__last_values:
317+
(previous_value, previous_time) = self.__last_values[identity]
318+
expiration_time = previous_time + self.__monitor.log_all_interval
319+
if previous_value == value and expiration_time > time.time():
320+
return None
321+
322+
self.__last_values[identity] = (value, time.time())
323+
294324
# Now rewrite any key/value pairs from foo=bar to foo="bar"
295325
line = self.__key_value_matcher.sub('\\1="\\2"', line)
326+
296327
return line
297328

298329
def run(self):
@@ -307,15 +338,17 @@ def run(self):
307338
# returned by the queue. See the 'stop' method for details.
308339
if not self._run_state.is_running():
309340
continue
310-
self.__logger.info(line, metric_log_for_monitor=self.__monitor)
341+
if line:
342+
self.__logger.info(line, metric_log_for_monitor=self.__monitor)
311343
while True:
312344
try:
313345
line = self.__rewrite_tsdb_line(self.__queue.get(False))
314346
except Empty:
315347
break
316348
if not self._run_state.is_running():
317349
continue
318-
self.__logger.info(line, metric_log_for_monitor=self.__monitor)
350+
if line:
351+
self.__logger.info(line, metric_log_for_monitor=self.__monitor)
319352

320353
errors = 0 # We managed to do a successful iteration.
321354
except (ArithmeticError, EOFError, EnvironmentError, LookupError,
@@ -397,6 +430,8 @@ def _initialize(self):
397430
self.modules = tcollector.load_etc_dir(self.options, tags)
398431
self.tags = tags
399432

433+
self.log_all_interval = int(self._config.get('log_all_interval', default='0'))
434+
400435
def run(self):
401436
"""Begins executing the monitor, writing metric output to self._logger."""
402437
tcollector.override_logging(self._logger)

0 commit comments

Comments
 (0)