Add collectd files

Bram Veenboer
2024-12-23 10:47:41 +01:00
parent c825994225
commit d5de88fcf2
13 changed files with 704 additions and 0 deletions

collectd/etc/collectd.conf Normal file

@@ -0,0 +1,132 @@
# Config file for collectd(1).
#
# Some plugins need additional configuration and are disabled by default.
# Please read collectd.conf(5) for details.
#
# You should also read /usr/share/doc/collectd-core/README.Debian.plugins
# before enabling any more plugins.
##############################################################################
# Global #
#----------------------------------------------------------------------------#
# Global settings for the daemon. #
##############################################################################
Hostname "server"
FQDNLookup true
#BaseDir "/var/lib/collectd"
#PluginDir "/usr/lib/collectd"
#TypesDB "/usr/share/collectd/types.db" "/etc/collectd/my_types.db"
#----------------------------------------------------------------------------#
# When enabled, plugins are loaded automatically with the default options #
# when an appropriate <Plugin ...> block is encountered. #
# Disabled by default. #
#----------------------------------------------------------------------------#
#AutoLoadPlugin false
#----------------------------------------------------------------------------#
# Interval at which to query values. This may be overwritten on a per-plugin #
# base by using the 'Interval' option of the LoadPlugin block: #
# <LoadPlugin foo> #
# Interval 60 #
# </LoadPlugin> #
#----------------------------------------------------------------------------#
Interval 10
#Timeout 2
#ReadThreads 5
#WriteThreads 5
# Limit the size of the write queue. Default is no limit. Setting up a limit
# is recommended for servers handling a high volume of traffic.
#WriteQueueLimitHigh 1000000
#WriteQueueLimitLow 800000
##############################################################################
# Logging #
#----------------------------------------------------------------------------#
# Plugins which provide logging functions should be loaded first, so log #
# messages generated when loading or configuring other plugins can be #
# accessed. #
##############################################################################
LoadPlugin logfile
<Plugin logfile>
LogLevel "info"
File "/var/lib/collectd/log/collectd.log"
Timestamp true
PrintSeverity true
</Plugin>
##############################################################################
# LoadPlugin section #
#----------------------------------------------------------------------------#
# Specify what features to activate. #
##############################################################################
LoadPlugin cpu
LoadPlugin cpufreq
LoadPlugin df
LoadPlugin disk
LoadPlugin entropy
#LoadPlugin interface
#LoadPlugin irq
LoadPlugin load
LoadPlugin md
LoadPlugin memory
LoadPlugin network
LoadPlugin processes
LoadPlugin rrdtool
LoadPlugin sensors
LoadPlugin swap
LoadPlugin uptime
LoadPlugin users
LoadPlugin smart
##############################################################################
# Plugin configuration #
#----------------------------------------------------------------------------#
# In this section configuration stubs for each plugin are provided. A desc- #
# ription of those options is available in the collectd.conf(5) manual page. #
##############################################################################
<Plugin rrdtool>
DataDir "/var/lib/collectd/rrd"
# CacheTimeout 120
# CacheFlush 900
# WritesPerSecond 30
# CreateFilesAsync false
# RandomTimeout 0
#
# The following settings are rather advanced
# and should usually not be touched:
# StepSize 10
# HeartBeat 20
# RRARows 1200
# RRATimespan 158112000
# XFF 0.1
</Plugin>
<Plugin disk>
Disk "sda"
Disk "sdb"
Disk "nvme0"
IgnoreSelected false
</Plugin>
#<Plugin md>
# Device "/dev/md0"
# IgnoreSelected false
#</Plugin>
<Plugin "smart">
Disk "sda"
Disk "sdb"
Disk "nvme0"
IgnoreSelected false
</Plugin>
<Include "/etc/collectd/collectd.conf.d">
Filter "*.conf"
</Include>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec collectd "/host/usr/local/bin/btrfs-data"
</Plugin>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec collectd "/host/usr/local/bin/cpufreq-data"
</Plugin>


@@ -0,0 +1,5 @@
<Plugin df>
MountPoint "/media/docker"
FSType "ext4"
IgnoreSelected false
</Plugin>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec collectd "/host/usr/local/bin/du-data"
</Plugin>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec collectd "/host/usr/local/bin/power-data"
</Plugin>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec nobody "/host/usr/local/bin/speedtest-data"
</Plugin>

collectd/usr/local/bin/btrfs-data Executable file

@@ -0,0 +1,245 @@
#!/usr/bin/python3
#
# Imports
#
import sys
import time
import subprocess
import argparse
#
# Misc
#
# sys.tracebacklimit = 0
#
# Global variables
#
size_data_total = 0
size_data_exclusive = 0
size_snapshot_total = 0
size_snapshot_exclusive = 0
#
# Methods
#
def get_subvol_list(path):
    command = "sudo btrfs subvolume list -t %s" % (path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        raise Exception(command)
    # Every line contains the following values: subvol_id, gen, toplevel, path
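    # Example `btrfs subvolume list -t` output (the first two lines are the table header):
    #   ID    gen    top level    path
    #   --    ---    ---------    ----
    #   257   12345  5            @data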
    return output.splitlines()[2:]
def get_filesystem_size(path):
    command = "sudo btrfs filesystem show --raw %s" % (path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        # This command fails when running inside a Docker container,
        # return the maximum size of any filesystem instead
        command = "sudo btrfs filesystem show --raw"
        status, output = subprocess.getstatusoutput(command)
        lines = output.splitlines()
        lines = [x for x in lines if "devid" in x]
        sizes = [int(line.split()[3]) for line in lines]
        return max(sizes)
    # The sizes are on the third line
    line = output.splitlines()[2]
    # Elements 3 and 5 respectively contain the total and used sizes
    return int(line.split()[3])
def get_id_root(name, path):
    lines = get_subvol_list(path)
    # Filter lines where toplevel == 5
    subvol_ids = [x for x in lines if int(x.split()[2]) == 5]
    # Try to retrieve the subvol_id for the root subvolume (if any)
    if len(subvol_ids) == 1:
        # The path contains a btrfs filesystem without subvolume for data
        return int(subvol_ids[0].split()[0])
    else:
        # The path contains a btrfs filesystem with multiple subvolumes for data
        try:
            return int(list(filter(lambda x: x.split()[3] == name, subvol_ids))[0].split()[0])
        except IndexError:
            pass
    # Volume not found, root is probably the btrfs default (5)
    return 5
def get_id_subvolumes(path, subvol_id):
    lines = get_subvol_list(path)
    lines = [x for x in lines if int(x.split()[2]) == subvol_id]
    return [int(x.split()[0]) for x in lines]
def get_disk_usage(name, path):
    id_root = get_id_root(name, path)
    id_subvolumes = get_id_subvolumes(path, id_root)
    size_filesystem = get_filesystem_size(path)
    # Get disk usage from quota
    command = "sudo btrfs qgroup show --raw %s" % (path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        raise Exception(command)
    lines = output.splitlines()[2:]
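    # Each remaining line has the form "<qgroupid> <rfer> <excl>",
    # e.g. "0/257 1073741824 16384" (sizes in bytes because of --raw)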
    # Global variables
    global size_data_total
    global size_data_exclusive
    global size_snapshot_total
    global size_snapshot_exclusive
    # Total data volume in subvolume
    size_data_total = 0
    # Total data volume in snapshots
    # -> this variable is useless
    size_snapshot_total = 0
    # Data exclusively in subvolume
    # -> data that is not (yet) incorporated in a snapshot
    size_data_exclusive = 0
    # Data exclusively available in snapshots
    # -> data that was removed from volume
    size_snapshot_exclusive = 0
    for line in lines:
        split = line.split()
        subvol_id = 0
        size_total = 0
        size_exclusive = 0
        try:
            subvol_id = int(split[0].split("/")[1])
            size_total = float(split[1])
            size_exclusive = float(split[2])
        except IndexError:
            # ignore "WARNING: Quota disabled"
            pass
        # size_exclusive is incorrect when a snapshot is
        # removed and the qgroups are not updated yet,
        # ignore the value when it seems unrealistic
        if size_exclusive > size_filesystem:
            size_exclusive = 0
        if subvol_id == id_root:
            size_data_total = size_total
            size_data_exclusive = size_exclusive
        elif subvol_id in id_subvolumes:
            size_snapshot_total += size_total
            size_snapshot_exclusive += size_exclusive
def rescan_quota(path):
    command = "sudo btrfs quota rescan %s" % (path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        raise Exception(command)
def print_human_readable(name):
    global size_data_total
    global size_data_exclusive
    global size_snapshot_exclusive
    size_data_total = size_data_total / (1024 * 1e6)
    size_data_exclusive = size_data_exclusive / (1024 * 1e6)
    size_snapshot_exclusive = size_snapshot_exclusive / (1024 * 1e6)
    print(
        "%10s: %6.1f GB, %6.1f GB, %6.1f GB"
        % (name, size_data_total, size_data_exclusive, size_snapshot_exclusive)
    )
def print_rrd(name):
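    """Emit the collected sizes as collectd plain-text protocol (PUTVAL) lines."""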
    timestamp = int(time.time())
    print(
        "PUTVAL {}/exec-btrfs_{}/gauge-data_total {}:{:.1f}".format(
            hostname, name, timestamp, size_data_total
        )
    )
    print(
        "PUTVAL {}/exec-btrfs_{}/gauge-data_exclusive {}:{:.1f}".format(
            hostname, name, timestamp, size_data_exclusive
        )
    )
    print(
        "PUTVAL {}/exec-btrfs_{}/gauge-snapshot_total {}:{:.1f}".format(
            hostname, name, timestamp, size_snapshot_total
        )
    )
    print(
        "PUTVAL {}/exec-btrfs_{}/gauge-snapshot_exclusive {}:{:.1f}".format(
            hostname, name, timestamp, size_snapshot_exclusive
        )
    )
#
# Volumes to scan
#
hostname = "server"
interval = 10
volumes = list()
volumes.append(["helios", "/host/media/helios"])
volumes.append(["borg", "/host/media//borg"])
volumes.append(["rsnapshot", "/host/media/rsnapshot"])
volumes.append(["mercury", "/host/media/mercury"])
volumes.append(["neptune", "/host/media/neptune"])
volumes.append(["nubes", "/host/media/nubes"])
volumes.append(["scratch", "/host/media/scratch"])
#
# Command line arguments
#
parser = argparse.ArgumentParser(description="Get BTRFS disk usage")
parser.add_argument("-s", action="store_true", help="print in human readable format")
args = parser.parse_args()
human_readable = args.s
#
# Main
#
if human_readable:
    for (name, path) in volumes:
        get_disk_usage(name, path)
        print_human_readable(name)
else:
    # RRD mode
    while True:
        for (name, path) in volumes:
            get_disk_usage(name, path)
            print_rrd(name)
        sys.stdout.flush()
        time.sleep(interval)
        # rescan_quota(path)


@@ -0,0 +1,57 @@
#!/usr/bin/python3
import argparse
import time
import sys
import os
hostname = "server"
measurement_interval = 5
def get_cpu_frequencies():
    frequencies = []
    try:
        cpu_dirs = [
            d
            for d in os.listdir("/sys/devices/system/cpu/")
            if d.startswith("cpu") and d[3:].isdigit()
        ]
        for cpu_dir in cpu_dirs:
            with open(
                f"/sys/devices/system/cpu/{cpu_dir}/cpufreq/scaling_cur_freq", "r"
            ) as f:
                frequency = int(f.read().strip()) / 1000  # Convert kHz to MHz
                frequencies.append((int(cpu_dir[3:]), frequency))
    except Exception as e:
        print("Error:", e)
    return frequencies
def main():
    parser = argparse.ArgumentParser(description="Query CPU frequencies.")
    parser.add_argument(
        "-s",
        "--human-readable",
        action="store_true",
        help="Print frequencies in human-readable format",
    )
    args = parser.parse_args()
    if args.human_readable:
        frequencies = get_cpu_frequencies()
        for cpu, frequency in frequencies:
            print(f"CPU{cpu} Frequency: {frequency:.2f} MHz")
    else:
        while True:
            frequencies = get_cpu_frequencies()
            timestamp = int(time.time())
            for cpu, frequency in frequencies:
                print(
                    f"PUTVAL {hostname}/cpu-frequency/gauge-cpu{cpu} {timestamp}:{frequency:.0f}"
                )
            sys.stdout.flush()
            time.sleep(measurement_interval)
if __name__ == "__main__":
    main()

collectd/usr/local/bin/du-data Executable file

@@ -0,0 +1,82 @@
#!/usr/bin/python3
#
# Imports
#
import sys
import time
import subprocess
import argparse
#
# Methods
#
def get_disk_usage(path, human_readable):
    """Disk usage of path, as a human-readable string (e.g. '2,1G') or as a number of bytes."""
    arguments = "-sh" if human_readable else "-s"
    command = "du %s %s" % (arguments, path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        raise Exception(command)
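    # du -s prints "<size><TAB><path>"; keep only the size field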
    disk_usage = output.split()[0]
    if not human_readable:
        # du reports in units of 1024 bytes, convert to plain number of bytes
        disk_usage = int(disk_usage) * 1024
    return disk_usage
#
# Directories to scan
#
hostname = "server"
interval = 10
directories = list()
directories.append(["bram", "/host/media/helios/Bram"])
directories.append(["rik", "/host/media/helios/Rik"])
directories.append(["books", "/host/media/neptune/Books"])
directories.append(["games", "/host/media/mercury/Games"])
directories.append(["misc", "/host/media/neptune/Miscellaneous"])
directories.append(["shows", "/host/media/neptune/Video/Shows"])
directories.append(["movies", "/host/media/neptune/Video/Movies"])
directories.append(["music", "/host/media/neptune/Music"])
directories.append(["photographs", "/host/media/helios/Photographs"])
directories.append(["pictures", "/host/media/helios/Pictures"])
directories.append(["software", "/host/media/mercury/Software"])
#
# Command line arguments
#
parser = argparse.ArgumentParser(description="Get BTRFS disk usage")
parser.add_argument("-s", action="store_true", help="print in human readable format")
args = parser.parse_args()
human_readable = args.s
#
# Main
#
if human_readable:
    for (name, path) in directories:
        disk_usage = get_disk_usage(path, human_readable)
        print("%s: %s" % (name, disk_usage))
else:
    # RRD mode
    while True:
        for (name, path) in directories:
            disk_usage = get_disk_usage(path, human_readable)
            timestamp = int(time.time())
            size = float(disk_usage)
            print(
                "PUTVAL {}/exec-du_{}/gauge-size {}:{:.1f}".format(
                    hostname, name, timestamp, size
                )
            )
        sys.stdout.flush()
        time.sleep(interval)


@@ -0,0 +1,66 @@
#!/usr/bin/python3
#
# Imports
#
import sys
import time
import argparse
import pmt
#
# Configuration
#
hostname = "server"
measurement_duration = 5
measurement_interval = 15
pm = pmt.create("rapl")
#
# Command line arguments
#
parser = argparse.ArgumentParser(description="Get CPU power consumption")
parser.add_argument("-s", action="store_true", help="print in human readable format")
args = parser.parse_args()
human_readable = args.s
#
# Methods
#
def get_power():
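    """Sleep for measurement_duration seconds, then return a dict of {name: watts} read from PMT."""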
    time.sleep(measurement_duration)
    measurements = dict()
    state = pm.read()
    for i in range(state.nr_measurements()):
        name = state.name(i)
        watts = state.watts(i)
        measurements[name] = watts
    return measurements
def print_rrd(measurements):
    timestamp = int(time.time())
    for name, power in measurements.items():
        print(
            "PUTVAL {}/exec-power/gauge-{} {}:{:.1f}".format(
                hostname, name.lower(), timestamp, power
            )
        )
#
# Main
#
if human_readable:
    print(get_power())
else:
    while True:
        power = get_power()
        print_rrd(power)
        sys.stdout.flush()
        time.sleep(measurement_interval)


@@ -0,0 +1,77 @@
#!/usr/bin/python3
#
# Imports
#
import sys
import time
import argparse
import pylikwid
#
# Configuration
#
hostname = "server"
cpuid = 0
pinfo = pylikwid.getpowerinfo()
domainid = pinfo.get("domains").get("PKG").get("ID")
measurement_duration = 5
measurement_interval = 15
dinfo = pinfo.get("domains")
domain_names = list(dinfo.keys())
domain_ids = [domain["ID"] for domain in list(dinfo.values())]
#
# Command line arguments
#
parser = argparse.ArgumentParser(description="Get CPU power consumption")
parser.add_argument("-s", action="store_true", help="print in human readable format")
args = parser.parse_args()
human_readable = args.s
#
# Methods
#
def get_power():
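    """Measure energy per RAPL domain with LIKWID over measurement_duration seconds and return the average power (watts) per domain."""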
    start = list()
    end = list()
    power = list()
    for domain_id in domain_ids:
        e_start = pylikwid.startpower(cpuid, domain_id)
        start.append(e_start)
    time.sleep(measurement_duration)
    for domain_id in domain_ids:
        e_stop = pylikwid.stoppower(cpuid, domain_id)
        end.append(e_stop)
    for events in zip(start, end, domain_ids):
        joules = pylikwid.getpower(events[0], events[1], events[2])
        power.append(joules / measurement_duration)
    return dict(zip(domain_names, power))
def print_rrd(measurements):
    timestamp = int(time.time())
    for name, power in measurements.items():
        print(
            "PUTVAL {}/exec-power/gauge-{} {}:{:.1f}".format(
                hostname, name.lower(), timestamp, power
            )
        )
#
# Main
#
if human_readable:
    print(get_power())
else:
    while True:
        power = get_power()
        print_rrd(power)
        sys.stdout.flush()
        time.sleep(measurement_interval)


@@ -0,0 +1,15 @@
#!/bin/bash
SPEEDTEST=/usr/bin/speedtest-cli
COLLECTION=server
INTERVAL=900
while :; do
    SECONDS=0
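    # speedtest-cli prints lines like "Download: 93.45 Mbit/s" and "Upload: 17.11 Mbit/s";
    # field 2 of each Mbit line yields RESULT=(download upload)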
    RESULT=($($SPEEDTEST | grep Mbit | cut -d' ' -f 2))
    TIMESTAMP=$(date +%s)
    #echo "PUTVAL $COLLECTION/exec-speedtest/gauge-download interval=$INTERVAL N:${RESULT[0]}"
    #echo "PUTVAL $COLLECTION/exec-speedtest/gauge-upload interval=$INTERVAL N:${RESULT[1]}"
    echo "PUTVAL $COLLECTION/exec-speedtest/gauge-download ${TIMESTAMP}:${RESULT[0]}"
    echo "PUTVAL $COLLECTION/exec-speedtest/gauge-upload ${TIMESTAMP}:${RESULT[1]}"
    sleep $((INTERVAL-$SECONDS))
done