Add collectd files

Bram Veenboer
2024-12-23 10:47:41 +01:00
parent c825994225
commit d5de88fcf2
13 changed files with 704 additions and 0 deletions

collectd/etc/collectd.conf Normal file

@@ -0,0 +1,132 @@
# Config file for collectd(1).
#
# Some plugins need additional configuration and are disabled by default.
# Please read collectd.conf(5) for details.
#
# You should also read /usr/share/doc/collectd-core/README.Debian.plugins
# before enabling any more plugins.
##############################################################################
# Global #
#----------------------------------------------------------------------------#
# Global settings for the daemon. #
##############################################################################
Hostname "server"
FQDNLookup true
#BaseDir "/var/lib/collectd"
#PluginDir "/usr/lib/collectd"
#TypesDB "/usr/share/collectd/types.db" "/etc/collectd/my_types.db"
#----------------------------------------------------------------------------#
# When enabled, plugins are loaded automatically with the default options #
# when an appropriate <Plugin ...> block is encountered. #
# Disabled by default. #
#----------------------------------------------------------------------------#
#AutoLoadPlugin false
#----------------------------------------------------------------------------#
# Interval at which to query values. This may be overwritten on a per-plugin #
# base by using the 'Interval' option of the LoadPlugin block: #
# <LoadPlugin foo> #
# Interval 60 #
# </LoadPlugin> #
#----------------------------------------------------------------------------#
Interval 10
#Timeout 2
#ReadThreads 5
#WriteThreads 5
# Limit the size of the write queue. Default is no limit. Setting up a limit
# is recommended for servers handling a high volume of traffic.
#WriteQueueLimitHigh 1000000
#WriteQueueLimitLow 800000
##############################################################################
# Logging #
#----------------------------------------------------------------------------#
# Plugins which provide logging functions should be loaded first, so log #
# messages generated when loading or configuring other plugins can be #
# accessed. #
##############################################################################
LoadPlugin logfile
<Plugin logfile>
LogLevel "info"
File "/var/lib/collectd/log/collectd.log"
Timestamp true
PrintSeverity true
</Plugin>
##############################################################################
# LoadPlugin section #
#----------------------------------------------------------------------------#
# Specify what features to activate. #
##############################################################################
LoadPlugin cpu
LoadPlugin cpufreq
LoadPlugin df
LoadPlugin disk
LoadPlugin entropy
#LoadPlugin interface
#LoadPlugin irq
LoadPlugin load
LoadPlugin md
LoadPlugin memory
LoadPlugin network
LoadPlugin processes
LoadPlugin rrdtool
LoadPlugin sensors
LoadPlugin swap
LoadPlugin uptime
LoadPlugin users
LoadPlugin smart
##############################################################################
# Plugin configuration #
#----------------------------------------------------------------------------#
# In this section configuration stubs for each plugin are provided. A desc- #
# ription of those options is available in the collectd.conf(5) manual page. #
##############################################################################
<Plugin rrdtool>
DataDir "/var/lib/collectd/rrd"
# CacheTimeout 120
# CacheFlush 900
# WritesPerSecond 30
# CreateFilesAsync false
# RandomTimeout 0
#
# The following settings are rather advanced
# and should usually not be touched:
# StepSize 10
# HeartBeat 20
# RRARows 1200
# RRATimespan 158112000
# XFF 0.1
</Plugin>
<Plugin disk>
Disk "sda"
Disk "sdb"
Disk "nvme0"
IgnoreSelected false
</Plugin>
#<Plugin md>
# Device "/dev/md0"
# IgnoreSelected false
#</Plugin>
<Plugin "smart">
Disk "sda"
Disk "sdb"
Disk "nvme0"
IgnoreSelected false
</Plugin>
<Include "/etc/collectd/collectd.conf.d">
Filter "*.conf"
</Include>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec collectd "/host/usr/local/bin/btrfs-data"
</Plugin>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec collectd "/host/usr/local/bin/cpufreq-data"
</Plugin>


@@ -0,0 +1,5 @@
<Plugin df>
MountPoint "/media/docker"
FSType "ext4"
IgnoreSelected false
</Plugin>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec collectd "/host/usr/local/bin/du-data"
</Plugin>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec collectd "/host/usr/local/bin/power-data"
</Plugin>


@@ -0,0 +1,5 @@
LoadPlugin exec
<Plugin "exec">
Exec nobody "/host/usr/local/bin/speedtest-data"
</Plugin>

collectd/usr/local/bin/btrfs-data Executable file

@@ -0,0 +1,245 @@
#!/usr/bin/python3
#
# Imports
#
import sys
import time
import subprocess
import argparse
#
# Misc
#
# sys.tracebacklimit = 0
#
# Global variables
#
size_data_total = 0
size_data_exclusive = 0
size_snapshot_total = 0
size_snapshot_exclusive = 0
#
# Methods
#
def get_subvol_list(path):
    command = "sudo btrfs subvolume list -t %s" % (path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        raise Exception(command)
    # Every line contains the following values: subvol_id, gen, toplevel, path
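    # Example `btrfs subvolume list -t` output (the first two lines are the table header):
    #   ID    gen    top level    path
    #   --    ---    ---------    ----
    #   257   12345  5            @data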
    return output.splitlines()[2:]
def get_filesystem_size(path):
    command = "sudo btrfs filesystem show --raw %s" % (path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        # This command fails when running inside a Docker container,
        # return the maximum size of any filesystem instead
        command = "sudo btrfs filesystem show --raw"
        status, output = subprocess.getstatusoutput(command)
        lines = output.splitlines()
        lines = [x for x in lines if "devid" in x]
        sizes = [int(line.split()[3]) for line in lines]
        return max(sizes)
    # The sizes are on the third line
    line = output.splitlines()[2]
    # Elements 3 and 5 respectively contain the total and used sizes
    return int(line.split()[3])
def get_id_root(name, path):
    lines = get_subvol_list(path)
    # Filter lines where toplevel == 5
    subvol_ids = [x for x in lines if int(x.split()[2]) == 5]
    # Try to retrieve the subvol_id for the root subvolume (if any)
    if len(subvol_ids) == 1:
        # The path contains a btrfs filesystem without subvolume for data
        return int(subvol_ids[0].split()[0])
    else:
        # The path contains a btrfs filesystem with multiple subvolumes for data
        try:
            return int(list(filter(lambda x: x.split()[3] == name, subvol_ids))[0].split()[0])
        except IndexError:
            pass
    # Volume not found, root is probably the btrfs default (5)
    return 5
def get_id_subvolumes(path, subvol_id):
    lines = get_subvol_list(path)
    lines = [x for x in lines if int(x.split()[2]) == subvol_id]
    return [int(x.split()[0]) for x in lines]
def get_disk_usage(name, path):
    id_root = get_id_root(name, path)
    id_subvolumes = get_id_subvolumes(path, id_root)
    size_filesystem = get_filesystem_size(path)
    # Get disk usage from quota
    command = "sudo btrfs qgroup show --raw %s" % (path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        raise Exception(command)
    lines = output.splitlines()[2:]
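    # Each remaining line has the form "<qgroupid> <rfer> <excl>",
    # e.g. "0/257 1073741824 16384" (sizes in bytes because of --raw)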
    # Global variables
    global size_data_total
    global size_data_exclusive
    global size_snapshot_total
    global size_snapshot_exclusive
    # Total data volume in subvolume
    size_data_total = 0
    # Total data volume in snapshots
    # -> this variable is useless
    size_snapshot_total = 0
    # Data exclusively in subvolume
    # -> data that is not (yet) incorporated in a snapshot
    size_data_exclusive = 0
    # Data exclusively available in snapshots
    # -> data that was removed from volume
    size_snapshot_exclusive = 0
    for line in lines:
        split = line.split()
        subvol_id = 0
        size_total = 0
        size_exclusive = 0
        try:
            subvol_id = int(split[0].split("/")[1])
            size_total = float(split[1])
            size_exclusive = float(split[2])
        except IndexError:
            # ignore "WARNING: Quota disabled"
            pass
        # size_exclusive is incorrect when a snapshot is
        # removed and the qgroups are not updated yet,
        # ignore the value when it seems unrealistic
        if size_exclusive > size_filesystem:
            size_exclusive = 0
        if subvol_id == id_root:
            size_data_total = size_total
            size_data_exclusive = size_exclusive
        elif subvol_id in id_subvolumes:
            size_snapshot_total += size_total
            size_snapshot_exclusive += size_exclusive
def rescan_quota(path):
    command = "sudo btrfs quota rescan %s" % (path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        raise Exception(command)
def print_human_readable(name):
    global size_data_total
    global size_data_exclusive
    global size_snapshot_exclusive
    size_data_total = size_data_total / (1024 * 1e6)
    size_data_exclusive = size_data_exclusive / (1024 * 1e6)
    size_snapshot_exclusive = size_snapshot_exclusive / (1024 * 1e6)
    print(
        "%10s: %6.1f GB, %6.1f GB, %6.1f GB"
        % (name, size_data_total, size_data_exclusive, size_snapshot_exclusive)
    )
def print_rrd(name):
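    """Emit the collected sizes as collectd plain-text protocol (PUTVAL) lines."""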
    timestamp = int(time.time())
    print(
        "PUTVAL {}/exec-btrfs_{}/gauge-data_total {}:{:.1f}".format(
            hostname, name, timestamp, size_data_total
        )
    )
    print(
        "PUTVAL {}/exec-btrfs_{}/gauge-data_exclusive {}:{:.1f}".format(
            hostname, name, timestamp, size_data_exclusive
        )
    )
    print(
        "PUTVAL {}/exec-btrfs_{}/gauge-snapshot_total {}:{:.1f}".format(
            hostname, name, timestamp, size_snapshot_total
        )
    )
    print(
        "PUTVAL {}/exec-btrfs_{}/gauge-snapshot_exclusive {}:{:.1f}".format(
            hostname, name, timestamp, size_snapshot_exclusive
        )
    )
#
# Volumes to scan
#
hostname = "server"
interval = 10
volumes = list()
volumes.append(["helios", "/host/media/helios"])
volumes.append(["borg", "/host/media//borg"])
volumes.append(["rsnapshot", "/host/media/rsnapshot"])
volumes.append(["mercury", "/host/media/mercury"])
volumes.append(["neptune", "/host/media/neptune"])
volumes.append(["nubes", "/host/media/nubes"])
volumes.append(["scratch", "/host/media/scratch"])
#
# Command line arguments
#
parser = argparse.ArgumentParser(description="Get BTRFS disk usage")
parser.add_argument("-s", action="store_true", help="print in human readable format")
args = parser.parse_args()
human_readable = args.s
#
# Main
#
if human_readable:
    for (name, path) in volumes:
        get_disk_usage(name, path)
        print_human_readable(name)
else:
    # RRD mode
    while True:
        for (name, path) in volumes:
            get_disk_usage(name, path)
            print_rrd(name)
        sys.stdout.flush()
        time.sleep(interval)
        # rescan_quota(path)


@@ -0,0 +1,57 @@
#!/usr/bin/python3
import argparse
import time
import sys
import os
hostname = "server"
measurement_interval = 5
def get_cpu_frequencies():
    frequencies = []
    try:
        cpu_dirs = [
            d
            for d in os.listdir("/sys/devices/system/cpu/")
            if d.startswith("cpu") and d[3:].isdigit()
        ]
        for cpu_dir in cpu_dirs:
            with open(
                f"/sys/devices/system/cpu/{cpu_dir}/cpufreq/scaling_cur_freq", "r"
            ) as f:
                frequency = int(f.read().strip()) / 1000  # Convert kHz to MHz
                frequencies.append((int(cpu_dir[3:]), frequency))
    except Exception as e:
        print("Error:", e)
    return frequencies
def main():
    parser = argparse.ArgumentParser(description="Query CPU frequencies.")
    parser.add_argument(
        "-s",
        "--human-readable",
        action="store_true",
        help="Print frequencies in human-readable format",
    )
    args = parser.parse_args()
    if args.human_readable:
        frequencies = get_cpu_frequencies()
        for cpu, frequency in frequencies:
            print(f"CPU{cpu} Frequency: {frequency:.2f} MHz")
    else:
        while True:
            frequencies = get_cpu_frequencies()
            timestamp = int(time.time())
            for cpu, frequency in frequencies:
                print(
                    f"PUTVAL {hostname}/cpu-frequency/gauge-cpu{cpu} {timestamp}:{frequency:.0f}"
                )
            sys.stdout.flush()
            time.sleep(measurement_interval)
if __name__ == "__main__":
    main()

collectd/usr/local/bin/du-data Executable file

@@ -0,0 +1,82 @@
#!/usr/bin/python3
#
# Imports
#
import sys
import time
import subprocess
import argparse
#
# Methods
#
def get_disk_usage(path, human_readable):
    """Disk usage of path, as a human-readable string (e.g. '2,1G') or as a number of bytes."""
    arguments = "-sh" if human_readable else "-s"
    command = "du %s %s" % (arguments, path)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        raise Exception(command)
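    # du -s prints "<size><TAB><path>"; keep only the size field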
    disk_usage = output.split()[0]
    if not human_readable:
        # du reports in units of 1024 bytes, convert to plain number of bytes
        disk_usage = int(disk_usage) * 1024
    return disk_usage
#
# Directories to scan
#
hostname = "server"
interval = 10
directories = list()
directories.append(["bram", "/host/media/helios/Bram"])
directories.append(["rik", "/host/media/helios/Rik"])
directories.append(["books", "/host/media/neptune/Books"])
directories.append(["games", "/host/media/mercury/Games"])
directories.append(["misc", "/host/media/neptune/Miscellaneous"])
directories.append(["shows", "/host/media/neptune/Video/Shows"])
directories.append(["movies", "/host/media/neptune/Video/Movies"])
directories.append(["music", "/host/media/neptune/Music"])
directories.append(["photographs", "/host/media/helios/Photographs"])
directories.append(["pictures", "/host/media/helios/Pictures"])
directories.append(["software", "/host/media/mercury/Software"])
#
# Command line arguments
#
parser = argparse.ArgumentParser(description="Get BTRFS disk usage")
parser.add_argument("-s", action="store_true", help="print in human readable format")
args = parser.parse_args()
human_readable = args.s
#
# Main
#
if human_readable:
    for (name, path) in directories:
        disk_usage = get_disk_usage(path, human_readable)
        print("%s: %s" % (name, disk_usage))
else:
    # RRD mode
    while True:
        for (name, path) in directories:
            disk_usage = get_disk_usage(path, human_readable)
            timestamp = int(time.time())
            size = float(disk_usage)
            print(
                "PUTVAL {}/exec-du_{}/gauge-size {}:{:.1f}".format(
                    hostname, name, timestamp, size
                )
            )
        sys.stdout.flush()
        time.sleep(interval)


@@ -0,0 +1,66 @@
#!/usr/bin/python3
#
# Imports
#
import sys
import time
import argparse
import pmt
#
# Configuration
#
hostname = "server"
measurement_duration = 5
measurement_interval = 15
pm = pmt.create("rapl")
#
# Command line arguments
#
parser = argparse.ArgumentParser(description="Get CPU power consumption")
parser.add_argument("-s", action="store_true", help="print in human readable format")
args = parser.parse_args()
human_readable = args.s
#
# Methods
#
def get_power():
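    """Sleep for measurement_duration seconds, then return a dict of {name: watts} read from PMT."""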
    time.sleep(measurement_duration)
    measurements = dict()
    state = pm.read()
    for i in range(state.nr_measurements()):
        name = state.name(i)
        watts = state.watts(i)
        measurements[name] = watts
    return measurements
def print_rrd(measurements):
    timestamp = int(time.time())
    for name, power in measurements.items():
        print(
            "PUTVAL {}/exec-power/gauge-{} {}:{:.1f}".format(
                hostname, name.lower(), timestamp, power
            )
        )
#
# Main
#
if human_readable:
    print(get_power())
else:
    while True:
        power = get_power()
        print_rrd(power)
        sys.stdout.flush()
        time.sleep(measurement_interval)


@@ -0,0 +1,77 @@
#!/usr/bin/python3
#
# Imports
#
import sys
import time
import argparse
import pylikwid
#
# Configuration
#
hostname = "server"
cpuid = 0
pinfo = pylikwid.getpowerinfo()
domainid = pinfo.get("domains").get("PKG").get("ID")
measurement_duration = 5
measurement_interval = 15
dinfo = pinfo.get("domains")
domain_names = list(dinfo.keys())
domain_ids = [domain["ID"] for domain in list(dinfo.values())]
#
# Command line arguments
#
parser = argparse.ArgumentParser(description="Get CPU power consumption")
parser.add_argument("-s", action="store_true", help="print in human readable format")
args = parser.parse_args()
human_readable = args.s
#
# Methods
#
def get_power():
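    """Measure energy per RAPL domain with LIKWID over measurement_duration seconds and return the average power (watts) per domain."""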
    start = list()
    end = list()
    power = list()
    for domain_id in domain_ids:
        e_start = pylikwid.startpower(cpuid, domain_id)
        start.append(e_start)
    time.sleep(measurement_duration)
    for domain_id in domain_ids:
        e_stop = pylikwid.stoppower(cpuid, domain_id)
        end.append(e_stop)
    for events in zip(start, end, domain_ids):
        joules = pylikwid.getpower(events[0], events[1], events[2])
        power.append(joules / measurement_duration)
    return dict(zip(domain_names, power))
def print_rrd(measurements):
    timestamp = int(time.time())
    for name, power in measurements.items():
        print(
            "PUTVAL {}/exec-power/gauge-{} {}:{:.1f}".format(
                hostname, name.lower(), timestamp, power
            )
        )
#
# Main
#
if human_readable:
    print(get_power())
else:
    while True:
        power = get_power()
        print_rrd(power)
        sys.stdout.flush()
        time.sleep(measurement_interval)


@@ -0,0 +1,15 @@
#!/bin/bash
SPEEDTEST=/usr/bin/speedtest-cli
COLLECTION=server
INTERVAL=900
while :; do
    SECONDS=0
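    # speedtest-cli prints lines like "Download: 93.45 Mbit/s" and "Upload: 17.11 Mbit/s";
    # field 2 of each Mbit line yields RESULT=(download upload)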
    RESULT=($($SPEEDTEST | grep Mbit | cut -d' ' -f 2))
    TIMESTAMP=$(date +%s)
    #echo "PUTVAL $COLLECTION/exec-speedtest/gauge-download interval=$INTERVAL N:${RESULT[0]}"
    #echo "PUTVAL $COLLECTION/exec-speedtest/gauge-upload interval=$INTERVAL N:${RESULT[1]}"
    echo "PUTVAL $COLLECTION/exec-speedtest/gauge-download ${TIMESTAMP}:${RESULT[0]}"
    echo "PUTVAL $COLLECTION/exec-speedtest/gauge-upload ${TIMESTAMP}:${RESULT[1]}"
    sleep $((INTERVAL-$SECONDS))
done