From 60ca9ba43715028b3bfb5c64c0b12dbf2a8ee5a4 Mon Sep 17 00:00:00 2001
From: sterni
Date: Thu, 30 Nov 2023 17:26:20 +0100
Subject: feat(sterni/ingeborg): set up monitoring via netdata

Main objective was to get SMART/md monitoring working, alerts go (via
some awful glue code) to #sterni.lv on hackint. Bot nick should also be
registered in the future.

Change-Id: Ia73c5a64ee9f6df62f5fbe21fc1606477e3d6e73
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10174
Reviewed-by: sterni
Tested-by: BuildkiteCI
---
 users/sterni/machines/ingeborg/default.nix    |   1 +
 users/sterni/machines/ingeborg/irccat.nix     |  23 +++++
 users/sterni/machines/ingeborg/monitoring.nix | 141 ++++++++++++++++++++++++++
 3 files changed, 165 insertions(+)
 create mode 100644 users/sterni/machines/ingeborg/irccat.nix
 create mode 100644 users/sterni/machines/ingeborg/monitoring.nix

(limited to 'users/sterni/machines/ingeborg')

diff --git a/users/sterni/machines/ingeborg/default.nix b/users/sterni/machines/ingeborg/default.nix
index 8c512eac3b..8784c37515 100644
--- a/users/sterni/machines/ingeborg/default.nix
+++ b/users/sterni/machines/ingeborg/default.nix
@@ -9,6 +9,7 @@
     ./network.nix
     # (More or less) pluggable service configuration
     (depot.path.origSrc + "/ops/modules/btrfs-auto-scrub.nix")
+    ./monitoring.nix
   ];
 
   config = {
diff --git a/users/sterni/machines/ingeborg/irccat.nix b/users/sterni/machines/ingeborg/irccat.nix
new file mode 100644
index 0000000000..0c40f15e33
--- /dev/null
+++ b/users/sterni/machines/ingeborg/irccat.nix
@@ -0,0 +1,23 @@
+{ depot, config, pkgs, lib, ... }:
+
+{
+  imports = [
+    (depot.path.origSrc + "/ops/modules/irccat.nix")
+  ];
+
+  config = {
+    services.depot.irccat = {
+      enable = true;
+      secretsFile = builtins.toFile "empty.json" "{}"; # TODO(sterni): register
+      config = {
+        tcp.listen = ":4722"; # ircc
+        irc = {
+          server = "irc.hackint.org:6697";
+          tls = true;
+          nick = config.networking.hostName;
+          realname = "irccat";
+        };
+      };
+    };
+  };
+}
diff --git a/users/sterni/machines/ingeborg/monitoring.nix b/users/sterni/machines/ingeborg/monitoring.nix
new file mode 100644
index 0000000000..a199a6df25
--- /dev/null
+++ b/users/sterni/machines/ingeborg/monitoring.nix
@@ -0,0 +1,141 @@
+{ pkgs, lib, config, ... }:
+
+let
+  ircChannel = "#sterni.lv";
+
+  # Port irccat listens on, taken from its listen address. Only the part
+  # after the last ':' is used, so "host:port" works as well as ":port".
+  irccatPort =
+    lib.last (lib.splitString ":"
+      config.services.depot.irccat.config.tcp.listen);
+
+  # Build a shell script called `name` that sends one message to ircChannel
+  # via the local irccat TCP listener. `msgExpr` is spliced verbatim into the
+  # script as printf's message argument, so it may refer to the script's
+  # positional parameters ($1, $2, ...) and has to bring its own quoting.
+  mkIrcMessager =
+    { name
+    , msgExpr
+    }:
+    pkgs.writeShellScript name ''
+      set -euo pipefail
+      printf '%s %s\n' ${lib.escapeShellArg ircChannel} ${msgExpr} | \
+        ${lib.getBin pkgs.netcat-openbsd}/bin/nc -N localhost ${irccatPort}
+    '';
+
+  netdataPort = 19999;
+in
+
+{
+  imports = [
+    ./irccat.nix
+  ];
+
+  config = {
+    services.depot.irccat.config.irc.channels = [
+      ircChannel
+    ];
+
+    # Since we have irccat we can wire up mdadm --monitor
+    boot.swraid.mdadmConf = ''
+      PROGRAM ${
+        mkIrcMessager {
+          name = "mdmonitor-to-irc";
+          # prog EVENT MD_DEVICE COMPONENT_DEVICE
+          msgExpr = ''"mdmonitor: $1($2''${3:+, $3})"'';
+        }
+      }
+    '';
+
+    # TODO(sterni): irc notifications (?)
+    services = {
+      smartd = {
+        enable = true;
+        autodetect = true;
+        # Short self test every day 03:00
+        # Long self test every tuesday 05:00
+        defaults.autodetected = "-a -o on -s (S/../.././03|L/../../2/05)";
+        extraOptions = [
+          "-A"
+          "/var/log/smartd/"
+        ];
+      };
+
+      netdata = {
+        enable = true;
+        config = {
+          logs = {
+            access = "syslog";
+            error = "syslog";
+            debug = "syslog";
+            health = "syslog";
+            collector = "syslog";
+          };
+          web = {
+            "default port" = toString netdataPort;
+            "bind to" = "localhost:${toString netdataPort}";
+          };
+          health = {
+            "script to execute on alarm" = pkgs.writeShellScript "simple-alarm-notify" ''
+              set -euo pipefail
+
+              # This humongous list is copied over from netdata's alarm-notify.sh
+              # Every parameter defaults to the empty string (:-) so that a short
+              # argument list doesn't make `set -u` abort the script before the
+              # argument count check below gets a chance to warn about it.
+              roles="''${1:-}" # the roles that should be notified for this event
+              args_host="''${2:-}" # the host generated this event
+              unique_id="''${3:-}" # the unique id of this event
+              alarm_id="''${4:-}" # the unique id of the alarm that generated this event
+              event_id="''${5:-}" # the incremental id of the event, for this alarm id
+              when="''${6:-}" # the timestamp this event occurred
+              name="''${7:-}" # the name of the alarm, as given in netdata health.d entries
+              chart="''${8:-}" # the name of the chart (type.id)
+              status="''${9:-}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+              old_status="''${10:-}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+              value="''${11:-}" # the current value of the alarm
+              old_value="''${12:-}" # the previous value of the alarm
+              src="''${13:-}" # the line number and file the alarm has been configured
+              duration="''${14:-}" # the duration in seconds of the previous alarm state
+              non_clear_duration="''${15:-}" # the total duration in seconds this is/was non-clear
+              units="''${16:-}" # the units of the value
+              info="''${17:-}" # a short description of the alarm
+              value_string="''${18:-}" # friendly value (with units)
+              # shellcheck disable=SC2034
+              # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947
+              old_value_string="''${19:-}" # friendly old value (with units), previously named "old_value_string"
+              calc_expression="''${20:-}" # contains the expression that was evaluated to trigger the alarm
+              calc_param_values="''${21:-}" # the values of the parameters in the expression, at the time of the evaluation
+              total_warnings="''${22:-}" # Total number of alarms in WARNING state
+              total_critical="''${23:-}" # Total number of alarms in CRITICAL state
+              total_warn_alarms="''${24:-}" # List of alarms in warning state
+              total_crit_alarms="''${25:-}" # List of alarms in critical state
+              classification="''${26:-}" # The class field from .conf files
+              edit_command_line="''${27:-}" # The command to edit the alarm, with the line number
+              child_machine_guid="''${28:-}" # the machine_guid of the child
+              transition_id="''${29:-}" # the transition_id of the alert
+              summary="''${30:-}" # the summary text field of the alert
+
+              # Verify that they haven't extended the arg list
+              ARG_COUNT_EXPECTED=30
+
+              if [[ "$#" != "$ARG_COUNT_EXPECTED" ]]; then
+                echo "$0: WARNING: unexpected number of arguments: $#. Did netdata add more?" >&2
+              fi
+
+              MSG="netdata: $status ''${name//_/ } ($chart): ''${summary//_/ } = $value_string"
+
+              echo "$0: INFO: sending message: $MSG" >&2
+              ${
+                mkIrcMessager {
+                  name = "trivial-send-to-irc";
+                  msgExpr = "\"$1\"";
+                }
+              } "$MSG"
+            '';
+          };
+        };
+      };
+    };
+  };
+}
-- 
cgit 1.4.1