about summary refs log tree commit diff
path: root/users/sterni/machines/ingeborg/monitoring.nix
diff options
context:
space:
mode:
authorsterni <sternenseemann@systemli.org>2023-11-30T16·26+0100
committersterni <sternenseemann@systemli.org>2023-11-30T21·21+0000
commit60ca9ba43715028b3bfb5c64c0b12dbf2a8ee5a4 (patch)
tree08caafa53d2acc32025e03d940648134c7e20790 /users/sterni/machines/ingeborg/monitoring.nix
parentc6c1c9f8fc80ef5c92c44ee72e329a55fb26f7e5 (diff)
feat(sterni/ingeborg): set up monitoring via netdata r/7094
Main objective was to get SMART/md monitoring working, alerts go (via
some awful glue code) to #sterni.lv on hackint. Bot nick should also be
registered in the future.

Change-Id: Ia73c5a64ee9f6df62f5fbe21fc1606477e3d6e73
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10174
Reviewed-by: sterni <sternenseemann@systemli.org>
Tested-by: BuildkiteCI
Diffstat (limited to 'users/sterni/machines/ingeborg/monitoring.nix')
-rw-r--r--users/sterni/machines/ingeborg/monitoring.nix131
1 files changed, 131 insertions, 0 deletions
diff --git a/users/sterni/machines/ingeborg/monitoring.nix b/users/sterni/machines/ingeborg/monitoring.nix
new file mode 100644
index 000000000000..a199a6df25a0
--- /dev/null
+++ b/users/sterni/machines/ingeborg/monitoring.nix
@@ -0,0 +1,131 @@
+{ pkgs, lib, config, ... }:
+
+let
+  ircChannel = "#sterni.lv";
+  irccatPort =
+    builtins.replaceStrings [ ":" ] [ "" ]
+      config.services.depot.irccat.config.tcp.listen;
+
+  mkIrcMessager =
+    { name
+    , msgExpr
+    }:
+    pkgs.writeShellScript name ''
+      set -euo pipefail
+      printf '%s %s\n' ${lib.escapeShellArg ircChannel} ${msgExpr} | \
+        ${lib.getBin pkgs.netcat-openbsd}/bin/nc -N localhost ${irccatPort}
+    '';
+
+  netdataPort = 19999;
+in
+
+{
+  imports = [
+    ./irccat.nix
+  ];
+
+  config = {
+    services.depot.irccat.config.irc.channels = [
+      ircChannel
+    ];
+
+    # Since we have irccat we can wire up mdadm --monitor
+    boot.swraid.mdadmConf = ''
+      PROGRAM ${
+        mkIrcMessager {
+          name = "mdmonitor-to-irc";
+          # prog EVENT MD_DEVICE COMPONENT_DEVICE
+          msgExpr = ''"mdmonitor: $1($2''${3:+, $3})"'';
+        }
+      }
+    '';
+
+    # TODO(sterni): irc notifications (?)
+    services = {
+      smartd = {
+        enable = true;
+        autodetect = true;
+        # Short self test every day 03:00
+        # Long self test every tuesday 05:00
+        defaults.autodetected = "-a -o on -s (S/../.././03|L/../../2/05)";
+        extraOptions = [
+          "-A"
+          "/var/log/smartd/"
+        ];
+      };
+
+      netdata = {
+        enable = true;
+        config = {
+          logs = {
+            access = "syslog";
+            error = "syslog";
+            debug = "syslog";
+            health = "syslog";
+            collector = "syslog";
+          };
+          web = {
+            "default port" = toString netdataPort;
+            "bind to" = "localhost:${toString netdataPort}";
+          };
+          health = {
+            "script to execute on alarm" = pkgs.writeShellScript "simple-alarm-notify" ''
+              set -euo pipefail
+
+              # This humongous list is copied over from netdata's alarm-notify.sh
+              roles="''${1}"               # the roles that should be notified for this event
+              args_host="''${2}"           # the host generated this event
+              unique_id="''${3}"           # the unique id of this event
+              alarm_id="''${4}"            # the unique id of the alarm that generated this event
+              event_id="''${5}"            # the incremental id of the event, for this alarm id
+              when="''${6}"                # the timestamp this event occurred
+              name="''${7}"                # the name of the alarm, as given in netdata health.d entries
+              chart="''${8}"               # the name of the chart (type.id)
+              status="''${9}"              # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+              old_status="''${10}"         # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+              value="''${11}"              # the current value of the alarm
+              old_value="''${12}"          # the previous value of the alarm
+              src="''${13}"                # the line number and file the alarm has been configured
+              duration="''${14}"           # the duration in seconds of the previous alarm state
+              non_clear_duration="''${15}" # the total duration in seconds this is/was non-clear
+              units="''${16}"              # the units of the value
+              info="''${17}"               # a short description of the alarm
+              value_string="''${18}"       # friendly value (with units)
+              # shellcheck disable=SC2034
+              # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947
+              old_value_string="''${19}"   # friendly old value (with units), previously named "old_value_string"
+              calc_expression="''${20}"    # contains the expression that was evaluated to trigger the alarm
+              calc_param_values="''${21}"  # the values of the parameters in the expression, at the time of the evaluation
+              total_warnings="''${22}"     # Total number of alarms in WARNING state
+              total_critical="''${23}"     # Total number of alarms in CRITICAL state
+              total_warn_alarms="''${24}"  # List of alarms in warning state
+              total_crit_alarms="''${25}"  # List of alarms in critical state
+              classification="''${26}"     # The class field from .conf files
+              edit_command_line="''${27}"  # The command to edit the alarm, with the line number
+              child_machine_guid="''${28}" # the machine_guid of the child
+              transition_id="''${29}"      # the transition_id of the alert
+              summary="''${30}"            # the summary text field of the alert
+
+              # Verify that they haven't extended the arg list
+              ARG_COUNT_EXPECTED=30
+
+              if [[ "$#" != "$ARG_COUNT_EXPECTED" ]]; then
+                echo "$0: WARNING: unexpected number of arguments: $#. Did netdata add more?" >&2
+              fi
+
+              MSG="netdata: $status ''${name//_/ } ($chart): ''${summary//_/ } = $value_string"
+
+              echo "$0: INFO: sending message: $MSG" >&2
+              ${
+                mkIrcMessager {
+                  name = "trivial-send-to-irc";
+                  msgExpr = "\"$1\"";
+                }
+              } "$MSG"
+            '';
+          };
+        };
+      };
+    };
+  };
+}