about summary refs log tree commit diff
path: root/users/sterni/machines/ingeborg/monitoring.nix
diff options
context:
space:
mode:
Diffstat (limited to 'users/sterni/machines/ingeborg/monitoring.nix')
-rw-r--r--users/sterni/machines/ingeborg/monitoring.nix152
1 files changed, 152 insertions, 0 deletions
diff --git a/users/sterni/machines/ingeborg/monitoring.nix b/users/sterni/machines/ingeborg/monitoring.nix
new file mode 100644
index 0000000000..6244bc5e88
--- /dev/null
+++ b/users/sterni/machines/ingeborg/monitoring.nix
@@ -0,0 +1,152 @@
+{ pkgs, lib, config, ... }:
+
+let
+  ircChannel = "#sterni.lv";
+  irccatPort =
+    builtins.replaceStrings [ ":" ] [ "" ]
+      config.services.depot.irccat.config.tcp.listen;
+
+  mkIrcMessager =
+    { name
+    , msgExpr
+    }:
+    pkgs.writeShellScript name ''
+      set -euo pipefail
+      printf '%s %s\n' ${lib.escapeShellArg ircChannel} ${msgExpr} | \
+        ${lib.getBin pkgs.netcat-openbsd}/bin/nc -N localhost ${irccatPort}
+    '';
+
+  netdataPort = 19999;
+in
+
+{
+  imports = [
+    ./irccat.nix
+  ];
+
+  config = {
+    services.depot.irccat.config.irc.channels = [
+      ircChannel
+    ];
+
+    # Since we have irccat we can wire up mdadm --monitor
+    boot.swraid.mdadmConf = ''
+      PROGRAM ${
+        mkIrcMessager {
+          name = "mdmonitor-to-irc";
+          # prog EVENT MD_DEVICE COMPONENT_DEVICE
+          msgExpr = ''"mdmonitor: $1($2''${3:+, $3})"'';
+        }
+      }
+    '';
+
+    # TODO(sterni): irc notifications (?)
+    services = {
+      smartd = {
+        enable = true;
+        autodetect = true;
+        # Short self test every day 03:00
+        # Long self test every tuesday 05:00
+        defaults.autodetected = "-a -o on -s (S/../.././03|L/../../2/05)";
+        extraOptions = [
+          "-A"
+          "/var/log/smartd/"
+        ];
+      };
+
+      netdata = {
+        enable = true;
+        config = {
+          logs = {
+            access = "syslog";
+            error = "syslog";
+            debug = "syslog";
+            health = "syslog";
+            collector = "syslog";
+          };
+          web = {
+            "default port" = toString netdataPort;
+            "bind to" = "localhost:${toString netdataPort}";
+          };
+          health = {
+            "script to execute on alarm" = pkgs.writeShellScript "simple-alarm-notify" ''
+              set -euo pipefail
+
+              # This humongous list is copied over from netdata's alarm-notify.sh
+              roles="''${1}"               # the roles that should be notified for this event
+              args_host="''${2}"           # the host generated this event
+              unique_id="''${3}"           # the unique id of this event
+              alarm_id="''${4}"            # the unique id of the alarm that generated this event
+              event_id="''${5}"            # the incremental id of the event, for this alarm id
+              when="''${6}"                # the timestamp this event occurred
+              name="''${7}"                # the name of the alarm, as given in netdata health.d entries
+              chart="''${8}"               # the name of the chart (type.id)
+              status="''${9}"              # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+              old_status="''${10}"         # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+              value="''${11}"              # the current value of the alarm
+              old_value="''${12}"          # the previous value of the alarm
+              src="''${13}"                # the line number and file the alarm has been configured
+              duration="''${14}"           # the duration in seconds of the previous alarm state
+              non_clear_duration="''${15}" # the total duration in seconds this is/was non-clear
+              units="''${16}"              # the units of the value
+              info="''${17}"               # a short description of the alarm
+              value_string="''${18}"       # friendly value (with units)
+              # shellcheck disable=SC2034
+              # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947
+              old_value_string="''${19}"   # friendly old value (with units), previously named "old_value_string"
+              calc_expression="''${20}"    # contains the expression that was evaluated to trigger the alarm
+              calc_param_values="''${21}"  # the values of the parameters in the expression, at the time of the evaluation
+              total_warnings="''${22}"     # Total number of alarms in WARNING state
+              total_critical="''${23}"     # Total number of alarms in CRITICAL state
+              total_warn_alarms="''${24}"  # List of alarms in warning state
+              total_crit_alarms="''${25}"  # List of alarms in critical state
+              classification="''${26}"     # The class field from .conf files
+              edit_command_line="''${27}"  # The command to edit the alarm, with the line number
+              child_machine_guid="''${28}" # the machine_guid of the child
+              transition_id="''${29}"      # the transition_id of the alert
+              summary="''${30}"            # the summary text field of the alert
+
+              # Verify that they haven't extended the arg list
+              ARG_COUNT_EXPECTED=30
+
+              if [[ "$#" != "$ARG_COUNT_EXPECTED" ]]; then
+                echo "$0: WARNING: unexpected number of arguments: $#. Did netdata add more?" >&2
+              fi
+
+              MSG="netdata: $status ''${name//_/ } ($chart): ''${summary//_/ } = $value_string"
+
+              # Filter rules by chart name. This is necessary, since the "enabled alarms"
+              # filter only allows for filtering alarm types, not specific alarms
+              # belonging to that alarm.
+              case "$chart" in
+                # netdata prefers the automatically assigned names (dm-<n>, md<n>,
+                # sd<c>) over ids for alerts, so this configuration assumes that
+                # we have two physical disks which we kind of assert using the
+                # grub configuration (it is more difficult with the soft raid
+                # config).
+                # ${assert builtins.length config.boot.loader.grub.devices == 2; ""}
+                disk_util.sda | disk_util.sdb | disk_backlog.sda | disk_backlog.sdb)
+
+                  ;;
+                disk_util.* | disk_backlog.*)
+                  echo "$0: INFO: DISCARDING message: $MSG" >&2
+                  exit 0
+                  ;;
+                *)
+                  ;;
+              esac
+
+              echo "$0: INFO: sending message: $MSG" >&2
+              ${
+                mkIrcMessager {
+                  name = "trivial-send-to-irc";
+                  msgExpr = "\"$1\"";
+                }
+              } "$MSG"
+            '';
+          };
+        };
+      };
+    };
+  };
+}