about summary refs log tree commit diff
path: root/users/sterni/machines/ingeborg/monitoring.nix
blob: e6fcf33a52da0458716be0d4a3ff1d7c04aa2f86 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
{ pkgs, lib, config, depot, ... }:

let
  ircChannel = "#sterni.lv";
  irccatPort =
    builtins.replaceStrings [ ":" ] [ "" ]
      config.services.depot.irccat.config.tcp.listen;

  send-irc-msg = pkgs.writeShellScript "send-irc-msg" ''
    set -euo pipefail
    printf '%s %s\n' ${lib.escapeShellArg ircChannel} "$1" | \
      ${lib.getBin pkgs.netcat-openbsd}/bin/nc -N localhost ${irccatPort}
  '';

  netdataPort = 19999;
in

{
  imports = [
    ./http/nginx.nix
    ./irccat.nix
  ];

  config = {
    services.depot.irccat.config.irc.channels = [
      ircChannel
    ];

    # Since we have irccat we can wire up mdadm --monitor
    boot.swraid.mdadmConf = ''
      PROGRAM ${
        pkgs.writeShellScript "mdmonitor-to-irc" ''
          ${send-irc-msg} "mdmonitor: $1($2''${3:+, $3})"
        ''
      }
    '';

    # TODO(sterni): irc notifications (?)
    services = {
      smartd = {
        enable = true;
        autodetect = true;
        # Short self test every day 03:00
        # Long self test every tuesday 05:00
        defaults.autodetected = "-a -o on -s (S/../.././03|L/../../2/05)";
        extraOptions = [
          "-A"
          "/var/log/smartd/"
        ];
      };

      netdata = {
        enable = true;
        config = {
          logs = {
            access = "syslog";
            error = "syslog";
            debug = "syslog";
            health = "syslog";
            collector = "syslog";
          };
          web = {
            "default port" = toString netdataPort;
            "bind to" = "localhost:${toString netdataPort}";
          };
          health = {
            "script to execute on alarm" = pkgs.writeShellScript "simple-alarm-notify" ''
              set -euo pipefail

              # This humongous list is copied over from netdata's alarm-notify.sh
              roles="''${1}"               # the roles that should be notified for this event
              args_host="''${2}"           # the host generated this event
              unique_id="''${3}"           # the unique id of this event
              alarm_id="''${4}"            # the unique id of the alarm that generated this event
              event_id="''${5}"            # the incremental id of the event, for this alarm id
              when="''${6}"                # the timestamp this event occurred
              name="''${7}"                # the name of the alarm, as given in netdata health.d entries
              chart="''${8}"               # the name of the chart (type.id)
              status="''${9}"              # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
              old_status="''${10}"         # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
              value="''${11}"              # the current value of the alarm
              old_value="''${12}"          # the previous value of the alarm
              src="''${13}"                # the line number and file the alarm has been configured
              duration="''${14}"           # the duration in seconds of the previous alarm state
              non_clear_duration="''${15}" # the total duration in seconds this is/was non-clear
              units="''${16}"              # the units of the value
              info="''${17}"               # a short description of the alarm
              value_string="''${18}"       # friendly value (with units)
              # shellcheck disable=SC2034
              # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947
              old_value_string="''${19}"   # friendly old value (with units), previously named "old_value_string"
              calc_expression="''${20}"    # contains the expression that was evaluated to trigger the alarm
              calc_param_values="''${21}"  # the values of the parameters in the expression, at the time of the evaluation
              total_warnings="''${22}"     # Total number of alarms in WARNING state
              total_critical="''${23}"     # Total number of alarms in CRITICAL state
              total_warn_alarms="''${24}"  # List of alarms in warning state
              total_crit_alarms="''${25}"  # List of alarms in critical state
              classification="''${26}"     # The class field from .conf files
              edit_command_line="''${27}"  # The command to edit the alarm, with the line number
              child_machine_guid="''${28}" # the machine_guid of the child
              transition_id="''${29}"      # the transition_id of the alert
              summary="''${30}"            # the summary text field of the alert

              # Verify that they haven't extended the arg list
              ARG_COUNT_EXPECTED=30

              if [[ "$#" != "$ARG_COUNT_EXPECTED" ]]; then
                echo "$0: WARNING: unexpected number of arguments: $#. Did netdata add more?" >&2
              fi

              MSG="netdata: $status ''${name//_/ } ($chart): ''${summary//_/ } = $value_string"

              # Filter rules by chart name. This is necessary, since the "enabled alarms"
              # filter only allows for filtering alarm types, not specific alarms
              # belonging to that alarm.
              case "$chart" in
                # netdata prefers the automatically assigned names (dm-<n>, md<n>,
                # sd<c>) over ids for alerts, so this configuration assumes that
                # we have two physical disks which we kind of assert using the
                # grub configuration (it is more difficult with the soft raid
                # config).
                # ${assert builtins.length config.boot.loader.grub.devices == 2; ""}
                disk_util.sda | disk_util.sdb | disk_backlog.sda | disk_backlog.sdb)

                  ;;
                disk_util.* | disk_backlog.*)
                  echo "$0: INFO: DISCARDING message: $MSG" >&2
                  exit 0
                  ;;
                *)
                  ;;
              esac

              echo "$0: INFO: sending message: $MSG" >&2
              ${send-irc-msg} "$MSG"
            '';
          };
        };
      };

      # https://learn.netdata.cloud/docs/netdata-agent/configuration/running-the-netdata-agent-behind-a-reverse-proxy/nginx
      nginx.virtualHosts."monitoring.sterni.lv" = {
        forceSSL = true;
        enableACME = true;
        extraConfig = ''
          auth_basic "netdata";
          auth_basic_user_file ${config.age.secretsDir}/netdata-htpasswd;

          location / {
            proxy_set_header X-Forwarded-Host $host;
            proxy_set_header X-Forwarded-Server $host;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_pass http://127.0.0.1:${toString netdataPort};
            proxy_http_version 1.1;
            proxy_pass_request_headers on;
            proxy_set_header Connection "keep-alive";
            proxy_store off;
          }
        '';
      };
    };

    age.secrets.netdata-htpasswd = {
      file = depot.users.sterni.secrets."netdata-htpasswd.age";
      inherit (config.services.nginx) group;
      owner = config.services.nginx.user;
      mode = "700";
    };
  };
}