1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
|
{ pkgs, lib, config, ... }:
let
ircChannel = "#sterni.lv";
irccatPort =
builtins.replaceStrings [ ":" ] [ "" ]
config.services.depot.irccat.config.tcp.listen;
mkIrcMessager =
{ name
, msgExpr
}:
pkgs.writeShellScript name ''
set -euo pipefail
printf '%s %s\n' ${lib.escapeShellArg ircChannel} ${msgExpr} | \
${lib.getBin pkgs.netcat-openbsd}/bin/nc -N localhost ${irccatPort}
'';
netdataPort = 19999;
in
{
imports = [
./irccat.nix
];
config = {
services.depot.irccat.config.irc.channels = [
ircChannel
];
# Since we have irccat we can wire up mdadm --monitor
boot.swraid.mdadmConf = ''
PROGRAM ${
mkIrcMessager {
name = "mdmonitor-to-irc";
# prog EVENT MD_DEVICE COMPONENT_DEVICE
msgExpr = ''"mdmonitor: $1($2''${3:+, $3})"'';
}
}
'';
# TODO(sterni): irc notifications (?)
services = {
smartd = {
enable = true;
autodetect = true;
# Short self test every day 03:00
# Long self test every tuesday 05:00
defaults.autodetected = "-a -o on -s (S/../.././03|L/../../2/05)";
extraOptions = [
"-A"
"/var/log/smartd/"
];
};
netdata = {
enable = true;
config = {
logs = {
access = "syslog";
error = "syslog";
debug = "syslog";
health = "syslog";
collector = "syslog";
};
web = {
"default port" = toString netdataPort;
"bind to" = "localhost:${toString netdataPort}";
};
health = {
"script to execute on alarm" = pkgs.writeShellScript "simple-alarm-notify" ''
set -euo pipefail
# This humongous list is copied over from netdata's alarm-notify.sh
roles="''${1}" # the roles that should be notified for this event
args_host="''${2}" # the host generated this event
unique_id="''${3}" # the unique id of this event
alarm_id="''${4}" # the unique id of the alarm that generated this event
event_id="''${5}" # the incremental id of the event, for this alarm id
when="''${6}" # the timestamp this event occurred
name="''${7}" # the name of the alarm, as given in netdata health.d entries
chart="''${8}" # the name of the chart (type.id)
status="''${9}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
old_status="''${10}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
value="''${11}" # the current value of the alarm
old_value="''${12}" # the previous value of the alarm
src="''${13}" # the line number and file the alarm has been configured
duration="''${14}" # the duration in seconds of the previous alarm state
non_clear_duration="''${15}" # the total duration in seconds this is/was non-clear
units="''${16}" # the units of the value
info="''${17}" # a short description of the alarm
value_string="''${18}" # friendly value (with units)
# shellcheck disable=SC2034
# variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947
old_value_string="''${19}" # friendly old value (with units), previously named "old_value_string"
calc_expression="''${20}" # contains the expression that was evaluated to trigger the alarm
calc_param_values="''${21}" # the values of the parameters in the expression, at the time of the evaluation
total_warnings="''${22}" # Total number of alarms in WARNING state
total_critical="''${23}" # Total number of alarms in CRITICAL state
total_warn_alarms="''${24}" # List of alarms in warning state
total_crit_alarms="''${25}" # List of alarms in critical state
classification="''${26}" # The class field from .conf files
edit_command_line="''${27}" # The command to edit the alarm, with the line number
child_machine_guid="''${28}" # the machine_guid of the child
transition_id="''${29}" # the transition_id of the alert
summary="''${30}" # the summary text field of the alert
# Verify that they haven't extended the arg list
ARG_COUNT_EXPECTED=30
if [[ "$#" != "$ARG_COUNT_EXPECTED" ]]; then
echo "$0: WARNING: unexpected number of arguments: $#. Did netdata add more?" >&2
fi
MSG="netdata: $status ''${name//_/ } ($chart): ''${summary//_/ } = $value_string"
# Filter rules by chart name. This is necessary, since the "enabled alarms"
# filter only allows for filtering alarm types, not specific alarms
# belonging to that alarm.
case "$chart" in
# netdata prefers the automatically assigned names (dm-<n>, md<n>,
# sd<c>) over ids for alerts, so this configuration assumes that
# we have two physical disks which we kind of assert using the
# grub configuration (it is more difficult with the soft raid
# config).
# ${assert builtins.length config.boot.loader.grub.devices == 2; ""}
disk_util.sda | disk_util.sdb | disk_backlog.sda | disk_backlog.sdb)
;;
disk_util.* | disk_backlog.*)
echo "$0: INFO: DISCARDING message: $MSG" >&2
exit 0
;;
*)
;;
esac
echo "$0: INFO: sending message: $MSG" >&2
${
mkIrcMessager {
name = "trivial-send-to-irc";
msgExpr = "\"$1\"";
}
} "$MSG"
'';
};
};
};
};
};
}
|