k shack/alerting: prepare irc-alerts for binaergewitter

This commit is contained in:
makefu 2023-04-16 11:53:14 +02:00
parent 6e44c39fe0
commit d8de7ad706
No known key found for this signature in database
GPG Key ID: 36F7711F3FC0F225
4 changed files with 271 additions and 21 deletions

View File

@ -1,28 +1,12 @@
{ lib,... }: { lib,... }:
let let
disk_free_threshold = "10"; # at least this much free disk percentage disk_free_threshold = "5"; # at least this much free disk percentage
in { in {
services.prometheus.rules = [(builtins.toJSON services.prometheus.rules = [(builtins.toJSON
{ {
groups = [ groups = [
{ name = "shack-env"; { name = "shack-env";
rules = [ rules = [
{
alert = "Wolf RootPartitionFull";
for = "30m";
expr = ''(node_filesystem_avail_bytes{alias="wolf.shack",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="wolf.shack",mountpoint="/"} < ${disk_free_threshold}'';
labels.severity = "warning";
annotations.summary = "{{ $labels.alias }} root disk full";
annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=wolf";
annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%). CI for deploying new configuration will seize working. Log in to the system and try to clean up the obsolete files on the machine. There are a couple of things you can do:
1. `nix-collect-garbage -d`
2. clean up the shack share folder in `/home/share`
3. check `du -hs /var/ | sort -h`.
4. run `docker system prune`
5. `find /var/lib/containers/news/var/lib/htgen-go/items -mtime +7 -delete;` to clean up the link shortener data
5. If you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete
6. as a last resort the root disk can be expanded via `lvresize -L +10G /dev/pool/root && btrfs filesystem resize max /` '';
}
{ {
alert = "Puyak RootPartitionFull"; alert = "Puyak RootPartitionFull";
for = "30m"; for = "30m";
@ -32,9 +16,8 @@ in {
annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=puyak"; annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=puyak";
annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%).Prometheus will not be able to create new alerts and CI for deploying new configuration will also seize working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/ | sort -h`, run `docker system prune` or if you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete''; annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%).Prometheus will not be able to create new alerts and CI for deploying new configuration will also seize working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/ | sort -h`, run `docker system prune` or if you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete'';
} }
# wolf.shack is not worth supervising anymore
{ {
alert = "HostDown"; alert = "Infra01 down";
expr = ''up{alias="infra01.shack"} == 0''; expr = ''up{alias="infra01.shack"} == 0'';
for = "5m"; for = "5m";
labels.severity = "page"; labels.severity = "page";

View File

@ -0,0 +1,207 @@
import base64
import cgi
import json
import os
import re
import socket
import ssl
import sys
from http.server import BaseHTTPRequestHandler
from typing import List, Optional, Tuple
from urllib.parse import urlparse
DEBUG = os.environ.get("DEBUG") is not None
def _irc_send(
server: str,
nick: str,
channel: str,
sasl_password: Optional[str] = None,
server_password: Optional[str] = None,
tls: bool = True,
port: int = 6697,
messages: List[str] = [],
) -> None:
if not messages:
return
sock = socket.socket()
if tls:
sock = ssl.wrap_socket(
sock, cert_reqs=ssl.CERT_NONE, ssl_version=ssl.PROTOCOL_TLSv1_2
)
def _send(command: str) -> int:
if DEBUG:
print(command)
return sock.send((f"{command}\r\n").encode())
def _pong(ping: str):
if ping.startswith("PING"):
sock.send(ping.replace("PING", "PONG").encode("ascii"))
recv_file = sock.makefile(mode="r")
print(f"connect {server}:{port}")
sock.connect((server, port))
if server_password:
_send(f"PASS {server_password}")
_send(f"USER {nick} 0 * :{nick}")
_send(f"NICK {nick}")
for line in recv_file.readline():
if re.match(r"^:[^ ]* (MODE|221|376|422) ", line):
break
else:
_pong(line)
if sasl_password:
_send("CAP REQ :sasl")
_send("AUTHENTICATE PLAIN")
auth = base64.encodebytes(f"{nick}\0{nick}\0{sasl_password}".encode("utf-8"))
_send(f"AUTHENTICATE {auth.decode('ascii')}")
_send("CAP END")
_send(f"JOIN :{channel}")
for m in messages:
_send(f"PRIVMSG {channel} :{m}")
_send("INFO")
for line in recv_file:
if DEBUG:
print(line, end="")
# Assume INFO reply means we are done
if "End of /INFO" in line:
break
else:
_pong(line)
sock.send(b"QUIT")
print("disconnect")
sock.close()
def irc_send(
url: str, notifications: List[str], password: Optional[str] = None
) -> None:
parsed = urlparse(f"{url}")
username = parsed.username or "prometheus"
server = parsed.hostname or "chat.freenode.net"
if parsed.fragment != "":
channel = f"#{parsed.fragment}"
else:
channel = "#krebs-announce"
port = parsed.port or 6697
if not password:
password = parsed.password
if len(notifications) == 0:
return
_irc_send(
server=server,
nick=username,
sasl_password=password,
channel=channel,
port=port,
messages=notifications,
tls=parsed.scheme == "irc+tls",
)
class PrometheusWebHook(BaseHTTPRequestHandler):
def __init__(
self,
irc_url: str,
conn: socket.socket,
addr: Tuple[str, int],
password: Optional[str] = None,
) -> None:
self.irc_url = irc_url
self.password = password
self.rfile = conn.makefile("rb")
self.wfile = conn.makefile("wb")
self.client_address = addr
self.handle()
# for testing
def do_GET(self) -> None:
if DEBUG:
print("GET: Request Received")
self.send_response(200)
self.send_header("Content-type", "text/plain")
self.end_headers()
self.wfile.write(b"ok")
def do_POST(self) -> None:
if DEBUG:
print("POST: Request Received")
content_type, _ = cgi.parse_header(self.headers.get("content-type"))
# refuse to receive non-json content
if content_type != "application/json":
if DEBUG:
print(f"POST: wrong content type {content_type}")
self.send_response(400)
self.end_headers()
return
length = int(self.headers.get("content-length"))
payload = json.loads(self.rfile.read(length))
messages = []
for alert in payload["alerts"]:
description = alert["annotations"]["description"]
messages.append(f"{alert['status']}: {description}")
irc_send(self.irc_url, messages, password=self.password)
self.do_GET()
def systemd_socket_response() -> None:
irc_url = os.environ.get("IRC_URL", None)
if irc_url is None:
print(
"IRC_URL environment variable not set: i.e. IRC_URL=irc+tls://mic92-prometheus@chat.freenode.net/#krebs-announce",
file=sys.stderr,
)
sys.exit(1)
password = None
irc_password_file = os.environ.get("IRC_PASSWORD_FILE", None)
if irc_password_file:
with open(irc_password_file) as f:
password = f.read()
msgs = sys.argv[1:]
if msgs != []:
irc_send(irc_url, msgs, password=password)
return
nfds = os.environ.get("LISTEN_FDS", None)
if nfds is None:
print(
"LISTEN_FDS not set. Run me with systemd(TM) socket activation?",
file=sys.stderr,
)
sys.exit(1)
fds = range(3, 3 + int(nfds))
for fd in fds:
sock = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(0)
try:
while True:
PrometheusWebHook(irc_url, *sock.accept(), password=password)
except BlockingIOError:
# no more connections
pass
if __name__ == "__main__":
if DEBUG:
print("Starting in DEBUG mode")
if len(sys.argv) == 3:
print(f"{sys.argv[1]} {sys.argv[2]}")
irc_send(sys.argv[1], [sys.argv[2]])
else:
systemd_socket_response()

View File

@ -0,0 +1,59 @@
{ config
, lib
, pkgs
, ...
}:
let
irc-alerts = pkgs.writers.writePython3 "irc-alerts" {
flakeIgnore = [ "E501" ];
} (builtins.readFile ./irc-alerts.py);
endpoints = {
binaergewitter = {
url = "irc+tls://puyak-alerts@irc.libera.chat:6697/#binaergewitter-alerts";
port = 9223;
};
};
in
{
systemd.sockets =
lib.mapAttrs'
(name: opts:
lib.nameValuePair "irc-alerts-${name}" {
description = "Receive http hook and send irc message for ${name}";
wantedBy = [ "sockets.target" ];
listenStreams = [ "[::]:${builtins.toString opts.port}" ];
}) endpoints;
systemd.services =
lib.mapAttrs'
(name: opts:
let
serviceName = "irc-alerts-${name}";
hasPassword = opts.passwordFile or null != null;
in
lib.nameValuePair serviceName {
description = "Receive http hook and send irc message for ${name}";
requires = [ "irc-alerts-${name}.socket" ];
serviceConfig =
{
Environment =
[
"IRC_URL=${opts.url}"
"DEBUG=y"
]
++ lib.optional hasPassword "IRC_PASSWORD_FILE=/run/${serviceName}/password";
DynamicUser = true;
User = serviceName;
ExecStart = irc-alerts;
}
// lib.optionalAttrs hasPassword {
PermissionsStartOnly = true;
ExecStartPre =
"${pkgs.coreutils}/bin/install -m400 "
+ "-o ${serviceName} -g ${serviceName} "
+ "${config.sops.secrets.prometheus-irc-password.path} "
+ "/run/${serviceName}/password";
RuntimeDirectory = serviceName;
};
}) endpoints;
}

View File

@ -3,6 +3,7 @@
{ {
imports = [ imports = [
./alert-rules.nix ./alert-rules.nix
./irc-hooks.nix
]; ];
networking = { networking = {
firewall.allowedTCPPorts = [ firewall.allowedTCPPorts = [
@ -129,11 +130,11 @@
"group_wait" = "30s"; "group_wait" = "30s";
"group_interval" = "2m"; "group_interval" = "2m";
"repeat_interval" = "4h"; "repeat_interval" = "4h";
"receiver" = "team-admins"; "receiver" = "shack-admins";
}; };
"receivers" = [ "receivers" = [
{ {
"name" = "team-admins"; "name" = "shack-admins";
"email_configs" = [ ]; "email_configs" = [ ];
"webhook_configs" = [ "webhook_configs" = [
{ {