k shack/alerting: prepare irc-alerts for binaergewitter
This commit is contained in:
parent
6e44c39fe0
commit
d8de7ad706
@ -1,28 +1,12 @@
|
||||
{ lib,... }:
|
||||
let
|
||||
disk_free_threshold = "10"; # at least this much free disk percentage
|
||||
disk_free_threshold = "5"; # at least this much free disk percentage
|
||||
in {
|
||||
services.prometheus.rules = [(builtins.toJSON
|
||||
{
|
||||
groups = [
|
||||
{ name = "shack-env";
|
||||
rules = [
|
||||
{
|
||||
alert = "Wolf RootPartitionFull";
|
||||
for = "30m";
|
||||
expr = ''(node_filesystem_avail_bytes{alias="wolf.shack",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="wolf.shack",mountpoint="/"} < ${disk_free_threshold}'';
|
||||
labels.severity = "warning";
|
||||
annotations.summary = "{{ $labels.alias }} root disk full";
|
||||
annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=wolf";
|
||||
annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%). CI for deploying new configuration will seize working. Log in to the system and try to clean up the obsolete files on the machine. There are a couple of things you can do:
|
||||
1. `nix-collect-garbage -d`
|
||||
2. clean up the shack share folder in `/home/share`
|
||||
3. check `du -hs /var/ | sort -h`.
|
||||
4. run `docker system prune`
|
||||
5. `find /var/lib/containers/news/var/lib/htgen-go/items -mtime +7 -delete;` to clean up the link shortener data
|
||||
5. If you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete
|
||||
6. as a last resort the root disk can be expanded via `lvresize -L +10G /dev/pool/root && btrfs filesystem resize max /` '';
|
||||
}
|
||||
{
|
||||
alert = "Puyak RootPartitionFull";
|
||||
for = "30m";
|
||||
@ -32,9 +16,8 @@ in {
|
||||
annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=puyak";
|
||||
annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%).Prometheus will not be able to create new alerts and CI for deploying new configuration will also seize working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/ | sort -h`, run `docker system prune` or if you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete'';
|
||||
}
|
||||
# wolf.shack is not worth supervising anymore
|
||||
{
|
||||
alert = "HostDown";
|
||||
alert = "Infra01 down";
|
||||
expr = ''up{alias="infra01.shack"} == 0'';
|
||||
for = "5m";
|
||||
labels.severity = "page";
|
||||
|
207
krebs/2configs/shack/prometheus/irc-alerts.py
Normal file
207
krebs/2configs/shack/prometheus/irc-alerts.py
Normal file
@ -0,0 +1,207 @@
|
||||
import base64
|
||||
import cgi
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import ssl
|
||||
import sys
|
||||
from http.server import BaseHTTPRequestHandler
|
||||
from typing import List, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
DEBUG = os.environ.get("DEBUG") is not None
|
||||
|
||||
|
||||
def _irc_send(
    server: str,
    nick: str,
    channel: str,
    sasl_password: Optional[str] = None,
    server_password: Optional[str] = None,
    tls: bool = True,
    port: int = 6697,
    messages: Optional[List[str]] = None,
) -> None:
    """Connect to an IRC server, post ``messages`` to ``channel``, then quit.

    Args:
        server: Host name or address of the IRC server.
        nick: Nickname; also used as user name, real name and SASL identity.
        channel: Channel to join and post to (including the leading ``#``).
        sasl_password: If set, SASL PLAIN authentication is attempted while
            registering.
        server_password: If set, sent with ``PASS`` before registering.
        tls: Wrap the connection in TLS. The server certificate is NOT
            verified (encryption only).
        port: TCP port of the server.
        messages: Texts to post. When empty or ``None`` no connection is
            opened at all.

    Raises:
        OSError: If connecting to, or talking to, the server fails.
    """
    if not messages:
        return

    def _send(command: str, secret: bool = False) -> None:
        if DEBUG:
            # Never echo credentials into the log; show only the verb.
            print(f"{command.split(' ', 1)[0]} <redacted>" if secret else command)
        # sendall() retries partial writes, plain send() may truncate.
        sock.sendall(f"{command}\r\n".encode())

    def _pong(line: str) -> None:
        # Answer keep-alives; the reply must be a CRLF-terminated line.
        if line.startswith("PING"):
            _send("PONG" + line[4:].rstrip("\r\n"))

    sock = socket.socket()
    recv_file = None
    try:
        if tls:
            # ssl.wrap_socket() no longer exists in Python 3.12+. Keep the
            # previous policy: TLS 1.2 or newer, certificate not verified.
            context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            context.minimum_version = ssl.TLSVersion.TLSv1_2
            sock = context.wrap_socket(sock, server_hostname=server)

        print(f"connect {server}:{port}")
        sock.connect((server, port))
        # Servers may relay arbitrary bytes; never die on a decoding error.
        recv_file = sock.makefile(mode="r", encoding="utf-8", errors="replace")

        if server_password:
            _send(f"PASS {server_password}", secret=True)
        if sasl_password:
            # Capability negotiation has to start before NICK/USER so the
            # server holds back registration until CAP END.
            _send("CAP REQ :sasl")
        _send(f"USER {nick} 0 * :{nick}")
        _send(f"NICK {nick}")
        if sasl_password:
            _send("AUTHENTICATE PLAIN")
            # b64encode() emits a single line; encodebytes() would append a
            # newline (and wrap long input), corrupting the IRC command.
            auth = base64.b64encode(
                f"{nick}\0{nick}\0{sasl_password}".encode("utf-8")
            )
            _send(f"AUTHENTICATE {auth.decode('ascii')}", secret=True)
            _send("CAP END")

        # Wait until registration is complete (MODE / end of MOTD).
        for line in recv_file:
            if re.match(r"^:[^ ]* (MODE|221|376|422) ", line):
                break
            _pong(line)

        _send(f"JOIN :{channel}")

        for message in messages:
            # One PRIVMSG per line: an embedded newline would otherwise end
            # the command and the remainder would be parsed as a new one.
            for text in message.splitlines():
                if text.strip():
                    _send(f"PRIVMSG {channel} :{text}")

        _send("INFO")
        for line in recv_file:
            if DEBUG:
                print(line, end="")
            # Assume INFO reply means we are done
            if "End of /INFO" in line:
                break
            _pong(line)

        _send("QUIT")
        print("disconnect")
    finally:
        if recv_file is not None:
            recv_file.close()
        sock.close()
|
||||
|
||||
|
||||
def irc_send(
    url: str, notifications: List[str], password: Optional[str] = None
) -> None:
    """Send ``notifications`` to the IRC target encoded in ``url``.

    The URL supplies nick (user part), server (host), port, channel
    (fragment) and optionally a password; missing parts fall back to the
    defaults below. TLS is used only for the ``irc+tls`` scheme. An explicit
    ``password`` argument wins over the one embedded in the URL. Nothing is
    sent when ``notifications`` is empty.
    """
    target = urlparse(f"{url}")

    nick = target.username or "prometheus"
    host = target.hostname or "chat.freenode.net"
    room = "#krebs-announce" if target.fragment == "" else f"#{target.fragment}"
    tcp_port = target.port or 6697
    secret = password if password else target.password

    if len(notifications) > 0:
        _irc_send(
            server=host,
            nick=nick,
            sasl_password=secret,
            channel=room,
            port=tcp_port,
            messages=notifications,
            tls=target.scheme == "irc+tls",
        )
|
||||
|
||||
|
||||
class PrometheusWebHook(BaseHTTPRequestHandler):
    """Handle one HTTP connection carrying a Prometheus alert webhook.

    The handler is driven directly from an accepted socket instead of a
    ``socketserver`` instance, so ``BaseHTTPRequestHandler.__init__`` is
    intentionally not called; the attributes ``handle()`` needs are set up
    by hand. Constructing an instance processes the request and closes the
    connection.
    """

    def __init__(
        self,
        irc_url: str,
        conn: socket.socket,
        addr: Tuple[str, int],
        password: Optional[str] = None,
    ) -> None:
        """Serve the request pending on ``conn``.

        Args:
            irc_url: IRC target URL handed to ``irc_send``.
            conn: Accepted client connection.
            addr: Peer address as returned by ``socket.accept()``.
            password: Optional IRC password handed to ``irc_send``.
        """
        self.irc_url = irc_url
        self.password = password
        self.rfile = conn.makefile("rb")
        self.wfile = conn.makefile("wb")
        self.client_address = addr
        try:
            self.handle()
        finally:
            # Release the connection deterministically rather than relying
            # on garbage collection.
            for stream in (self.wfile, self.rfile):
                try:
                    stream.close()
                except OSError:
                    # Peer already gone; nothing left to flush.
                    pass
            conn.close()

    def _reject(self, reason: str) -> None:
        """Answer with ``400 Bad Request`` and log ``reason`` in debug mode."""
        if DEBUG:
            print(f"POST: {reason}")
        self.send_response(400)
        self.end_headers()

    # for testing
    def do_GET(self) -> None:
        """Reply ``200 ok``; also used as the success response of POST."""
        if DEBUG:
            print("GET: Request Received")
        self.send_response(200)
        self.send_header("Content-type", "text/plain")
        self.end_headers()
        self.wfile.write(b"ok")

    def do_POST(self) -> None:
        """Forward each alert of a JSON webhook payload to IRC.

        Every entry of ``payload["alerts"]`` becomes one message of the form
        ``"<status>: <annotations.description>"``. Requests that are not
        JSON, lack a usable Content-Length, or do not have that structure
        are answered with ``400`` instead of raising.
        """
        if DEBUG:
            print("POST: Request Received")

        # get_content_type() drops parameters, lower-cases the value and
        # falls back to text/plain when the header is absent.
        content_type = self.headers.get_content_type()

        # refuse to receive non-json content
        if content_type != "application/json":
            self._reject(f"wrong content type {content_type}")
            return

        try:
            length = int(self.headers.get("content-length", ""))
            if length < 0:
                # read(-1) would block until the peer closes the connection.
                raise ValueError("negative content-length")
            payload = json.loads(self.rfile.read(length))
            messages = [
                f"{alert['status']}: {alert['annotations']['description']}"
                for alert in payload["alerts"]
            ]
        except (ValueError, KeyError, TypeError) as err:
            self._reject(f"malformed request: {err!r}")
            return

        irc_send(self.irc_url, messages, password=self.password)

        self.do_GET()
|
||||
|
||||
|
||||
def systemd_socket_response() -> None:
    """Entry point: send command-line messages or serve socket-activated hooks.

    Configuration comes from the environment:

    * ``IRC_URL`` (required): IRC target URL.
    * ``IRC_PASSWORD_FILE`` (optional): file holding the IRC password.
    * ``LISTEN_FDS``: number of listening sockets passed in, starting at
      file descriptor 3.

    If command-line arguments are present they are sent as messages and the
    function returns. Otherwise every connection currently pending on the
    passed sockets is handled once, then the function returns.

    Exits with status 1 when ``IRC_URL`` is missing, or when ``LISTEN_FDS``
    is missing and there are no messages on the command line.
    """
    irc_url = os.environ.get("IRC_URL")
    if irc_url is None:
        print(
            "IRC_URL environment variable not set: i.e. IRC_URL=irc+tls://mic92-prometheus@chat.freenode.net/#krebs-announce",
            file=sys.stderr,
        )
        sys.exit(1)

    password = None
    irc_password_file = os.environ.get("IRC_PASSWORD_FILE")
    if irc_password_file:
        with open(irc_password_file) as f:
            # Secret files usually end with a newline; it is not part of the
            # password and would break the line-based IRC protocol.
            password = f.read().rstrip("\r\n")

    msgs = sys.argv[1:]
    if msgs:
        irc_send(irc_url, msgs, password=password)
        return

    nfds = os.environ.get("LISTEN_FDS")
    if nfds is None:
        print(
            "LISTEN_FDS not set. Run me with systemd(TM) socket activation?",
            file=sys.stderr,
        )
        sys.exit(1)

    for fd in range(3, 3 + int(nfds)):
        # fromfd() duplicates the descriptor; the with-block closes the copy.
        # NOTE(review): the family is hard-coded to AF_INET while the socket
        # unit may listen on an IPv6 address; confirm this is intended.
        with socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM) as sock:
            # Non-blocking: accept() raises once the backlog is drained.
            sock.settimeout(0)
            try:
                while True:
                    PrometheusWebHook(irc_url, *sock.accept(), password=password)
            except BlockingIOError:
                # no more connections
                pass
|
||||
|
||||
|
||||
# Script entry point: with exactly two arguments (URL, message) send that one
# message directly; otherwise fall back to the environment-driven mode.
if __name__ == "__main__":
    if DEBUG:
        print("Starting in DEBUG mode")
    if len(sys.argv) != 3:
        systemd_socket_response()
    else:
        target_url, text = sys.argv[1], sys.argv[2]
        print(f"{target_url} {text}")
        irc_send(target_url, [text])
|
59
krebs/2configs/shack/prometheus/irc-hooks.nix
Normal file
59
krebs/2configs/shack/prometheus/irc-hooks.nix
Normal file
@ -0,0 +1,59 @@
|
||||
{ config
, lib
, pkgs
, ...
}:
let
  # The webhook receiver script, wrapped as an executable by the Python
  # writer; flake8 check E501 is ignored for it.
  irc-alerts = pkgs.writers.writePython3 "irc-alerts"
    {
      flakeIgnore = [ "E501" ];
    }
    (builtins.readFile ./irc-alerts.py);

  # One entry per alert sink: the IRC URL to post to and the TCP port the
  # corresponding socket unit listens on. An optional `passwordFile`
  # attribute switches on the password handling below.
  endpoints = {
    binaergewitter = {
      url = "irc+tls://puyak-alerts@irc.libera.chat:6697/#binaergewitter-alerts";
      port = 9223;
    };
  };

  # Socket and service share the same unit name and description.
  unitName = name: "irc-alerts-${name}";
  unitDescription = name: "Receive http hook and send irc message for ${name}";

  # Socket unit: listens on the endpoint's port.
  mkSocket = name: opts:
    lib.nameValuePair (unitName name) {
      description = unitDescription name;
      wantedBy = [ "sockets.target" ];
      listenStreams = [ "[::]:${toString opts.port}" ];
    };

  # Service unit: runs the script against the sockets passed in by systemd.
  mkService = name: opts:
    let
      serviceName = unitName name;
      hasPassword = opts ? passwordFile && opts.passwordFile != null;
      passwordTarget = "/run/${serviceName}/password";

      baseConfig = {
        Environment =
          [
            "IRC_URL=${opts.url}"
            "DEBUG=y"
          ]
          ++ lib.optionals hasPassword [ "IRC_PASSWORD_FILE=${passwordTarget}" ];
        DynamicUser = true;
        User = serviceName;
        ExecStart = irc-alerts;
      };

      # Only with a password: copy the secret into the runtime directory,
      # owned by the service user, before the service starts.
      # NOTE(review): the source is the sops secret path, not
      # opts.passwordFile, although the latter gates this block - confirm.
      passwordConfig = {
        PermissionsStartOnly = true;
        ExecStartPre =
          "${pkgs.coreutils}/bin/install -m400 "
          + "-o ${serviceName} -g ${serviceName} "
          + "${config.sops.secrets.prometheus-irc-password.path} "
          + passwordTarget;
        RuntimeDirectory = serviceName;
      };
    in
    lib.nameValuePair serviceName {
      description = unitDescription name;
      requires = [ "${serviceName}.socket" ];
      serviceConfig = baseConfig // lib.optionalAttrs hasPassword passwordConfig;
    };
in
{
  systemd.sockets = lib.mapAttrs' mkSocket endpoints;

  systemd.services = lib.mapAttrs' mkService endpoints;
}
|
@ -3,6 +3,7 @@
|
||||
{
|
||||
imports = [
|
||||
./alert-rules.nix
|
||||
./irc-hooks.nix
|
||||
];
|
||||
networking = {
|
||||
firewall.allowedTCPPorts = [
|
||||
@ -129,11 +130,11 @@
|
||||
"group_wait" = "30s";
|
||||
"group_interval" = "2m";
|
||||
"repeat_interval" = "4h";
|
||||
"receiver" = "team-admins";
|
||||
"receiver" = "shack-admins";
|
||||
};
|
||||
"receivers" = [
|
||||
{
|
||||
"name" = "team-admins";
|
||||
"name" = "shack-admins";
|
||||
"email_configs" = [ ];
|
||||
"webhook_configs" = [
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user