k shack/alerting: prepare irc-alerts for binaergewitter
This commit is contained in:
parent
6e44c39fe0
commit
d8de7ad706
@ -1,28 +1,12 @@
|
|||||||
{ lib,... }:
|
{ lib,... }:
|
||||||
let
|
let
|
||||||
disk_free_threshold = "10"; # at least this much free disk percentage
|
disk_free_threshold = "5"; # at least this much free disk percentage
|
||||||
in {
|
in {
|
||||||
services.prometheus.rules = [(builtins.toJSON
|
services.prometheus.rules = [(builtins.toJSON
|
||||||
{
|
{
|
||||||
groups = [
|
groups = [
|
||||||
{ name = "shack-env";
|
{ name = "shack-env";
|
||||||
rules = [
|
rules = [
|
||||||
{
|
|
||||||
alert = "Wolf RootPartitionFull";
|
|
||||||
for = "30m";
|
|
||||||
expr = ''(node_filesystem_avail_bytes{alias="wolf.shack",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="wolf.shack",mountpoint="/"} < ${disk_free_threshold}'';
|
|
||||||
labels.severity = "warning";
|
|
||||||
annotations.summary = "{{ $labels.alias }} root disk full";
|
|
||||||
annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=wolf";
|
|
||||||
annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%). CI for deploying new configuration will seize working. Log in to the system and try to clean up the obsolete files on the machine. There are a couple of things you can do:
|
|
||||||
1. `nix-collect-garbage -d`
|
|
||||||
2. clean up the shack share folder in `/home/share`
|
|
||||||
3. check `du -hs /var/ | sort -h`.
|
|
||||||
4. run `docker system prune`
|
|
||||||
5. `find /var/lib/containers/news/var/lib/htgen-go/items -mtime +7 -delete;` to clean up the link shortener data
|
|
||||||
5. If you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete
|
|
||||||
6. as a last resort the root disk can be expanded via `lvresize -L +10G /dev/pool/root && btrfs filesystem resize max /` '';
|
|
||||||
}
|
|
||||||
{
|
{
|
||||||
alert = "Puyak RootPartitionFull";
|
alert = "Puyak RootPartitionFull";
|
||||||
for = "30m";
|
for = "30m";
|
||||||
@ -32,9 +16,8 @@ in {
|
|||||||
annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=puyak";
|
annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=puyak";
|
||||||
annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%).Prometheus will not be able to create new alerts and CI for deploying new configuration will also seize working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/ | sort -h`, run `docker system prune` or if you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete'';
|
annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%).Prometheus will not be able to create new alerts and CI for deploying new configuration will also seize working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/ | sort -h`, run `docker system prune` or if you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete'';
|
||||||
}
|
}
|
||||||
# wolf.shack is not worth supervising anymore
|
|
||||||
{
|
{
|
||||||
alert = "HostDown";
|
alert = "Infra01 down";
|
||||||
expr = ''up{alias="infra01.shack"} == 0'';
|
expr = ''up{alias="infra01.shack"} == 0'';
|
||||||
for = "5m";
|
for = "5m";
|
||||||
labels.severity = "page";
|
labels.severity = "page";
|
||||||
|
207
krebs/2configs/shack/prometheus/irc-alerts.py
Normal file
207
krebs/2configs/shack/prometheus/irc-alerts.py
Normal file
@ -0,0 +1,207 @@
|
|||||||
|
import base64
|
||||||
|
import cgi
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import socket
|
||||||
|
import ssl
|
||||||
|
import sys
|
||||||
|
from http.server import BaseHTTPRequestHandler
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
# Verbose tracing is enabled whenever the DEBUG environment variable is set,
# regardless of its value (even an empty string counts as "set").
DEBUG = os.environ.get("DEBUG") is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _irc_send(
    server: str,
    nick: str,
    channel: str,
    sasl_password: Optional[str] = None,
    server_password: Optional[str] = None,
    tls: bool = True,
    port: int = 6697,
    messages: Optional[List[str]] = None,
) -> None:
    """Connect to an IRC server, post ``messages`` to ``channel`` and quit.

    Args:
        server: Host name of the IRC server.
        nick: Nickname; also used as user name, real name and SASL identity.
        channel: Channel to join and to send every message to.
        sasl_password: If given, SASL PLAIN authentication is attempted.
        server_password: If given, sent with ``PASS`` before registering.
        tls: Wrap the connection in TLS (the certificate is NOT verified).
        port: TCP port of the IRC server.
        messages: Lines to send; nothing is done when empty or ``None``.

    Raises:
        OSError: If connecting, reading or writing fails.
    """
    if not messages:
        return

    sock = socket.socket()
    try:
        if tls:
            # ssl.wrap_socket() no longer exists in Python 3.12+, so build an
            # explicit context instead.
            # NOTE(review): certificate and host name verification stay
            # disabled, exactly as before -- consider enabling them.
            context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            context.minimum_version = ssl.TLSVersion.TLSv1_2
            sock = context.wrap_socket(sock, server_hostname=server)

        def _send(command: str) -> None:
            """Send one CRLF-terminated IRC command."""
            if DEBUG:
                print(command)
            # sendall() retries partial writes, send() does not.
            sock.sendall(f"{command}\r\n".encode())

        def _pong(line: str) -> None:
            """Answer a server PING, echoing its token unchanged."""
            if line.startswith("PING"):
                # Only the command word is swapped; the token is kept as-is.
                sock.sendall(f"PONG{line.rstrip()[4:]}\r\n".encode())

        print(f"connect {server}:{port}")
        sock.connect((server, port))

        # errors="replace": undecodable bytes from the server must not abort
        # the delivery.
        with sock.makefile(mode="r", encoding="utf-8", errors="replace") as recv_file:
            if server_password:
                _send(f"PASS {server_password}")
            _send(f"USER {nick} 0 * :{nick}")
            _send(f"NICK {nick}")

            # Wait until registration has finished (user MODE, end of MOTD or
            # missing MOTD), answering PINGs meanwhile.  Iterate over lines --
            # iterating over readline() would walk the characters of a single
            # line and never match.
            for line in recv_file:
                if re.match(r"^:[^ ]* (MODE|221|376|422) ", line):
                    break
                _pong(line)

            if sasl_password:
                # NOTE(review): SASL is requested after registration and
                # without waiting for the server's replies -- confirm the
                # target network accepts this.
                _send("CAP REQ :sasl")
                _send("AUTHENTICATE PLAIN")
                # b64encode() yields a single line; encodebytes() would add
                # newlines that break the AUTHENTICATE command.
                credentials = f"{nick}\0{nick}\0{sasl_password}".encode("utf-8")
                auth = base64.b64encode(credentials)
                _send(f"AUTHENTICATE {auth.decode('ascii')}")
                _send("CAP END")
            _send(f"JOIN :{channel}")

            for message in messages:
                _send(f"PRIVMSG {channel} :{message}")

            _send("INFO")
            for line in recv_file:
                if DEBUG:
                    print(line, end="")
                # Assume INFO reply means we are done
                if "End of /INFO" in line:
                    break
                _pong(line)

            # QUIT needs the CRLF terminator like every other command.
            sock.sendall(b"QUIT\r\n")
        print("disconnect")
    finally:
        # Release the socket even when connecting or sending failed.
        sock.close()
|
||||||
|
|
||||||
|
|
||||||
|
def irc_send(
    url: str, notifications: List[str], password: Optional[str] = None
) -> None:
    """Send ``notifications`` to the IRC target described by ``url``.

    The URL supplies nick (user name), server (host name), port, channel
    (fragment) and optionally a password; missing parts fall back to
    built-in defaults.  An explicitly passed, non-empty ``password`` takes
    precedence over the one embedded in the URL.  TLS is used only for the
    ``irc+tls`` scheme.  Nothing is sent when ``notifications`` is empty.
    """
    parts = urlparse(f"{url}")
    nick_name = parts.username or "prometheus"
    host = parts.hostname or "chat.freenode.net"
    room = "#krebs-announce" if parts.fragment == "" else f"#{parts.fragment}"
    irc_port = parts.port or 6697
    secret = password if password else parts.password

    if len(notifications) < 1:
        return

    use_tls = parts.scheme == "irc+tls"
    _irc_send(
        server=host,
        nick=nick_name,
        sasl_password=secret,
        channel=room,
        port=irc_port,
        messages=notifications,
        tls=use_tls,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class PrometheusWebHook(BaseHTTPRequestHandler):
    """HTTP handler that relays alert webhook payloads to IRC.

    The handler is driven directly from an accepted connection instead of a
    ``socketserver`` instance: constructing it processes the request(s) on
    ``conn`` and then closes the connection.
    """

    def __init__(
        self,
        irc_url: str,
        conn: socket.socket,
        addr: Tuple[str, int],
        password: Optional[str] = None,
    ) -> None:
        """Serve the connection ``conn`` coming from ``addr``.

        Args:
            irc_url: IRC target URL handed to ``irc_send``.
            conn: Accepted client connection.
            addr: Peer address of ``conn``.
            password: Optional IRC password handed to ``irc_send``.
        """
        self.irc_url = irc_url
        self.password = password
        self.rfile = conn.makefile("rb")
        self.wfile = conn.makefile("wb")
        self.client_address = addr
        try:
            self.handle()
        finally:
            # Do not rely on garbage collection to release the connection.
            for stream in (self.wfile, self.rfile, conn):
                try:
                    stream.close()
                except OSError:
                    # The peer may already be gone; nothing left to clean up.
                    pass

    def _reject(self, reason: str) -> None:
        """Answer with ``400 Bad Request``; ``reason`` is only logged."""
        if DEBUG:
            print(f"POST: {reason}")
        self.send_response(400)
        self.end_headers()

    # for testing
    def do_GET(self) -> None:
        """Reply ``200 ok``; also used as the success reply of ``do_POST``."""
        if DEBUG:
            print("GET: Request Received")
        self.send_response(200)
        self.send_header("Content-type", "text/plain")
        self.end_headers()
        self.wfile.write(b"ok")

    def do_POST(self) -> None:
        """Forward the alerts of a JSON webhook payload to IRC.

        Each entry of ``payload["alerts"]`` becomes one IRC message of the
        form ``<status>: <annotations.description>``.  Requests that are not
        JSON, or whose body is missing, malformed or lacks these fields, are
        answered with ``400`` instead of crashing the handler.
        """
        if DEBUG:
            print("POST: Request Received")
        # Default to "" so that a missing header is rejected, not a TypeError.
        content_type, _ = cgi.parse_header(self.headers.get("content-type", ""))

        # refuse to receive non-json content
        if content_type != "application/json":
            self._reject(f"wrong content type {content_type}")
            return

        try:
            length = int(self.headers.get("content-length", ""))
            if length < 0:
                # read(-1) would block until the client closes the connection.
                raise ValueError("negative content-length")
            payload = json.loads(self.rfile.read(length))
            messages = [
                f"{alert['status']}: {alert['annotations']['description']}"
                for alert in payload["alerts"]
            ]
        except (KeyError, TypeError, ValueError) as err:
            self._reject(f"invalid request body: {err!r}")
            return

        irc_send(self.irc_url, messages, password=self.password)

        self.do_GET()
|
||||||
|
|
||||||
|
|
||||||
|
def systemd_socket_response() -> None:
    """Deliver messages from the command line or from socket activation.

    Configuration comes from the environment:

    * ``IRC_URL`` (required): IRC target URL.
    * ``IRC_PASSWORD_FILE`` (optional): file containing the IRC password.
    * ``LISTEN_FDS``: number of listening sockets passed in, starting at
      file descriptor 3.

    If command line arguments are present they are sent as messages and the
    function returns.  Otherwise every pending connection on the passed-in
    sockets is handled by ``PrometheusWebHook``.

    Exits with status 1 when ``IRC_URL`` is unset, or when ``LISTEN_FDS`` is
    unset and there are no command line messages.
    """
    irc_url = os.environ.get("IRC_URL")
    if irc_url is None:
        print(
            "IRC_URL environment variable not set: i.e. IRC_URL=irc+tls://mic92-prometheus@chat.freenode.net/#krebs-announce",
            file=sys.stderr,
        )
        sys.exit(1)

    password = None
    irc_password_file = os.environ.get("IRC_PASSWORD_FILE")
    if irc_password_file:
        with open(irc_password_file) as f:
            # Drop the line terminator most editors and tools append; it is
            # not part of the password and would corrupt the credentials.
            password = f.read().rstrip("\r\n")

    msgs = sys.argv[1:]
    if msgs:
        irc_send(irc_url, msgs, password=password)
        return

    nfds = os.environ.get("LISTEN_FDS")
    if nfds is None:
        print(
            "LISTEN_FDS not set. Run me with systemd(TM) socket activation?",
            file=sys.stderr,
        )
        sys.exit(1)

    for fd in range(3, 3 + int(nfds)):
        # fromfd() duplicates the descriptor, so closing `sock` below leaves
        # the descriptor passed in by the service manager untouched.
        sock = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
        try:
            # Non-blocking accept: drain what is queued, then stop.
            sock.settimeout(0)
            while True:
                PrometheusWebHook(irc_url, *sock.accept(), password=password)
        except BlockingIOError:
            # no more connections
            pass
        finally:
            sock.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Exactly two arguments mean "<irc-url> <message>": send that single
    # message directly.  Any other invocation is handled by
    # systemd_socket_response().
    if DEBUG:
        print("Starting in DEBUG mode")
    cli_args = sys.argv[1:]
    if len(sys.argv) != 3:
        systemd_socket_response()
    else:
        target_url, text = cli_args
        print(f"{target_url} {text}")
        irc_send(target_url, [text])
|
59
krebs/2configs/shack/prometheus/irc-hooks.nix
Normal file
59
krebs/2configs/shack/prometheus/irc-hooks.nix
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
# Socket-activated HTTP hooks that forward alert notifications to IRC.
# For every entry in `endpoints` a systemd socket/service pair named
# `irc-alerts-<name>` is generated.
{ config
, lib
, pkgs
, ...
}:
let
  # irc-alerts.py packaged as an executable; flake8's line-length check
  # (E501) is disabled for it.
  irc-alerts = pkgs.writers.writePython3 "irc-alerts" {
    flakeIgnore = [ "E501" ];
  } (builtins.readFile ./irc-alerts.py);

  # name -> { url, port, passwordFile (optional) }
  endpoints = {
    binaergewitter = {
      url = "irc+tls://puyak-alerts@irc.libera.chat:6697/#binaergewitter-alerts";
      port = 9223;
    };
  };
in
{
  systemd.sockets =
    lib.mapAttrs'
      (name: opts:
        lib.nameValuePair "irc-alerts-${name}" {
          description = "Receive http hook and send irc message for ${name}";
          wantedBy = [ "sockets.target" ];
          listenStreams = [ "[::]:${builtins.toString opts.port}" ];
        }) endpoints;

  systemd.services =
    lib.mapAttrs'
      (name: opts:
        let
          serviceName = "irc-alerts-${name}";
          # Only endpoints that declare a passwordFile get the password
          # plumbing below.
          hasPassword = (opts.passwordFile or null) != null;
        in
        lib.nameValuePair serviceName {
          description = "Receive http hook and send irc message for ${name}";
          requires = [ "${serviceName}.socket" ];
          serviceConfig =
            {
              Environment =
                [
                  "IRC_URL=${opts.url}"
                  # Makes irc-alerts.py print every IRC command it sends.
                  "DEBUG=y"
                ]
                ++ lib.optional hasPassword "IRC_PASSWORD_FILE=/run/${serviceName}/password";
              DynamicUser = true;
              User = serviceName;
              ExecStart = irc-alerts;
            }
            // lib.optionalAttrs hasPassword {
              PermissionsStartOnly = true;
              # Install the endpoint's own password file (the one that
              # enabled hasPassword) instead of a hard-coded secret.
              # toString keeps a path-typed value from being copied into
              # the world-readable Nix store.
              ExecStartPre =
                "${pkgs.coreutils}/bin/install -m400 "
                + "-o ${serviceName} -g ${serviceName} "
                + "${toString opts.passwordFile} "
                + "/run/${serviceName}/password";
              RuntimeDirectory = serviceName;
            };
        }) endpoints;
}
|
@ -3,6 +3,7 @@
|
|||||||
{
|
{
|
||||||
imports = [
|
imports = [
|
||||||
./alert-rules.nix
|
./alert-rules.nix
|
||||||
|
./irc-hooks.nix
|
||||||
];
|
];
|
||||||
networking = {
|
networking = {
|
||||||
firewall.allowedTCPPorts = [
|
firewall.allowedTCPPorts = [
|
||||||
@ -129,11 +130,11 @@
|
|||||||
"group_wait" = "30s";
|
"group_wait" = "30s";
|
||||||
"group_interval" = "2m";
|
"group_interval" = "2m";
|
||||||
"repeat_interval" = "4h";
|
"repeat_interval" = "4h";
|
||||||
"receiver" = "team-admins";
|
"receiver" = "shack-admins";
|
||||||
};
|
};
|
||||||
"receivers" = [
|
"receivers" = [
|
||||||
{
|
{
|
||||||
"name" = "team-admins";
|
"name" = "shack-admins";
|
||||||
"email_configs" = [ ];
|
"email_configs" = [ ];
|
||||||
"webhook_configs" = [
|
"webhook_configs" = [
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user