stockholm/krebs/2configs/shack/prometheus/alert-rules.nix

# Prometheus alerting rules for the shack environment.
#
# This module sets `services.prometheus.rules` to a single JSON-encoded rule
# file containing one group ("shack-env") with three alerts:
#   * RootPartitionFull for wolf.shack
#   * RootPartitionFull for puyak.shack
#   * HostDown          for infra01.shack
#
# `lib` is accepted for module-signature compatibility but is not used here.
{ lib,... }:
let
  disk_free_threshold = "10"; # at least this much free disk percentage

  # PromQL expression that holds while the free space on the root filesystem
  # of the host with the given `alias` label is below `disk_free_threshold`
  # percent.
  rootDiskLow = alias: ''(node_filesystem_avail_bytes{alias="${alias}",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="${alias}",mountpoint="/"} < ${disk_free_threshold}'';

  # Link to the shack system dashboard in Grafana, pre-selecting `host`.
  # Deriving the URL from the host name keeps the link in sync with the host
  # the rule actually watches (previously the puyak and infra01 alerts linked
  # to wolf's node).
  # NOTE(review): assumes every host is scraped as "<host>.shack:9100", as the
  # original wolf URL was — confirm for puyak and infra01.
  dashboardUrl = host: "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=${host}.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=${host}";
in {
  # The rule file is passed as a JSON string; the attribute set below mirrors
  # the structure of a Prometheus rule file (groups -> rules).
  services.prometheus.rules = [(builtins.toJSON
    {
      groups = [
        { name = "shack-env";
          rules = [
            # Root filesystem of wolf.shack below the free-space threshold for 30 minutes.
            {
              alert = "RootPartitionFull";
              for = "30m";
              expr = rootDiskLow "wolf.shack";
              labels.severity = "warning";
              annotations.summary = "{{ $labels.alias }} root disk full";
              annotations.url = dashboardUrl "wolf";
              annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%). CI for deploying new configuration will cease working. Log in to the system and run `nix-collect-garbage -d` and clean up the shack share folder in `/home/share` .If this does not help you can check `du -hs /var/* | sort -h`, run `docker system prune` or if you are really desperate run `du -hs /* | sort -h` and go through the folders recursively until you've found something to delete'';
            }
            # Root filesystem of puyak.shack below the free-space threshold for 30 minutes.
            {
              alert = "RootPartitionFull";
              for = "30m";
              expr = rootDiskLow "puyak.shack";
              labels.severity = "warning";
              annotations.summary = "{{ $labels.alias }} root disk full";
              annotations.url = dashboardUrl "puyak";
              annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%).Prometheus will not be able to create new alerts and CI for deploying new configuration will also cease working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/* | sort -h`, run `docker system prune` or if you are really desperate run `du -hs /* | sort -h` and go through the folders recursively until you've found something to delete'';
            }
            # Host-down paging is only configured for infra01.shack;
            # wolf.shack is not worth supervising for availability anymore.
            {
              alert = "HostDown";
              expr = ''up{alias="infra01.shack"} == 0'';
              for = "5m";
              labels.severity = "page";
              annotations.summary = "Instance {{ $labels.alias }} down for 5 minutes";
              annotations.url = dashboardUrl "infra01";
              annotations.description = ''Host {{ $labels.alias }} went down and has not been reconnected after 5 minutes. This is probably bad news, as the machine runs one of the DNS servers and the power broadcast proxy which is used to be able to turn off the light via puyak as well as the shutdown listener.'';
            }
          ];
        }
      ];
    }
  )];
}
shack/prometheus: strip down number of alerts to 3 2020-09-14 09:11:58 +00:00			`{ lib,... }:`
ma shack/prometheus: import alerting-rules from mayflower see: https://github.com/mayflower/nixexprs/blob/master/modules/monitoring/alert-rules.nix 2019-11-25 07:48:14 +00:00			`let`
shack/prometheus: strip down number of alerts to 3 2020-09-14 09:11:58 +00:00			`disk_free_threshold = "10"; # at least this much free disk percentage`
			`in {`
			`services.prometheus.rules = [(builtins.toJSON`
			`{`
			`groups = [`
			`{ name = "shack-env";`
			`rules = [`
			`{`
			`alert = "RootPartitionFull";`
			`for = "30m";`
shack/prometheus: use correct alias for rules 2020-09-14 09:42:32 +00:00			`expr = ''(node_filesystem_avail_bytes{alias="wolf.shack",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="wolf.shack",mountpoint="/"} < ${disk_free_threshold}'';`
shack/prometheus: strip down number of alerts to 3 2020-09-14 09:11:58 +00:00			`labels.severity = "warning";`
			`annotations.summary = "{{ $labels.alias }} root disk full";`
shack/prometheus: add urls to alerts 2020-09-14 13:45:37 +00:00			`annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=wolf";`
ma anon-sftp: init 2020-12-16 15:10:08 +00:00			annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value \| printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%). CI for deploying new configuration will seize working. Log in to the system and run `nix-collect-garbage -d` and clean up the shack share folder in `/home/share` .If this does not help you can check `du -hs /var/ \| sort -h`, run `docker system prune` or if you are really desperate run `du -hs / \| sort -h` and go through the folders recursively until you've found something to delete'';
shack/prometheus: strip down number of alerts to 3 2020-09-14 09:11:58 +00:00			`}`
			`{`
			`alert = "RootPartitionFull";`
			`for = "30m";`
shack/prometheus: use correct alias for rules 2020-09-14 09:42:32 +00:00			`expr = ''(node_filesystem_avail_bytes{alias="puyak.shack",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="puyak.shack",mountpoint="/"} < ${disk_free_threshold}'';`
shack/prometheus: strip down number of alerts to 3 2020-09-14 09:11:58 +00:00			`labels.severity = "warning";`
			`annotations.summary = "{{ $labels.alias }} root disk full";`
shack/prometheus: add urls to alerts 2020-09-14 13:45:37 +00:00			`annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=puyak";`
shack/prometheus: strip down number of alerts to 3 2020-09-14 09:11:58 +00:00			annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value \| printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%).Prometheus will not be able to create new alerts and CI for deploying new configuration will also seize working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/ \| sort -h`, run `docker system prune` or if you are really desperate run `du -hs / \| sort -h` and go through the folders recursively until you've found something to delete'';
			`}`
ma anon-sftp: init 2020-12-16 15:10:08 +00:00			`# wolf.shack is not worth supervising anymore`
shack/prometheus: strip down number of alerts to 3 2020-09-14 09:11:58 +00:00			`{`
			`alert = "HostDown";`
ma anon-sftp: init 2020-12-16 15:10:08 +00:00			`expr = ''up{alias="infra01.shack"} == 0'';`
shack/prometheus: strip down number of alerts to 3 2020-09-14 09:11:58 +00:00			`for = "5m";`
			`labels.severity = "page";`
			`annotations.summary = "Instance {{ $labels.alias }} down for 5 minutes";`
shack/prometheus: add urls to alerts 2020-09-14 13:45:37 +00:00			`annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=wolf";`
ma anon-sftp: init 2020-12-16 15:10:08 +00:00			`annotations.description = ''Host {{ $labels.alias }} went down and has not been reconnected after 5 minutes. This is probably bad news, as the machine runs one of the DNS servers and the power broadcast proxy which is used to be able to turn off the light via puyak as well as the shutdown listener.'';`
shack/prometheus: strip down number of alerts to 3 2020-09-14 09:11:58 +00:00			`}`
			`];`
			`}`
			`];`
			`}`
			`)];`
ma shack/prometheus: import alerting-rules from mayflower see: https://github.com/mayflower/nixexprs/blob/master/modules/monitoring/alert-rules.nix 2019-11-25 07:48:14 +00:00			`}`