first commit

2023-06-27 09:16:37 +02:00
commit 3ec5ce616e
77 changed files with 3044 additions and 0 deletions


@@ -0,0 +1 @@
SUBSYSTEM=="tty", ATTRS{idVendor}=="0403", ATTRS{idProduct}=="6001", SYMLINK+="rfxcom", MODE="0666"


@@ -0,0 +1 @@
SUBSYSTEM=="tty", ATTRS{idVendor}=="0451", ATTRS{idProduct}=="16a8", SYMLINK+="zigbee-serial", MODE="0666"


@@ -0,0 +1,16 @@
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']


@@ -0,0 +1,10 @@
[Unit]
Description=Check for image updates on configured podman containers

[Service]
Type=oneshot
User=root
ExecStart=/root/bin/check_image_updates.sh

[Install]
WantedBy=default.target


@@ -0,0 +1,35 @@
#!/usr/bin/env bash

URL="{{ pillar['podman']['gotify']['url'] }}"
TOKEN="{{ pillar['podman']['gotify']['token'] }}"
TITLE="Updates on $HOSTNAME"
PRIORITY="{{ pillar['podman']['gotify']['priority'] }}"

{% raw -%}
# Names of containers whose image has a newer version available
containers=()

# Pull the container's image and compare the pulled image Id with the Id the
# container is currently running; record the container name if they differ.
function check_update(){
  IFS=',' read -r -a container_info <<< "$(podman container inspect $1 --format '{{ .Name }},{{ .ImageName }},{{ .Image }}')"
  podman pull "${container_info[1]}"
  if [[ "$(podman image inspect "${container_info[1]}" --format "{{.Id}}")" != "${container_info[2]}" ]]; then
    containers[${#containers[@]}]="${container_info[0]}"
  fi
}

IFS=$'\n'
for line in $(podman container ls -q); do
  check_update "$line"
done

if [[ "${#containers[@]}" == "0" ]]; then
  exit
fi

MESSAGE=$(cat << EOM
The following ${#containers[@]} container(s) have updates:
${containers[*]}
EOM
)

curl "$URL/message?token=$TOKEN" -F "title=$TITLE" -F "priority=$PRIORITY" -F "message=$MESSAGE"
echo " "
{% endraw -%}
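
For context, a minimal sketch of the Salt pillar data this notification script expects; the podman:gotify key names (url, token, priority) are taken from the Jinja references above, while the values are illustrative placeholders:

podman:
  gotify:
    url: https://gotify.example.org      # placeholder Gotify base URL
    token: AbCdEfGh12345                 # placeholder application token
    priority: 5                          # message priority passed to Gotify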


@@ -0,0 +1,9 @@
[Unit]
Description=Timer for check_image_updates.service

[Timer]
OnCalendar=Sun, 12:00
Unit=check_image_updates.service

[Install]
WantedBy=timers.target


@@ -0,0 +1,144 @@
#!/usr/bin/env bash

# Pull the image if it is not already present locally.
function pull_image(){
  if ! podman image exists {{ args['image'] }}:{{ args['tag'] }}; then
    podman pull {{ args['image'] }}:{{ args['tag'] }}
  fi
}

# Create the container with all options rendered from the configured data.
function create_container() {
  if ! podman container exists {{ container }}; then
    podman container create \
      --name {{ container }} \
      {%- if args['podman_options'] is defined %}
      {%- for option, value in args['podman_options'].items() %}
      --{{ option }} {{ value }} \
      {%- endfor %}
      {%- endif %}
      {%- if args['volumes'] is defined %}
      {%- for volume, mount in args['volumes'].items() %}
      -v {{ volume }}:{{ mount }} \
      {%- endfor %}
      {%- endif %}
      {%- if args['ports'] is defined %}
      {%- for ports in args['ports'] %}
      -p {{ ports['host'] }}:{{ ports['container'] }}{% if ports['protocol'] is defined %}/{{ ports['protocol'] }}{% endif %} \
      {%- endfor %}
      {%- endif %}
      {%- if args['env'] is defined %}
      {%- for key, value in args['env'].items() %}
      -e {{ key }}={{ value }} \
      {%- endfor %}
      {%- endif %}
      {%- if args['devices'] is defined %}
      {%- for key, value in args['devices'].items() %}
      --device {{ key }}:{{ value }} \
      {%- endfor %}
      {%- endif %}
      {{ args['image'] }}:{{ args['tag'] }}{%- if args['run'] is defined %} \
      {{ args['run'] }}
      {%- endif %}
  fi
}

# Generate a systemd unit for the container and install it under /etc/systemd/system.
function generate_systemd_unit_file() {
  podman generate systemd --name {{ container }} > /etc/systemd/system/{{ container }}.service
}

# Pull the configured image and compare its Id with the image the container currently uses.
function check_update() {
  podman pull {{ args['image'] }}:{{ args['tag'] }}
  if [[ "$(podman image inspect {{ args['image'] }}:{{ args['tag'] }} --format "{% raw %}{{.Id}}{% endraw %}")" == "$(podman inspect {{ container }} --format "{% raw %}{{ .Image }}{% endraw %}")" ]]; then
    echo "No image updates available"
    return 0
  else
    echo "Image update available"
    return 1
  fi
}

# Recreate the container from the latest image and refresh its systemd unit.
function update() {
  systemctl stop {{ container }}
  podman container rm {{ container }}
  create_container
  generate_systemd_unit_file
  systemctl daemon-reload
  systemctl enable --now {{ container }}.service
}

function printHelp(){
  cat << EOF
Usage: ${0##*/} [options...]
  -h, -?, --help          show help and exit
  -p, --pull              pull container image ({{ container }}:{{ args['tag'] }})
  -v, --volumes           create container volumes
  -c, --create            create {{ container }} container
  -s, --start             start and enable {{ container }} container
  -S, --stop              stop {{ container }} container
  -i, --is-running        check whether the container service is running
  -u, --check-update      check if there are image updates available
      --update            perform image update if one exists
  -g, --generate-systemd  generate systemd service unit file
EOF
}

while :; do
  case $1 in
    -h|-\?|--help)
      printHelp
      exit
      ;;
    -p|--pull)
      pull_image
      shift
      ;;
    -v|--volumes)
      create_volumes
      shift
      ;;
    -c|--create)
      create_container
      shift
      ;;
    -s|--start)
      systemctl --user enable --now {{ container }}.service
      shift
      ;;
    -S|--stop)
      systemctl --user stop {{ container }}.service
      shift
      ;;
    -i|--is-running)
      systemctl --user is-active {{ container }}.service
      exit $?
      ;;
    -g|--generate-systemd)
      generate_systemd_unit_file
      shift
      ;;
    -u|--check-update)
      check_update
      shift
      ;;
    --update)
      update
      shift
      ;;
    --) # End of all options
      shift
      break
      ;;
    -?*)
      printf "'%s' is not a valid option\n" "$1" >&2
      exit 1
      ;;
    *) # Break out of case, no more options
      break
  esac
done
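
For reference, a sketch of the per-container data this management template renders from; the args keys (image, tag, podman_options, volumes, ports, env, devices, run) and the container name come from the template above, each key being optional, while the containers:<name> pillar layout and the values are illustrative assumptions:

containers:
  loki:                                   # illustrative container name
    image: docker.io/grafana/loki         # rendered as image:tag
    tag: 2.8.2
    podman_options:
      memory: 512m                        # becomes an extra '--memory 512m' flag
    volumes:
      loki-data: /data                    # named volume -> mount point
    ports:
      - host: 3100
        container: 3100
        protocol: tcp
    env:
      TZ: Europe/Stockholm
    run: -config.file=/etc/loki/local-config.yaml   # optional command appended after the image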


@@ -0,0 +1,3 @@
{% for key, value in env_vars.items() -%}
{{ key }}={{ value }}
{% endfor -%}
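
A small sketch of the mapping this template expects; the variable name env_vars comes from the template, the entries are placeholders. Each pair renders to a KEY=value line suitable for an environment file:

env_vars:
  TZ: Europe/Stockholm
  LOG_LEVEL: info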


@@ -0,0 +1,40 @@
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 5m
  chunk_retain_period: 30s
  wal:
    dir: /data/wal

schema_config:
  configs:
    - from: 2020-05-15
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 168h

storage_config:
  boltdb:
    directory: /data/loki/index
  filesystem:
    directory: /data/loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h


@@ -0,0 +1,12 @@
{%- set user = salt['pillar.get']('podman:user', 'root') %}
{%- set home = salt['user.info'](user).home %}
[Unit]
Description=Dump all mariadb databases

[Service]
Type=oneshot
User={{ user }}
ExecStart={{ home }}/bin/mysql-dump.sh

[Install]
WantedBy=default.target
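
The unit resolves its service user from pillar; a minimal sketch of that key, assuming the same podman pillar tree used by the other templates in this commit (the key name comes from the lookup above, the value is the documented default):

podman:
  user: root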


@@ -0,0 +1,15 @@
#!/bin/bash
umask 0077

BACKUP_DIR={{ pillar.containers.mariadb.backup_dir }}
databases=$(podman exec -it mariadb mysql -B -u root -p{{ pillar.containers.mariadb.env.MYSQL_ROOT_PASSWORD }} -e "SHOW DATABASES;" | tr -d "| " | grep -v Database)

for db in ${databases[@]}; do
  db=${db::-1}   # strip the trailing carriage return added by the pseudo-tty
  if [[ "$db" != "information_schema" ]] && [[ "$db" != "performance_schema" ]] && [[ "$db" != "mysql" ]] && [[ "$db" != _* ]] && [[ "$db" != "sys" ]]; then
    echo "Dumping database: $db"
    podman exec -it mariadb mysqldump -u root -p{{ pillar.containers.mariadb.env.MYSQL_ROOT_PASSWORD }} --databases $db | gzip > ${BACKUP_DIR}/$(date +"%Y-%m-%d_%H-%M-%S")_$db-sql.gz
  fi
done

# Delete dump files older than 3 days
find $BACKUP_DIR/* -type f -name "*-sql.gz" -mtime +3 -exec rm {} \;


@@ -0,0 +1,9 @@
[Unit]
Description=Timer for mysql-dump.service

[Timer]
OnCalendar={{ pillar.containers.mariadb.OnCalendar }}
Unit=mysql-dump.service

[Install]
WantedBy=timers.target
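
A sketch of the mariadb pillar values referenced by the dump script and timer above; the key names (backup_dir, env:MYSQL_ROOT_PASSWORD, OnCalendar) come from the templates, the values are placeholders:

containers:
  mariadb:
    backup_dir: /srv/backup/mariadb       # where the gzipped dumps are written
    OnCalendar: '*-*-* 03:00:00'          # systemd calendar expression for the timer
    env:
      MYSQL_ROOT_PASSWORD: changeme       # placeholder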


@@ -0,0 +1 @@
net.ipv4.ip_unprivileged_port_start=80


@@ -0,0 +1,292 @@
groups:
  - name: node_exporter_alerts
    rules:
      - alert: Node down
        expr: up{job="monitoring-pi"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          title: Node {{ $labels.instance }} is down
          description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down.

      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }}

      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}

      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}

      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}

      - alert: HostUnusualDiskReadRate
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}

      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}

      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: Disk is almost full (< 10% left)\n VALUE = {{ $value }}

      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostDiskWillFillIn24Hours
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
          description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}

      - alert: HostOutOfInodes
        expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}

      - alert: HostInodesWillFillIn24Hours
        expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
          description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}

      - alert: HostUnusualDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}

      - alert: HostUnusualDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}

      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: CPU load is > 80%\n VALUE = {{ $value }}

      - alert: HostCpuStealNoisyNeighbor
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.\n VALUE = {{ $value }}

      # 1000 context switches is an arbitrary number.
      # Alert threshold depends on nature of application.
      # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
      - alert: HostContextSwitching
        expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching (instance {{ $labels.instance }})
          description: Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}

      - alert: HostSwapIsFillingUp
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up (instance {{ $labels.instance }})
          description: Swap is filling up (>80%)\n VALUE = {{ $value }}

      - alert: HostSystemdServiceCrashed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host SystemD service crashed (instance {{ $labels.instance }})
          description: SystemD service crashed\n VALUE = {{ $value }}

      - alert: HostPhysicalComponentTooHot
        expr: node_hwmon_temp_celsius > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host physical component too hot (instance {{ $labels.instance }})
          description: Physical hardware component too hot\n VALUE = {{ $value }}

      - alert: HostNodeOvertemperatureAlarm
        expr: node_hwmon_temp_crit_alarm_celsius == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
          description: Physical node temperature alarm triggered\n VALUE = {{ $value }}

      - alert: HostRaidArrayGotInactive
        expr: node_md_state{state="inactive"} > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host RAID array got inactive (instance {{ $labels.instance }})
          description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}

      - alert: HostRaidDiskFailure
        expr: node_md_disks{state="failed"} > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host RAID disk failure (instance {{ $labels.instance }})
          description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}

      - alert: HostOomKillDetected
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: OOM kill detected\n VALUE = {{ $value }}

      - alert: HostEdacCorrectableErrorsDetected
        expr: increase(node_edac_correctable_errors_total[1m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
          description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}

      - alert: HostEdacUncorrectableErrorsDetected
        expr: node_edac_uncorrectable_errors_total > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
          description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}

      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }})
          description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }}

      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }})
          description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}

      - alert: HostNetworkInterfaceSaturated
        expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.interface }})
          description: The network interface is getting overloaded.\n VALUE = {{ $value }}

      - alert: HostConntrackLimit
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit (instance {{ $labels.instance }})
          description: The number of conntrack entries is approaching the limit\n VALUE = {{ $value }}

      - alert: HostClockSkew
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew (instance {{ $labels.instance }})
          description: Clock skew detected. Clock is out of sync.\n VALUE = {{ $value }}

      - alert: HostClockNotSynchronising
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising (instance {{ $labels.instance }})
          description: Clock not synchronising.\n VALUE = {{ $value }}


@@ -0,0 +1,59 @@
# my global config #
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 10.2.0.22:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alert.node.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: "node"
    static_configs:
      - targets:
          - "poblano.rre.nu:9100"
          - "salt.rre.nu:9100"
          - "pepper.rre.nu:9100"
          - "woody.rre.nu:9100"
          - "serrano.rre.nu:9100"
          - "coronado.rre.nu:9100"

  - job_name: "unpoller"
    static_configs:
      - targets:
          - "unpoller.rre.nu:9130"

  - job_name: "fail2ban"
    static_configs:
      - targets:
          - "poblano.rre.nu:9191"
          - "salt.rre.nu:9191"
          - "pepper.rre.nu:9191"

  - job_name: "nginx"
    static_configs:
      - targets:
          - "10.2.0.22:9193"

  - job_name: "promtail"
    static_configs:
      - targets:
          - "serrano.rre.nu:9080"
          - "coronado.rre.nu:9080"


@@ -0,0 +1,29 @@
server:
  http_listen_port: {{ http_listen_port }}
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: "{{ client_url }}"

scrape_configs:
  - job_name: journal
    journal:
      max_age: 12h
      path: /var/log/journal
      labels:
        job: systemd-journal
    relabel_configs:
      - source_labels: ['__journal__systemd_unit']
        target_label: 'unit'
      - source_labels: ["__journal__hostname"]
        target_label: host
      - source_labels: ["__journal_priority_keyword"]
        target_label: level
      - source_labels: ["__journal_syslog_identifier"]
        target_label: syslog_identifier
      - source_labels: ["__journal_container_name"]
        target_label: container_name
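
The promtail template above takes two variables; a sketch of plausible values, assuming they are handed in as template context or pillar. The names come from the template, the port matches the promtail targets scraped in prometheus.yml above, and the Loki push URL is an illustrative placeholder:

http_listen_port: 9080
client_url: http://loki.example.org:3100/loki/api/v1/push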


@@ -0,0 +1,234 @@
# Unpoller v2 primary configuration file. TOML FORMAT #
###########################################################
[poller]
# Turns on line numbers, microsecond logging, and a per-device log.
# The default is false, but I personally leave this on at home (four devices).
# This may be noisy if you have a lot of devices. It adds one line per device.
debug = false
# Turns off per-interval logs. Only startup and error logs will be emitted.
# Recommend enabling debug with this setting for better error logging.
quiet = false
# Load dynamic plugins. Advanced use; only sample mysql plugin provided by default.
plugins = []
#### OUTPUTS
# If you don't use an output, you can disable it.
[prometheus]
disable = false
# This controls on which ip and port /metrics is exported when mode is "prometheus".
# This has no effect in other modes. Must contain a colon and port.
http_listen = "0.0.0.0:9130"
# Adding an SSL Cert and Cert Key will make Poller listen with SSL/https.
ssl_cert_path = ""
ssl_key_path = ""
# Errors are rare. Setting this to true will report them to Prometheus.
report_errors = false
## Record data for disabled or down (unlinked) switch ports.
dead_ports = false
[influxdb]
disable = true
# InfluxDB does not require auth by default, so the user/password are probably unimportant.
url = "http://127.0.0.1:8086"
user = "unifipoller"
# Password for InfluxDB user (above).
# If the password provided here begins with file:// then the password is read in from
# the file path that follows the file:// prefix. ex: file:///etc/influxdb/passwd.file
pass = "unifipoller"
# Be sure to create this database. See the InfluxDB Wiki page for more info.
db = "unifi"
# If your InfluxDB uses a valid SSL cert, set this to true.
verify_ssl = false
# The UniFi Controller only updates traffic stats about every 30 seconds.
# Setting this to something lower may lead to "zeros" in your data.
# If you're getting zeros now, set this to "1m"
interval = "30s"
## Record data for disabled or down (unlinked) switch ports.
dead_ports = false
# To enable output of UniFi Events to Loki, add a URL; it's disabled otherwise.
# User, pass and tenant_id are optional and most folks won't set them.
# Pick which logs you want per-controller in the [unifi.controller] section.
# This is a new feature. Feedback welcome!
[loki]
disable = false
url = "{{ pillar['containers']['unpoller']['loki_url'] }}"
# The rest of this is advanced & optional. See wiki.
user = ""
pass = ""
verify_ssl = false
tenant_id = ""
interval = "2m"
timeout = "10s"
[datadog]
# How often to poll UniFi and report to Datadog.
interval = "2m"
# To enable this output plugin
enable = false
# Datadog Custom Options
# address to talk to the datadog agent, by default this uses the local statsd UDP interface
# address = "localhost:8125"
# namespace to prepend to all data, default is no additional prefix.
# namespace = ""
# tags to append to all data
# tags = [ "customer:abc_corp" ]
# For more advanced options for very large amount of data collected see the upstream
# github.com/unpoller/unpoller/pkg/datadogunifi repository README.
# Unpoller has an optional web server. To turn it on, set enable to true. If you
# wish to use SSL, provide SSL cert and key paths. This interface is currently
# read-only; it just displays information, like logs, devices and clients.
# Notice: Enabling the web server with many sites will increase memory usage.
# This is a new feature and lacks a UI, enabling only recommended for testing.
[webserver]
enable = false
port = 37288
# The HTML path is different on Windows and BSD/macOS.
html_path = "/usr/lib/unifi-poller/web"
ssl_cert_path = ""
ssl_key_path = ""
# How many events per event group to hold. 200-2000. Use fewer with many sites.
# With 1 site, you'll have a max total of 9 event groups; 1 per plugin, 4 per site.
# Each site adds 1 event group for each of these inputs that is enabled:
# save_ids, save_events, save_anomalies, save_alarms.
max_events = 200
# By default the web interface does not require authentication. You can change
# that by adding a username and password hash (or multiple) below.
# To create a hash, run unifi-poller with the -e CLI argument. See Wiki for more!
[webserver.accounts]
# username = "password-hash"
# captain = "$2a$04$mxw6i0LKH6u46oaLK2cq5eCTAAFkfNiRpzNbz.EyvJZZWNa2FzIlS"
#### INPUTS
[unifi]
# Setting this to true and providing default credentials allows you to skip
# configuring controllers in this config file. Instead you configure them in
# your prometheus.yml config. Prometheus then sends the controller URL to
# Unpoller when it performs the scrape. This is useful if you have many,
# or changing controllers. See wiki for more.
dynamic = false
# The following section contains the default credentials/configuration for any
# dynamic controller (see above section), or the primary controller if you do not
# provide one and dynamic is disabled. In other words, you can just add your
# controller here and delete the following section. The internal defaults are
# shown below. Any missing values will assume these displayed defaults.
[unifi.defaults]
# URL for the UniFi Controller. Do not add any paths after the host:port.
# Do not use port 8443 if you have a UDM; just use "https://ip".
url = "{{ pillar['containers']['unpoller']['unifi_url'] }}"
# Make a read-only user in the UniFi Admin Settings, allow it access to all sites.
user = "{{ pillar['containers']['unpoller']['unifi_user'] }}"
# Password for UniFi controller user (above).
# If the password provided here begins with file:// then the password is read in from
# the file path that follows the file:// prefix. ex: file:///etc/unifi/password.file
# ex: file:///etc/unifi/passwd.file, windows: file://C:\\UserData\\Unifi\\Passwd.txt
pass = "{{ pillar['containers']['unpoller']['unifi_pass'] }}"
# If the controller has more than one site, specify which sites to poll here.
# Set this to ["default"] to poll only the first site on the controller.
# A setting of ["all"] will poll all sites; this works if you only have 1 site too.
sites = ["all"]
# Specify a timeout, leave missing to declare infinite wait. This determines the maximum
# time to wait for a response from the unifi controller on any API request.
# timeout = 60s
# Enable collection of site data. This data powers the Network Sites dashboard.
# It's not valuable to everyone and setting this to false will save resources.
save_sites = true
# Hash, with md5, client names and MAC addresses. This attempts to protect
# personally identifiable information. Most users won't want to enable this.
hash_pii = false
# Enable collection of Intrusion Detection System Data (InfluxDB/Loki only).
# Only useful if IDS or IPS are enabled on one of the sites. This may store
# a lot of information. Only recommended for testing and debugging. There
# may not be any dashboards to display this data. It can be used for annotations.
# Enable this only if using InfluxDB or Loki. This will leak PII data!
save_ids = false
# Enable collection of UniFi Events (InfluxDB/Loki only).
# This may store a lot of information. Only recommended for testing and debugging.
# There are no dashboards to display this data. It can be used for annotations.
# This is a new (June, 2020) feature. Please provide feedback if you try it out!
# Enable this only if using InfluxDB or Loki. This will leak PII data!
save_events = true
# Enable collection of UniFi Alarms (InfluxDB/Loki only).
# There are no dashboards to display this data. It can be used for annotations.
# This is a new (June, 2020) feature. Please provide feedback if you try it out!
# Enable this only if using InfluxDB or Loki. This will leak PII data!
save_alarms = true
# Enable collection of UniFi Anomalies (InfluxDB/Loki only).
# There are no dashboards to display this data. It can be used for annotations.
# This is a new (June, 2020) feature. Please provide feedback if you try it out!
# Enable this only if using InfluxDB or Loki.
save_anomalies = true
# Enable collection of Deep Packet Inspection data. This data breaks down traffic
# types for each client and site, it powers a dedicated DPI dashboard.
# Enabling this adds roughly 150 data points per client. That's 6000 metrics for
# 40 clients. This adds a little bit of poller run time per interval and causes
# more API requests to your controller(s). Don't let these "cons" sway you:
# it's cool data. Please provide feedback on your experience with this feature.
save_dpi = false
## Enabling save_rogue stores even more data in your time series databases.
## This saves neighboring access point metrics in a dedicated table or namespace.
save_rogue = false
# If your UniFi controller has a valid SSL certificate (like lets encrypt),
# you can enable this option to validate it. Otherwise, any SSL certificate is
# valid. If you don't know if you have a valid SSL cert, then you don't have one.
verify_ssl = false
## You may provide a list of SSL cert files (PEM format) that you expect your
## controller to use. As long as one of the certs you provide here shows up in
## the cert trust chain the controller presents it will be accepted and allowed.
## These files may be re-read while poller is running.
## Example: ssl_cert_paths = ["/path/to/cert.pem", "/another/cert.pem"]
ssl_cert_paths = []
# The following is optional and used for configurations with multiple UniFi controllers.
# You may repeat the following [[unifi.controller]] section as many times as needed to
# poll multiple controllers. Uncomment the entire section including [[unifi.controller]].
# Omitted variables will have their values taken from the defaults, above.
#
#[[unifi.controller]]
# url = "https://127.0.0.1:8443"
# user = "unifipoller"
# pass = "unifipoller"
# sites = ["all"]
# save_sites = true
# hash_pii = false
# save_ids = false
# save_events = false
# save_alarms = false
# save_anomalies = false
# save_dpi = false
# save_rogue = false
# verify_ssl = false
# ssl_cert_paths = []
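
Finally, a sketch of the unpoller pillar keys referenced by this template; the key names (loki_url, unifi_url, unifi_user, unifi_pass) come from the Jinja lookups above, the values are placeholders:

containers:
  unpoller:
    loki_url: http://loki.example.org:3100    # Loki endpoint UniFi events are shipped to
    unifi_url: https://unifi.example.org      # UniFi controller URL, no trailing path
    unifi_user: unifipoller                   # read-only controller user
    unifi_pass: changeme                      # placeholder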