add better logs; one push per run

2022-08-20 16:28:35 +02:00 · 2022-08-20 16:28:35 +02:00 · 2a924f8266
commit 2a924f8266
parent 3f4faf42e3
2 changed files with 121 additions and 51 deletions
--- a/monitor.py
+++ b/monitor.py
@ -1,68 +1,124 @@
 #!/bin/python3
 # HTTP, DNS, and IP monitoring script
 from collections import namedtuple
 import time
 import logging
 import datetime
 import socket
 import json
 import os
-from typing import Callable
+from typing import Callable, List
 import requests
 import pydig
 import git
 import pytz
-logger = logging.getLogger(__name__)
+REPO_ROOT = os.getenv("STATUS_REPO", "status-repo")
 logger.addHandler(logging.StreamHandler())
 logger.setLevel(logging.DEBUG)
-REPO_ROOT = "status-repo"
+
 class Formatter(logging.Formatter):
    """Compact log formatter with optional ANSI colors.

    Each line has the shape ``[dd/mm HH:MM:SS] LVL message``. The timestamp
    is printed only when more than ``exclude_time_for`` seconds have passed
    since the last printed timestamp; otherwise it is replaced by padding of
    the same width so messages stay aligned.
    """

    # kept for backward compatibility; format() goes through c("reset") so
    # that disabling colors really removes every escape sequence
    COLOR_RST = "\033[0m"
    COLORS = {
        "reset": "\033[0m",
        "cyan": "\033[36m",
        "red": "\033[31m",
        "boldred": "\033[1;31m",
        "green": "\033[32m",
        "blue": "\033[34m",
        "yellow": "\033[33m",
    }
    # logging.WARN is an alias of logging.WARNING, so a single entry suffices
    LOGGING_COLORS = {
        logging.DEBUG: "blue",
        logging.INFO: "green",
        logging.WARNING: "yellow",
        logging.ERROR: "red",
        logging.CRITICAL: "boldred",
    }
    # width of "[dd/mm HH:MM:SS] ", used as padding when the stamp is skipped
    TIMESTAMP_WIDTH = 17

    def __init__(self, exclude_time_for: int = 1, disable_colors: bool = False) -> None:
        """
        Fancy formatter
        Args:
            exclude_time_for (int): number of seconds that must have passed
                for another timestamp to be shown
            disable_colors (bool): when true, no ANSI escape sequences are
                written at all
        """
        super().__init__()
        self.last_timestamp = 0
        self.exclude_time_for = exclude_time_for
        self.disable_colors = disable_colors

    def c(self, color: str) -> str:
        """Return the escape sequence for ``color``, or "" if colors are off."""
        if self.disable_colors:
            return ""
        return self.COLORS[color]

    def format(self, record: logging.LogRecord) -> str:
        """Render ``record`` as ``[timestamp] LVL message``."""
        reset = self.c("reset")
        if self.last_timestamp + self.exclude_time_for < record.created:
            dt = datetime.datetime.fromtimestamp(record.created)
            prefix = self.c("cyan") + dt.strftime("[%d/%m %H:%M:%S]") + reset + " "
            self.last_timestamp = record.created
        else:
            prefix = " " * self.TIMESTAMP_WIDTH
        level_color = self.c(self.LOGGING_COLORS.get(record.levelno, "reset"))
        level = record.levelname.upper()[:3]
        # getMessage() only applies %-formatting when args were supplied, so
        # messages containing a literal "%" (common with f-strings) are safe,
        # and non-string messages are converted with str() first
        return f"{prefix}{level_color}{level}{reset} {record.getMessage()}"
 logger = logging.getLogger(__name__)
 # last states of services to keep from detecting downtime repeatedly
 last_states: dict[str, bool] = {}
 RequirementCheck = Callable[..., bool]
 MonitorDict = dict[str, dict[RequirementCheck, dict]]
 Fail = namedtuple("Fail", ("service_name", "failed_requirements"))
 # publish the failed services; nothing else depends on this, so edit at will
-def fail(service_name: str, failed_requirements: list):
+def fail(failed: List[Fail]):
-    if not last_states.get(service_name, True):
+    repo = git.Repo(REPO_ROOT)  # type: ignore
-        return
+    origin = repo.remote("origin")
    now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
    filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"
    repo = git.Repo(REPO_ROOT)
    origin = repo.remote('origin')
    try:
        origin.pull(kill_after_timeout=10)
-    except git.exc.CommandError:
+    except git.CommandError:
-        logger.warning("Failed to pull from origin! Aborting!")
+        logger.error("failed to pull from origin")
        return
-    # noinspection PyShadowingNames
+    for service_name, failed_requirements in failed:
-    with open(REPO_ROOT + "/" + filename, 'w+') as f:
+        if not last_states.get(service_name, True):
-        lines = [
+            continue  # we've already seen the service down
-            "---\n",
+        now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
-            f"title: {service_name} downtime\n",
+        filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"
-            f"date: {now.strftime('%Y-%m-%d %H:%M:%S %z')}\n",
+        with open(REPO_ROOT + "/" + filename, "w+") as f:
-            "severity: down\n",
+            lines = [
-            "affected:\n",
+                "---\n",
-            f" - {service_name}\n",
+                f"title: {service_name} downtime\n",
-            "---\n",
+                f"date: {now.strftime('%Y-%m-%d %H:%M:%S %z')}\n",
-            f"Automatic checks for {service_name} have failed. "
+                "severity: down\n",
-            f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n"
+                "affected:\n",
-        ]
+                f" - {service_name}\n",
-        f.writelines(lines)
+                "---\n",
-    repo.git.add(filename)
+                f"Automatic checks for {service_name} have failed. "
-    repo.git.commit('-m', f'{service_name} downtime')
+                f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n",
            ]
            f.writelines(lines)
        repo.git.add(filename)
        repo.git.commit("-m", f"{service_name} downtime")
    try:
        origin.push(kill_after_timeout=10)
-    except git.exc.CommandError:
+    except git.CommandError:
-        logger.warning("Push to origin failed! Aborting and resetting!")
+        logger.error("failed to push to origin, resetting working tree")
        repo.git.reset("origin/HEAD", working_tree=True)
-
+    logger.info("failed services published")
    logger.warning(f"service {service_name} failed {[r.__name__ for r in failed_requirements]}")
 def self_check() -> bool:
@ -80,14 +136,15 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen
    def inner_retry(func: RequirementCheck) -> RequirementCheck:
        def inner(*args, **kwargs) -> bool:
            passed = False
-            for i in range(n - 1):
+            for _ in range(n - 1):
                passed = func(*args, **kwargs)
                if passed:
                    break
                time.sleep(sleep)
            return passed
-        inner.__name__ = func.__name__  # preserve names in log (instead of each requirement being called "inner")
+        # preserve names in log (instead of each requirement being called "inner")
        inner.__name__ = func.__name__
        return inner
    return inner_retry
@ -96,7 +153,7 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen
@retry()
 def http_requirement(url: str, code: int) -> bool:
    try:
-        resp = requests.head(url)
+        resp = requests.head(url, headers={"User-agent": "monitoring (v1)"})
    except requests.exceptions.ConnectionError:
        return False
    else:
@ -108,7 +165,7 @@ def dns_requirement(name: str, ip: str) -> bool:
        query = pydig.query(name, "A")
    except ConnectionError:
        return False
-    return query and (ip == "*" or ip in query)
+    return query is not None and (ip == "*" or ip in query)
@retry()
@ -124,22 +181,33 @@ def ip_requirement(ip: str, port: int, prot: str) -> bool:
 def check(monitors: MonitorDict):
    failed_services: List[Fail] = []
    for service, requirements in monitors.items():
-        logger.debug(f"Checking service {service}")
+        logger.info(f"checking service {service}")
        failed = []
        for requirement, args in requirements.items():
            logger.debug(f"  checking requirement {requirement.__name__}")
            passed = requirement(**args)
            if not passed:
                if not self_check():
-                    logger.warning("Self-check failed, assuming bad connection and aborting")
+                    logger.error(
                        "self-check failed, assuming bad connection and aborting"
                    )
                    return
-                logger.info(f"{service} failed requirement {requirement.__name__}")
+                logger.warning(f"  {requirement.__name__}({args})")
                failed.append(requirement)
            time.sleep(1)
        if failed:
-            fail(service, failed)
+            failed_services.append(Fail(service, failed))
-        last_states[service] = len(failed) == 0
+
    if failed_services:
        fail(failed_services)
    # update last_states
    for service in monitors.keys():
        last_states[service] = True
    for fs in failed_services:
        last_states[fs.service_name] = False
    logger.debug("check complete")
@ -155,19 +223,21 @@ monitors_: MonitorDict = {
    "git.bain.cz": {
        http_requirement: {"url": "https://git.bain.cz/", "code": 200},
    },
-    "ts3.bain.cz": {
+    "ts3.bain.cz": {ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}},
        ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}
    }
 }
-if __name__ == '__main__':
+if __name__ == "__main__":
    handler = logging.StreamHandler()
    handler.setFormatter(Formatter())
    logging.basicConfig(level=logging.INFO, handlers=[handler])
    # this script is meant to be run from a cron job rather than as a
    # long-lived process, because the gitpython library apparently leaks memory
    if os.path.exists("last-state"):
-        with open("last-state", 'r') as f:
+        with open("last-state", "r") as f:
            last_states = json.load(f)
    check(monitors_)
-    with open("last-state", 'w+') as f:
+    with open("last-state", "w+") as f:
        json.dump(last_states, f)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,4 @@
 requests
 pydig
 gitpython
-pytz
+pytz