add better logs; one push per run

This commit is contained in:
bain 2022-08-20 16:28:35 +02:00
parent 3f4faf42e3
commit 2a924f8266
No known key found for this signature in database
GPG key ID: A708F07AF3D92C02
2 changed files with 121 additions and 51 deletions

View file

@ -1,47 +1,104 @@
#!/bin/python3
# HTTP, DNS, and IP monitoring script
from collections import namedtuple
import time
import logging
import datetime
import socket
import json
import os
from typing import Callable
from typing import Callable, List
import requests
import pydig
import git
import pytz
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)
REPO_ROOT = os.getenv("STATUS_REPO", "status-repo")
REPO_ROOT = "status-repo"
class Formatter(logging.Formatter):
    """Console log formatter with ANSI colors and de-duplicated timestamps.

    A timestamp is printed only when at least ``exclude_time_for`` seconds
    have passed since the previously printed one; otherwise the timestamp
    slot is padded with spaces so messages stay column-aligned.
    """

    COLOR_RST = "\033[0m"
    COLORS = {
        "reset": "\033[0m",
        "cyan": "\033[36m",
        "red": "\033[31m",
        "boldred": "\033[1;31m",
        "green": "\033[32m",
        "blue": "\033[34m",
        "yellow": "\033[33m",
    }
    # logging.WARN is an alias of logging.WARNING, so a single entry covers both
    LOGGING_COLORS = {
        logging.DEBUG: "blue",
        logging.INFO: "green",
        logging.WARNING: "yellow",
        logging.ERROR: "red",
        logging.CRITICAL: "boldred",
    }

    def __init__(self, exclude_time_for: int = 1, disable_colors: bool = False) -> None:
        """
        Fancy formatter

        Args:
            exclude_time_for (int): number of seconds that must have passed
                for another timestamp to be shown
            disable_colors (bool): when True, emit no ANSI escape codes at all
        """
        super().__init__()
        self.last_timestamp = 0.0
        self.exclude_time_for = exclude_time_for
        self.disable_colors = disable_colors

    def c(self, color: str) -> str:
        """Return the ANSI escape for *color*, or "" when colors are disabled."""
        if self.disable_colors:
            return ""
        return self.COLORS[color]

    def format(self, record: logging.LogRecord) -> str:
        output = ""
        if self.last_timestamp + self.exclude_time_for < record.created:
            dt = datetime.datetime.fromtimestamp(record.created)
            # BUGFIX: use self.c("reset") rather than self.COLOR_RST so no
            # escape codes leak into the output when colors are disabled
            output += (
                self.c("cyan") + dt.strftime("[%d/%m %H:%M:%S]") + self.c("reset") + " "
            )
            self.last_timestamp = record.created
        else:
            output += " " * 17
        output += self.c(self.LOGGING_COLORS.get(record.levelno, "reset"))
        output += f"{record.levelname.upper()[:3]}{self.c('reset')} "
        # BUGFIX: record.getMessage() handles the empty-args case; the previous
        # `record.msg % record.args` raised on messages containing a literal
        # "%" when no formatting args were supplied
        output += record.getMessage()
        return output
logger = logging.getLogger(__name__)
# last states of services to keep from detecting downtime repeatedly
last_states: dict[str, bool] = {}
RequirementCheck = Callable[..., bool]
MonitorDict = dict[str, dict[RequirementCheck, dict]]
Fail = namedtuple("Fail", ("service_name", "failed_requirements"))
# publish a failed service, no dependents so edit at will
def fail(service_name: str, failed_requirements: list):
if not last_states.get(service_name, True):
return
now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"
repo = git.Repo(REPO_ROOT)
origin = repo.remote('origin')
def fail(failed: List[Fail]):
repo = git.Repo(REPO_ROOT) # type: ignore
origin = repo.remote("origin")
try:
origin.pull(kill_after_timeout=10)
except git.exc.CommandError:
logger.warning("Failed to pull from origin! Aborting!")
except git.CommandError:
logger.error("failed to pull from origin")
return
# noinspection PyShadowingNames
with open(REPO_ROOT + "/" + filename, 'w+') as f:
for service_name, failed_requirements in failed:
if not last_states.get(service_name, True):
continue # we've already seen the service down
now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"
with open(REPO_ROOT + "/" + filename, "w+") as f:
lines = [
"---\n",
f"title: {service_name} downtime\n",
@ -51,18 +108,17 @@ def fail(service_name: str, failed_requirements: list):
f" - {service_name}\n",
"---\n",
f"Automatic checks for {service_name} have failed. "
f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n"
f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n",
]
f.writelines(lines)
repo.git.add(filename)
repo.git.commit('-m', f'{service_name} downtime')
repo.git.commit("-m", f"{service_name} downtime")
try:
origin.push(kill_after_timeout=10)
except git.exc.CommandError:
logger.warning("Push to origin failed! Aborting and resetting!")
except git.CommandError:
logger.error("failed to push to origin, resetting working tree")
repo.git.reset("origin/HEAD", working_tree=True)
logger.warning(f"service {service_name} failed {[r.__name__ for r in failed_requirements]}")
logger.info("failed services published")
def self_check() -> bool:
@ -80,14 +136,15 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen
def inner_retry(func: RequirementCheck) -> RequirementCheck:
def inner(*args, **kwargs) -> bool:
passed = False
for i in range(n - 1):
for _ in range(n - 1):
passed = func(*args, **kwargs)
if passed:
break
time.sleep(sleep)
return passed
inner.__name__ = func.__name__ # preserve names in log (instead of each requirement being called "inner")
# preserve names in log (instead of each requirement being called "inner")
inner.__name__ = func.__name__
return inner
return inner_retry
@ -96,7 +153,7 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen
@retry()
def http_requirement(url: str, code: int) -> bool:
try:
resp = requests.head(url)
resp = requests.head(url, headers={"User-agent": "monitoring (v1)"})
except requests.exceptions.ConnectionError:
return False
else:
def dns_requirement(name: str, ip: str) -> bool:
    """Check that *name* has an A record; ``ip == "*"`` accepts any answer.

    Returns False when the DNS query itself cannot be performed.
    """
    try:
        answers = pydig.query(name, "A")
    except ConnectionError:
        return False
    if answers is None:
        return False
    return ip == "*" or ip in answers
@retry()
@ -124,22 +181,33 @@ def ip_requirement(ip: str, port: int, prot: str) -> bool:
def check(monitors: MonitorDict):
    """Run every requirement of every monitored service, then publish failures.

    Collects all failed services first and publishes them in a single
    ``fail()`` call (one push per run). Aborts without publishing when the
    self-check fails, since bad local connectivity would produce false
    positives.
    """
    failed_services: List[Fail] = []
    for service, requirements in monitors.items():
        logger.info(f"checking service {service}")
        failed = []
        for requirement, args in requirements.items():
            logger.debug(f"  checking requirement {requirement.__name__}")
            passed = requirement(**args)
            if not passed:
                if not self_check():
                    logger.error(
                        "self-check failed, assuming bad connection and aborting"
                    )
                    return
                logger.warning(f"  {requirement.__name__}({args})")
                failed.append(requirement)
            time.sleep(1)
        if failed:
            failed_services.append(Fail(service, failed))
    if failed_services:
        fail(failed_services)
    # update last_states
    for service in monitors.keys():
        last_states[service] = True
    for fs in failed_services:
        last_states[fs.service_name] = False
    logger.debug("check complete")
@ -155,19 +223,21 @@ monitors_: MonitorDict = {
"git.bain.cz": {
http_requirement: {"url": "https://git.bain.cz/", "code": 200},
},
"ts3.bain.cz": {
ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}
}
"ts3.bain.cz": {ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}},
}
if __name__ == "__main__":
    handler = logging.StreamHandler()
    handler.setFormatter(Formatter())
    logging.basicConfig(level=logging.INFO, handlers=[handler])
    # we assume this is going to be run in a cron job as the gitpython
    # library is slowly leaking memory apparently
    if os.path.exists("last-state"):
        with open("last-state", "r", encoding="utf-8") as f:
            try:
                last_states = json.load(f)
            except json.JSONDecodeError:
                # BUGFIX: a corrupt state file must not abort monitoring;
                # fall back to the default "everything was up" state
                logger.warning("last-state file is corrupt, ignoring it")
    check(monitors_)
    with open("last-state", "w", encoding="utf-8") as f:
        json.dump(last_states, f)