add better logs; one push per run

This commit is contained in:
bain 2022-08-20 16:28:35 +02:00
parent 3f4faf42e3
commit 2a924f8266
No known key found for this signature in database
GPG key ID: A708F07AF3D92C02
2 changed files with 121 additions and 51 deletions

View file

@ -1,47 +1,104 @@
#!/bin/python3 #!/bin/python3
# HTTP, DNS, and IP monitoring script # HTTP, DNS, and IP monitoring script
from collections import namedtuple
import time import time
import logging import logging
import datetime import datetime
import socket import socket
import json import json
import os import os
from typing import Callable from typing import Callable, List
import requests import requests
import pydig import pydig
import git import git
import pytz import pytz
logger = logging.getLogger(__name__) REPO_ROOT = os.getenv("STATUS_REPO", "status-repo")
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)
REPO_ROOT = "status-repo"
class Formatter(logging.Formatter):
    """Colored log formatter that suppresses repeated timestamps.

    A timestamp is printed at most once per ``exclude_time_for`` seconds;
    records inside that window are padded with spaces instead so columns
    stay aligned.
    """

    # Kept as a class attribute for backward compatibility with external users.
    COLOR_RST = "\033[0m"
    COLORS = {
        "reset": "\033[0m",
        "cyan": "\033[36m",
        "red": "\033[31m",
        "boldred": "\033[1;31m",
        "green": "\033[32m",
        "blue": "\033[34m",
        "yellow": "\033[33m",
    }
    # logging.WARN is the same numeric level as logging.WARNING, so a single
    # entry covers both aliases.
    LOGGING_COLORS = {
        logging.DEBUG: "blue",
        logging.INFO: "green",
        logging.WARNING: "yellow",
        logging.ERROR: "red",
        logging.CRITICAL: "boldred",
    }

    def __init__(self, exclude_time_for: int = 1, disable_colors: bool = False) -> None:
        """
        Fancy formatter.

        Args:
            exclude_time_for (int): number of seconds that must have passed
                since the last printed timestamp for another one to be shown
            disable_colors (bool): if True, emit no ANSI escape codes at all
        """
        super().__init__()
        self.last_timestamp = 0
        self.exclude_time_for = exclude_time_for
        self.disable_colors = disable_colors

    def c(self, color: str) -> str:
        """Return the ANSI escape for *color*, or "" when colors are disabled."""
        if self.disable_colors is True:
            return ""
        else:
            return self.COLORS[color]

    def format(self, record: logging.LogRecord) -> str:
        output = ""
        if self.last_timestamp + self.exclude_time_for < record.created:
            dt = datetime.datetime.fromtimestamp(record.created)
            output += (
                self.c("cyan") + dt.strftime("[%d/%m %H:%M:%S]") + self.c("reset") + " "
            )
            self.last_timestamp = record.created
        else:
            # 17 spaces == width of "[dd/mm HH:MM:SS] " so messages line up.
            output += " " * 17
        output += self.c(self.LOGGING_COLORS.get(record.levelno, "reset"))
        # Reset via self.c() (not COLOR_RST) so disable_colors really disables
        # every escape code, and use getMessage() so messages without args may
        # safely contain literal "%" characters.
        output += f"{record.levelname.upper()[:3]}{self.c('reset')} "
        output += record.getMessage()
        return output
# Module-level logger; handlers/level are configured in the __main__ block.
logger = logging.getLogger(__name__)

# Most recent pass/fail state per service, persisted between runs so a
# continuing outage is only published once instead of on every run.
last_states: dict[str, bool] = {}

# A requirement is any callable reporting success as a bool.
RequirementCheck = Callable[..., bool]
# Maps service name -> {requirement check -> kwargs for that check}.
MonitorDict = dict[str, dict[RequirementCheck, dict]]

# One failed service: its name plus the requirement callables that failed.
Fail = namedtuple("Fail", ["service_name", "failed_requirements"])
# publish a failed service, no dependents so edit at will # publish a failed service, no dependents so edit at will
def fail(service_name: str, failed_requirements: list): def fail(failed: List[Fail]):
if not last_states.get(service_name, True): repo = git.Repo(REPO_ROOT) # type: ignore
return origin = repo.remote("origin")
now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"
repo = git.Repo(REPO_ROOT)
origin = repo.remote('origin')
try: try:
origin.pull(kill_after_timeout=10) origin.pull(kill_after_timeout=10)
except git.exc.CommandError: except git.CommandError:
logger.warning("Failed to pull from origin! Aborting!") logger.error("failed to pull from origin")
return return
# noinspection PyShadowingNames for service_name, failed_requirements in failed:
with open(REPO_ROOT + "/" + filename, 'w+') as f: if not last_states.get(service_name, True):
continue # we've already seen the service down
now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"
with open(REPO_ROOT + "/" + filename, "w+") as f:
lines = [ lines = [
"---\n", "---\n",
f"title: {service_name} downtime\n", f"title: {service_name} downtime\n",
@ -51,18 +108,17 @@ def fail(service_name: str, failed_requirements: list):
f" - {service_name}\n", f" - {service_name}\n",
"---\n", "---\n",
f"Automatic checks for {service_name} have failed. " f"Automatic checks for {service_name} have failed. "
f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n" f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n",
] ]
f.writelines(lines) f.writelines(lines)
repo.git.add(filename) repo.git.add(filename)
repo.git.commit('-m', f'{service_name} downtime') repo.git.commit("-m", f"{service_name} downtime")
try: try:
origin.push(kill_after_timeout=10) origin.push(kill_after_timeout=10)
except git.exc.CommandError: except git.CommandError:
logger.warning("Push to origin failed! Aborting and resetting!") logger.error("failed to push to origin, resetting working tree")
repo.git.reset("origin/HEAD", working_tree=True) repo.git.reset("origin/HEAD", working_tree=True)
logger.info("failed services published")
logger.warning(f"service {service_name} failed {[r.__name__ for r in failed_requirements]}")
def self_check() -> bool: def self_check() -> bool:
@ -80,14 +136,15 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen
def inner_retry(func: RequirementCheck) -> RequirementCheck: def inner_retry(func: RequirementCheck) -> RequirementCheck:
def inner(*args, **kwargs) -> bool: def inner(*args, **kwargs) -> bool:
passed = False passed = False
for i in range(n - 1): for _ in range(n - 1):
passed = func(*args, **kwargs) passed = func(*args, **kwargs)
if passed: if passed:
break break
time.sleep(sleep) time.sleep(sleep)
return passed return passed
inner.__name__ = func.__name__ # preserve names in log (instead of each requirement being called "inner") # preserve names in log (instead of each requirement being called "inner")
inner.__name__ = func.__name__
return inner return inner
return inner_retry return inner_retry
@ -96,7 +153,7 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen
@retry() @retry()
def http_requirement(url: str, code: int) -> bool: def http_requirement(url: str, code: int) -> bool:
try: try:
resp = requests.head(url) resp = requests.head(url, headers={"User-agent": "monitoring (v1)"})
except requests.exceptions.ConnectionError: except requests.exceptions.ConnectionError:
return False return False
else: else:
@ -108,7 +165,7 @@ def dns_requirement(name: str, ip: str) -> bool:
query = pydig.query(name, "A") query = pydig.query(name, "A")
except ConnectionError: except ConnectionError:
return False return False
return query and (ip == "*" or ip in query) return query is not None and (ip == "*" or ip in query)
@retry() @retry()
@ -124,22 +181,33 @@ def ip_requirement(ip: str, port: int, prot: str) -> bool:
def check(monitors: MonitorDict):
    """Run every requirement of every monitored service.

    Failed services are collected and published in one batch via ``fail()``
    (a single git push per run), then ``last_states`` is refreshed.

    Args:
        monitors: mapping of service name -> {requirement check -> kwargs}.
    """
    failed_services: List[Fail] = []
    for service, requirements in monitors.items():
        logger.info("checking service %s", service)
        failed = []
        for requirement, args in requirements.items():
            logger.debug(" checking requirement %s", requirement.__name__)
            passed = requirement(**args)
            if not passed:
                # Distinguish "the service is down" from "our own connection
                # is down": if the self-check also fails, trust nothing.
                if not self_check():
                    logger.error(
                        "self-check failed, assuming bad connection and aborting"
                    )
                    return
                logger.warning(" %s(%s)", requirement.__name__, args)
                failed.append(requirement)
            time.sleep(1)  # pause briefly between requirement checks
        if failed:
            failed_services.append(Fail(service, failed))

    if failed_services:
        fail(failed_services)

    # update last_states: a service is "up" unless it just failed
    for service in monitors:
        last_states[service] = True
    for fs in failed_services:
        last_states[fs.service_name] = False

    logger.debug("check complete")
@ -155,19 +223,21 @@ monitors_: MonitorDict = {
"git.bain.cz": { "git.bain.cz": {
http_requirement: {"url": "https://git.bain.cz/", "code": 200}, http_requirement: {"url": "https://git.bain.cz/", "code": 200},
}, },
"ts3.bain.cz": { "ts3.bain.cz": {ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}},
ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}
}
} }
if __name__ == "__main__":
    # Log to stderr through the colored formatter.
    handler = logging.StreamHandler()
    handler.setFormatter(Formatter())
    logging.basicConfig(level=logging.INFO, handlers=[handler])

    # we assume this is going to be run in a cron job as the gitpython
    # library is slowly leaking memory apparently; last_states is therefore
    # persisted to disk between runs.
    # Explicit encoding: without it, open() uses a platform-dependent default.
    if os.path.exists("last-state"):
        with open("last-state", "r", encoding="utf-8") as f:
            last_states = json.load(f)
    check(monitors_)
    with open("last-state", "w+", encoding="utf-8") as f:
        json.dump(last_states, f)