monitor/monitor.py
bain 3f4faf42e3
fix: typing; add: retry decoration function
additionally add retries to ip_requirement
2021-12-28 01:50:13 +01:00

173 lines
5.3 KiB
Python

#!/bin/python3
# HTTP, DNS, and IP monitoring script
import datetime
import functools
import json
import logging
import os
import socket
import time
from typing import Callable

import git
import pydig
import pytz
import requests
# Module-level logger; a stream handler is attached and everything from DEBUG up is emitted.
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)
# Path of the local git checkout that downtime reports are written into and pushed from.
REPO_ROOT = "status-repo"
# last states of services to keep from detecting downtime repeatedly
# (service name -> True when every requirement passed on the last completed check)
last_states: dict[str, bool] = {}
# A requirement check is any callable that returns True when the requirement is met.
RequirementCheck = Callable[..., bool]
# Service name -> {requirement check -> keyword arguments passed to that check}.
MonitorDict = dict[str, dict[RequirementCheck, dict]]
# publish a failed service, no dependents so edit at will
def fail(service_name: str, failed_requirements: list):
    """Publish a downtime report for ``service_name`` to the status repository.

    Nothing happens when ``last_states`` already records the service as down,
    so one outage produces one report. Otherwise ``origin`` is pulled, a
    markdown report with front matter is written under ``src/content/`` inside
    ``REPO_ROOT``, committed and pushed.

    If the pull fails, nothing is written. If the push fails, the local
    checkout is hard-reset to ``origin/HEAD`` so the unpublished commit and
    report file are dropped.

    Args:
        service_name: Name of the failing service, used in the report title,
            the ``affected`` list and the commit message.
        failed_requirements: Requirement check callables that failed; their
            ``__name__`` is listed in the report and in the log.
    """
    # Already known to be down: do not publish the same outage again.
    if not last_states.get(service_name, True):
        return
    failed_names = [r.__name__ for r in failed_requirements]
    now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
    filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"
    repo = git.Repo(REPO_ROOT)
    origin = repo.remote('origin')
    try:
        origin.pull(kill_after_timeout=10)
    except git.exc.CommandError:
        logger.warning("Failed to pull from origin! Aborting!")
        return
    with open(os.path.join(REPO_ROOT, filename), 'w', encoding="utf-8") as report:
        report.writelines([
            "---\n",
            f"title: {service_name} downtime\n",
            f"date: {now.strftime('%Y-%m-%d %H:%M:%S %z')}\n",
            "severity: down\n",
            "affected:\n",
            f" - {service_name}\n",
            "---\n",
            f"Automatic checks for {service_name} have failed. "
            f"Requirements {failed_names} failed.\n",
        ])
    repo.git.add(filename)
    repo.git.commit('-m', f'{service_name} downtime')
    try:
        origin.push(kill_after_timeout=10)
    except git.exc.CommandError:
        logger.warning("Push to origin failed! Aborting and resetting!")
        # The former `working_tree=True` keyword was passed to the git CLI as
        # `--working-tree`, which `git reset` does not accept, so the recovery
        # itself raised. Request the hard reset explicitly instead.
        repo.git.reset("--hard", "origin/HEAD")
    logger.warning(f"service {service_name} failed {failed_names}")
def self_check() -> bool:
    """Return True when this host itself has working internet access.

    Fetches ``https://google.com/`` and expects a final status of 200. Used to
    tell a real service outage apart from a broken connection on the
    monitoring host.

    Returns:
        False when the request cannot connect, times out, or ends with a
        status other than 200; True otherwise.
    """
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get("https://google.com/", timeout=10)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        return False
    return response.status_code == 200
def retry(n: int = 3, sleep: int = 5) -> "Callable[[RequirementCheck], RequirementCheck]":
    """Decorator maker for calling a function multiple times with sleep time between calls.

    The decorated function is called until it returns a truthy value or ``n``
    attempts have been made, whichever comes first.

    Args:
        n: Maximum number of attempts. Values below 1 are treated as 1 so the
            wrapped check always runs at least once.
        sleep: Seconds to wait between two consecutive attempts. There is no
            wait after the final attempt.

    Returns:
        A decorator. The decorated function returns the result of the first
        truthy attempt, or the result of the last attempt when all fail.
    """
    attempts = max(n, 1)

    def inner_retry(func: "RequirementCheck") -> "RequirementCheck":
        # wraps() preserves names in log (instead of each requirement being called "inner")
        @functools.wraps(func)
        def inner(*args, **kwargs) -> bool:
            passed = False
            for attempt in range(attempts):
                if attempt:
                    # Only pause between attempts, never before the first one.
                    time.sleep(sleep)
                passed = func(*args, **kwargs)
                if passed:
                    break
            return passed

        return inner

    return inner_retry
@retry()
def http_requirement(url: str, code: int, timeout: float = 10) -> bool:
    """Check that a HEAD request to ``url`` answers with status ``code``.

    Args:
        url: Address to send the HEAD request to.
        code: Expected HTTP status code.
        timeout: Seconds to wait for the server before the attempt counts as
            failed; without it a stalled server would block the whole run.

    Returns:
        True when the response status equals ``code``; False when it differs
        or the request could not connect or timed out.
    """
    try:
        resp = requests.head(url, timeout=timeout)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        return False
    return resp.status_code == code
def dns_requirement(name: str, ip: str) -> bool:
    """Check that ``name`` has an A record, optionally a specific one.

    Args:
        name: Host name to look up.
        ip: Address that must be among the answers, or ``"*"`` to accept any
            non-empty answer.

    Returns:
        True when the lookup returned at least one answer and ``ip`` matches;
        False otherwise, including when the lookup raises ``ConnectionError``.
    """
    try:
        # presumably a list of address strings - verify against pydig
        answers = pydig.query(name, "A")
    except ConnectionError:
        return False
    # bool() so an empty answer yields False rather than the empty result itself.
    return bool(answers) and (ip == "*" or ip in answers)
@retry()
def ip_requirement(ip: str, port: int, prot: str, timeout: float = 5) -> bool:
    """Check that a socket can be connected to ``ip``:``port``.

    Args:
        ip: Host name or address to connect to.
        port: Port number to connect to.
        prot: ``"tcp"`` for a stream socket; any other value selects a
            datagram (UDP) socket.
        timeout: Seconds to wait for the connection before giving up.

    Returns:
        True when ``connect`` succeeds; False on any socket-level error
        (refused connection, failed name resolution, timeout, ...).
    """
    protocol = socket.SOCK_STREAM if prot == "tcp" else socket.SOCK_DGRAM
    # The context manager closes the socket on the failure path as well.
    with socket.socket(type=protocol) as sock:
        sock.settimeout(timeout)
        try:
            # NOTE(review): on a datagram socket connect() sends no packet, so
            # for UDP this likely only proves the name resolves - confirm.
            sock.connect((ip, port))
        except OSError:
            # OSError also covers socket.gaierror and timeouts, which are not
            # ConnectionError subclasses and used to escape this check.
            return False
    return True
def check(monitors: MonitorDict):
    """Run every requirement of every monitored service and record the result.

    For each service all of its requirement checks are called with their
    configured keyword arguments, with a one second pause after each. When a
    requirement fails, ``self_check`` is consulted first: if that fails too,
    the whole run is aborted immediately, without publishing anything or
    updating the state of the current and remaining services. Services with
    failed requirements are handed to ``fail``, and ``last_states`` is
    updated for every service that was checked completely.
    """
    for service_name, requirement_map in monitors.items():
        logger.debug(f"Checking service {service_name}")
        broken = []
        for probe, probe_kwargs in requirement_map.items():
            logger.debug(f" checking requirement {probe.__name__}")
            if not probe(**probe_kwargs):
                # A failure may be our own connectivity problem; verify first.
                if not self_check():
                    logger.warning("Self-check failed, assuming bad connection and aborting")
                    return
                logger.info(f"{service_name} failed requirement {probe.__name__}")
                broken.append(probe)
            time.sleep(1)
        if broken:
            fail(service_name, broken)
        # Healthy exactly when no requirement ended up in the failure list.
        last_states[service_name] = not broken
    logger.debug("check complete")
# Services to monitor: maps each service name to the requirement checks that
# must pass for it, each paired with the keyword arguments for that check.
monitors_: MonitorDict = {
    "f.bain.cz": {
        http_requirement: {"url": "https://f.bain.cz/status", "code": 200},
        # dns_requirement: {"name": "f.bain.cz", "ip": "*"},
        # ip_requirement: {"ip": "f.bain.cz", "port": 80, "prot": "tcp"}
    },
    "s.bain.cz": {
        http_requirement: {"url": "https://s.bain.cz/", "code": 200},
    },
    "git.bain.cz": {
        http_requirement: {"url": "https://git.bain.cz/", "code": 200},
    },
    "ts3.bain.cz": {
        ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}
    }
}
if __name__ == '__main__':
    # we assume this is going to be run in a cron job as the gitpython
    # library is slowly leaking memory apparently
    #
    # Restore the states saved by the previous run. A missing file (first run)
    # or an unreadable one (e.g. truncated by an interrupted write) must not
    # stop monitoring, so fall back to the empty default in both cases.
    try:
        with open("last-state", 'r') as f:
            loaded_states = json.load(f)
    except FileNotFoundError:
        pass
    except json.JSONDecodeError:
        logger.warning("last-state file is not valid JSON, starting with empty state")
    else:
        if isinstance(loaded_states, dict):
            last_states = loaded_states
        else:
            logger.warning("last-state file does not contain a JSON object, starting with empty state")
    check(monitors_)
    # Persist the states for the next run.
    with open("last-state", 'w+') as f:
        json.dump(last_states, f)