#!/bin/python3
# HTTP, DNS, and IP monitoring script
import time
import logging
import datetime
import socket

import requests
import pydig
import git
import pytz

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)

REPO_ROOT = "status-repo"

# Seconds to wait on any single HTTP request or socket connect before the
# attempt is counted as failed, so a hung peer cannot stall the whole run.
NETWORK_TIMEOUT = 10

# last states of services to keep from detecting downtime repeatedly
# NOTE(review): this lives only in process memory; when the script is run
# once per cron invocation it always starts empty -- confirm that is intended.
last_states = {}


# publish a failed service, no dependents so edit at will
def fail(service_name: str, failed_requirements: list):
    """Publish a downtime post for *service_name* to the status repository.

    Does nothing if the service was already recorded as down in
    ``last_states``. Otherwise pulls from ``origin``, writes a markdown file
    under ``src/content/``, commits it and pushes. If the pull fails the
    function returns without writing anything; if the push fails the local
    repository is hard-reset to ``origin/HEAD``.

    Args:
        service_name: Name of the failing service (used in the post and the
            commit message).
        failed_requirements: The requirement callables that did not pass;
            their ``__name__`` is listed in the post.
    """
    if not last_states.get(service_name, True):
        return

    failed_names = [r.__name__ for r in failed_requirements]
    now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
    filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"

    repo = git.Repo(REPO_ROOT)
    origin = repo.remote('origin')
    try:
        origin.pull(kill_after_timeout=10)
    except git.exc.CommandError:
        logger.warning("Failed to pull from origin! Aborting!")
        return

    with open(REPO_ROOT + "/" + filename, 'w', encoding="utf-8") as f:
        lines = [
            "---\n",
            f"title: {service_name} downtime\n",
            f"date: {now.strftime('%Y-%m-%d %H:%M:%S %z')}\n",
            "severity: down\n",
            "affected:\n",
            f" - {service_name}\n",
            "---\n",
            f"Automatic checks for {service_name} have failed. "
            f"Requirements {failed_names} failed.\n"
        ]
        f.writelines(lines)

    repo.git.add(filename)
    repo.git.commit('-m', f'{service_name} downtime')
    try:
        origin.push(kill_after_timeout=10)
    except git.exc.CommandError:
        logger.warning("Push to origin failed! Aborting and resetting!")
        # Drop the local commit and the new file. The previous call passed
        # working_tree=True, which GitPython turns into `--working-tree`,
        # a flag `git reset` does not accept.
        repo.git.reset("--hard", "origin/HEAD")

    logger.warning(f"service {service_name} failed {failed_names}")


def self_check():
    """Return True if this host can fetch https://google.com/ with a 200.

    Used to tell "the monitored service is down" apart from "our own
    connection is down".
    """
    try:
        response = requests.get("https://google.com/", timeout=NETWORK_TIMEOUT)
    except requests.RequestException:
        # requests raises its own exception hierarchy (connection errors,
        # timeouts, ...), none of which derive from the builtin
        # ConnectionError.
        return False
    return response.status_code == 200


def http_requirement(url: str, code: int) -> bool:
    """Return True if a GET of *url* answers with status *code*.

    The request is attempted at most twice; the first matching response
    wins. Connection errors and timeouts count as a failed attempt.
    """
    for _ in range(2):
        try:
            resp = requests.get(url, timeout=NETWORK_TIMEOUT)
        except requests.RequestException:
            continue
        if resp.status_code == code:
            return True
    return False


def dns_requirement(name: str, ip: str) -> bool:
    """Return True if *name* has an A record matching *ip*.

    Passing ``"*"`` as *ip* accepts any non-empty answer.
    """
    try:
        query = pydig.query(name, "A")
    except ConnectionError:
        return False
    # bool() keeps the declared return type even when the answer is empty.
    return bool(query) and (ip == "*" or ip in query)


def ip_requirement(ip: str, port: int, prot: str) -> bool:
    """Return True if a socket can be connected to ``(ip, port)``.

    Args:
        ip: Host name or address to connect to.
        port: Destination port.
        prot: ``"tcp"`` for a stream socket; anything else uses a datagram
            socket.

    NOTE(review): connecting a datagram socket sends no packet, so for
    non-TCP checks this presumably only proves the name resolves -- confirm.
    """
    protocol = socket.SOCK_STREAM if prot == "tcp" else socket.SOCK_DGRAM
    try:
        # The context manager closes the socket on both success and failure.
        with socket.socket(type=protocol) as sock:
            sock.settimeout(NETWORK_TIMEOUT)
            sock.connect((ip, port))
    except OSError:
        # Covers ConnectionError as well as name-resolution failures and
        # timeouts, which are OSError subclasses too.
        return False
    return True


def check(monitors: dict):
    """Run every requirement of every service and publish failures.

    Args:
        monitors: Mapping of service name to a mapping of requirement
            callable -> keyword arguments for that callable.

    When a requirement fails, ``self_check`` is consulted first; if our own
    connectivity is broken the whole run is aborted without reporting
    anything. The outcome of each service is stored in ``last_states``.
    """
    for service, requirements in monitors.items():
        logger.debug(f"Checking service {service}")
        failed = []
        for requirement, args in requirements.items():
            logger.debug(f" checking requirement {requirement.__name__}")
            passed = requirement(**args)
            if not passed:
                if not self_check():
                    logger.warning("Self-check failed, assuming bad connection and aborting")
                    return
                logger.info(f"{service} failed requirement {requirement.__name__}")
                failed.append(requirement)
            time.sleep(1)
        if failed:
            fail(service, failed)
        last_states[service] = not failed
    logger.debug("check complete")


monitors = {
    "f.bain.cz": {
        http_requirement: {"url": "https://f.bain.cz/status", "code": 200},
        # dns_requirement: {"name": "f.bain.cz", "ip": "*"},
        # ip_requirement: {"ip": "f.bain.cz", "port": 80, "prot": "tcp"}
    },
    "s.bain.cz": {
        http_requirement: {"url": "https://s.bain.cz/", "code": 200},
    },
    "git.bain.cz": {
        http_requirement: {"url": "https://git.bain.cz/", "code": 200},
    },
    "ts3.bain.cz": {
        ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}
    }
}

if __name__ == '__main__':
    # we assume this is gonna be run in a cron job as the gitpython
    # library is slowly leaking memory apparently
    check(monitors)