add better logs; one push per run
This commit is contained in:
parent
3f4faf42e3
commit
2a924f8266
2 changed files with 121 additions and 51 deletions
170
monitor.py
170
monitor.py
|
@ -1,68 +1,124 @@
|
||||||
#!/bin/python3
|
#!/bin/python3
|
||||||
# HTTP, DNS, and IP monitoring script
|
# HTTP, DNS, and IP monitoring script
|
||||||
|
from collections import namedtuple
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
import datetime
|
import datetime
|
||||||
import socket
|
import socket
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from typing import Callable
|
from typing import Callable, List
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import pydig
|
import pydig
|
||||||
import git
|
import git
|
||||||
import pytz
|
import pytz
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
REPO_ROOT = os.getenv("STATUS_REPO", "status-repo")
|
||||||
logger.addHandler(logging.StreamHandler())
|
|
||||||
logger.setLevel(logging.DEBUG)
|
|
||||||
|
|
||||||
REPO_ROOT = "status-repo"
|
|
||||||
|
class Formatter(logging.Formatter):
    """Compact, optionally coloured formatter for console output.

    Each line is rendered as ``[dd/mm HH:MM:SS] LVL message``. The timestamp
    is printed only when more than ``exclude_time_for`` seconds have passed
    since the last printed timestamp; otherwise it is replaced by padding of
    the same width so that consecutive messages stay aligned.
    """

    COLOR_RST = "\033[0m"
    COLORS = {
        "reset": "\033[0m",
        "cyan": "\033[36m",
        "red": "\033[31m",
        "boldred": "\033[1;31m",
        "green": "\033[32m",
        "blue": "\033[34m",
        "yellow": "\033[33m",
    }
    # logging.WARN is an alias of logging.WARNING, so one entry covers both.
    LOGGING_COLORS = {
        logging.DEBUG: "blue",
        logging.INFO: "green",
        logging.WARNING: "yellow",
        logging.ERROR: "red",
        logging.CRITICAL: "boldred",
    }

    def __init__(self, exclude_time_for: int = 1, disable_colors: bool = False) -> None:
        """
        Fancy formatter

        Args:
            exclude_time_for (int): number of seconds that must have passed
                for another timestamp to be shown
            disable_colors (bool): when true, no ANSI escape sequences are
                written at all
        """
        super().__init__()
        self.last_timestamp = 0
        self.exclude_time_for = exclude_time_for
        self.disable_colors = disable_colors

    def c(self, color: str) -> str:
        """Return the escape sequence for ``color``, or "" if colours are off."""
        if self.disable_colors:
            return ""
        return self.COLORS[color]

    def format(self, record: logging.LogRecord) -> str:
        """Render ``record`` as a single (optionally coloured) log line."""
        # Go through c() so that disable_colors also suppresses the reset code.
        reset = self.c("reset")
        parts = []
        if self.last_timestamp + self.exclude_time_for < record.created:
            stamp = datetime.datetime.fromtimestamp(record.created)
            parts.append(
                self.c("cyan") + stamp.strftime("[%d/%m %H:%M:%S]") + reset + " "
            )
            self.last_timestamp = record.created
        else:
            # same width as "[dd/mm HH:MM:SS] "
            parts.append(" " * 17)
        parts.append(self.c(self.LOGGING_COLORS.get(record.levelno, "reset")))
        parts.append(f"{record.levelname.upper()[:3]}{reset} ")
        # getMessage() only applies %-formatting when args are present, so
        # pre-formatted messages containing a literal "%" do not raise.
        parts.append(record.getMessage())
        output = "".join(parts)
        # Keep tracebacks / stack info instead of silently dropping them.
        if record.exc_info:
            output += "\n" + self.formatException(record.exc_info)
        if record.stack_info:
            output += "\n" + self.formatStack(record.stack_info)
        return output
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# last states of services to keep from detecting downtime repeatedly
|
# last states of services to keep from detecting downtime repeatedly
|
||||||
last_states: dict[str, bool] = {}
|
last_states: dict[str, bool] = {}
|
||||||
|
|
||||||
RequirementCheck = Callable[..., bool]
|
RequirementCheck = Callable[..., bool]
|
||||||
MonitorDict = dict[str, dict[RequirementCheck, dict]]
|
MonitorDict = dict[str, dict[RequirementCheck, dict]]
|
||||||
|
Fail = namedtuple("Fail", ("service_name", "failed_requirements"))
|
||||||
|
|
||||||
|
|
||||||
# publish a failed service, no dependents so edit at will
|
# publish a failed service, no dependents so edit at will
|
||||||
def fail(failed: List[Fail]):
    """Publish newly failed services to the status repository in one push.

    For every entry whose service was not already recorded as down in
    ``last_states``, a markdown incident file is written under
    ``src/content/`` of the repository at ``REPO_ROOT``, added and committed.
    All commits are then pushed to ``origin`` together.

    Args:
        failed: ``Fail`` tuples of (service_name, failed_requirements) for
            the services that failed in the current run.
    """
    # Services already seen down have been published before; skip them up
    # front so no pull/push happens when there is nothing new to report.
    new_failures = [
        entry for entry in failed if last_states.get(entry.service_name, True)
    ]
    if not new_failures:
        return

    repo = git.Repo(REPO_ROOT)  # type: ignore
    origin = repo.remote("origin")
    try:
        origin.pull(kill_after_timeout=10)
    except git.CommandError:
        logger.error("failed to pull from origin")
        return

    for service_name, failed_requirements in new_failures:
        now = datetime.datetime.now(tz=pytz.timezone("Europe/Prague"))
        filename = f"src/content/{now.strftime('%Y-%m-%d-%f')}-downtime.md"
        with open(REPO_ROOT + "/" + filename, "w", encoding="utf-8") as f:
            f.writelines(
                [
                    "---\n",
                    f"title: {service_name} downtime\n",
                    f"date: {now.strftime('%Y-%m-%d %H:%M:%S %z')}\n",
                    "severity: down\n",
                    "affected:\n",
                    f" - {service_name}\n",
                    "---\n",
                    f"Automatic checks for {service_name} have failed. "
                    f"Requirements {[r.__name__ for r in failed_requirements]} failed.\n",
                ]
            )
        repo.git.add(filename)
        repo.git.commit("-m", f"{service_name} downtime")

    try:
        origin.push(kill_after_timeout=10)
    except git.CommandError:
        logger.error("failed to push to origin, resetting working tree")
        # repo.git.reset(..., working_tree=True) would be passed to git as the
        # non-existent "--working-tree" flag; HEAD.reset performs the intended
        # hard reset of index and working tree.
        repo.head.reset("origin/HEAD", index=True, working_tree=True)
        return

    # Only reached when the push went through.
    logger.info("failed services published")
|
|
||||||
|
|
||||||
|
|
||||||
def self_check() -> bool:
|
def self_check() -> bool:
|
||||||
|
@ -80,14 +136,15 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen
|
||||||
def inner_retry(func: RequirementCheck) -> RequirementCheck:
|
def inner_retry(func: RequirementCheck) -> RequirementCheck:
|
||||||
def inner(*args, **kwargs) -> bool:
|
def inner(*args, **kwargs) -> bool:
|
||||||
passed = False
|
passed = False
|
||||||
for i in range(n - 1):
|
for _ in range(n - 1):
|
||||||
passed = func(*args, **kwargs)
|
passed = func(*args, **kwargs)
|
||||||
if passed:
|
if passed:
|
||||||
break
|
break
|
||||||
time.sleep(sleep)
|
time.sleep(sleep)
|
||||||
return passed
|
return passed
|
||||||
|
|
||||||
inner.__name__ = func.__name__ # preserve names in log (instead of each requirement being called "inner")
|
# preserve names in log (instead of each requirement being called "inner")
|
||||||
|
inner.__name__ = func.__name__
|
||||||
return inner
|
return inner
|
||||||
|
|
||||||
return inner_retry
|
return inner_retry
|
||||||
|
@ -96,7 +153,7 @@ def retry(n: int = 3, sleep: int = 5) -> Callable[[RequirementCheck], Requiremen
|
||||||
@retry()
|
@retry()
|
||||||
def http_requirement(url: str, code: int) -> bool:
|
def http_requirement(url: str, code: int) -> bool:
|
||||||
try:
|
try:
|
||||||
resp = requests.head(url)
|
resp = requests.head(url, headers={"User-agent": "monitoring (v1)"})
|
||||||
except requests.exceptions.ConnectionError:
|
except requests.exceptions.ConnectionError:
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
|
@ -108,7 +165,7 @@ def dns_requirement(name: str, ip: str) -> bool:
|
||||||
query = pydig.query(name, "A")
|
query = pydig.query(name, "A")
|
||||||
except ConnectionError:
|
except ConnectionError:
|
||||||
return False
|
return False
|
||||||
return query and (ip == "*" or ip in query)
|
return query is not None and (ip == "*" or ip in query)
|
||||||
|
|
||||||
|
|
||||||
@retry()
|
@retry()
|
||||||
|
@ -124,22 +181,33 @@ def ip_requirement(ip: str, port: int, prot: str) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def check(monitors: MonitorDict):
    """Run every requirement of every monitored service and record the result.

    Services with at least one failing requirement are collected and handed
    to ``fail`` in a single call. Afterwards ``last_states`` is refreshed:
    every monitored service is marked up, then the failed ones are marked
    down. If a requirement fails and ``self_check`` fails as well, the run is
    aborted without publishing anything or touching ``last_states``.
    """
    down: List[Fail] = []

    for name, reqs in monitors.items():
        logger.info(f"checking service {name}")
        broken = []
        for req, params in reqs.items():
            if not req(**params):
                # A failing self-check means our own connectivity is suspect,
                # so none of the results of this run can be trusted.
                if not self_check():
                    logger.error(
                        "self-check failed, assuming bad connection and aborting"
                    )
                    return
                logger.warning(f" {req.__name__}({params})")
                broken.append(req)
            time.sleep(1)
        if broken:
            down.append(Fail(name, broken))

    if down:
        fail(down)

    # update last_states: everything is up unless it failed in this run
    last_states.update(dict.fromkeys(monitors, True))
    for entry in down:
        last_states[entry.service_name] = False

    logger.debug("check complete")
|
||||||
|
|
||||||
|
|
||||||
|
@ -155,19 +223,21 @@ monitors_: MonitorDict = {
|
||||||
"git.bain.cz": {
|
"git.bain.cz": {
|
||||||
http_requirement: {"url": "https://git.bain.cz/", "code": 200},
|
http_requirement: {"url": "https://git.bain.cz/", "code": 200},
|
||||||
},
|
},
|
||||||
"ts3.bain.cz": {
|
"ts3.bain.cz": {ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}},
|
||||||
ip_requirement: {"ip": "ts3.bain.cz", "port": 9987, "prot": "udp"}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Send all log records through the custom Formatter on a stream handler.
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(Formatter())
    logging.basicConfig(handlers=[stream_handler], level=logging.INFO)

    # we assume this is going to be run in a cron job as the gitpython
    # library is slowly leaking memory apparently; the service states are
    # therefore carried between runs through the "last-state" JSON file
    if os.path.exists("last-state"):
        with open("last-state", "r") as state_file:
            last_states = json.load(state_file)

    check(monitors_)

    with open("last-state", "w+") as state_file:
        json.dump(last_states, state_file)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
requests
|
requests
|
||||||
pydig
|
pydig
|
||||||
gitpython
|
gitpython
|
||||||
pytz
|
pytz
|
||||||
|
|
Loading…
Reference in a new issue