#!/usr/bin/python3
import feedparser
import confuse
import redis
import time
import json
import sqlite3
import hashlib

from diff_match_patch import diff_match_patch


class DiffThing(diff_match_patch):
    def diff_html(self, diffs):
        """Like diff_prettyHtml, but without inline style attributes
        (makes it easier to style it with CSS).
        """
        html = []
        for op, data in diffs:
            # Escape HTML special characters and mark newlines visibly.
            text = (
                data.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\n", "&para;<br>")
            )
") ) if op == self.DIFF_INSERT: html.append("%s" % text) elif op == self.DIFF_DELETE: html.append("%s" % text) elif op == self.DIFF_EQUAL: html.append("%s" % text) return "".join(html) # # Idea block: # # Můžeme zkusit ke každému ID článku přidat taky hash obsahu, s tím že v začátku budeme kontrolovat jenom změnu hashe a až pak obsah stringů. # Ale nevím jestli to bude reálně efektivnější # # Teď budeme kontrolovat jen titulky, ale postupně můžeme přidat i description článku CONFIG_FILE = "../data/config.yaml" REDIS_ARTICLE_EXPIRE_SEC = 604800 config = confuse.Configuration("headline", __name__) config.set_file(CONFIG_FILE) dmp = DiffThing() rc = redis.Redis(host="redis", port=6379, db=0) db_con = sqlite3.connect("../data/diffs.db") db = db_con.cursor() db.executescript( """ PRAGMA journal_mode=WAL; CREATE TABLE IF NOT EXISTS diffs ( diff_id INTEGER PRIMARY KEY, article_id TEXT, feed_name TEXT NOT NULL, article_url TEXT NOT NULL, title_orig TEXT NOT NULL, title_new TEXT NOT NULL, diff_html TEXT NOT NULL, diff_time TEXT ); CREATE VIRTUAL TABLE IF NOT EXISTS diffs_fts USING fts5( title_orig, title_new, content="diffs", content_rowid="diff_id", tokenize="trigram case_sensitive 0" ); -- rebuild search index -- useful when creating the table, or when it is externally updated INSERT INTO diffs_fts(diffs_fts) VALUES ('rebuild'); CREATE TRIGGER IF NOT EXISTS diffs_ainsert AFTER INSERT ON diffs BEGIN INSERT INTO diffs_fts (rowid, title_orig, title_new) VALUES (new.diff_id, new.title_orig, new.title_new); END; CREATE TRIGGER IF NOT EXISTS diffs_adelete AFTER DELETE ON diffs BEGIN INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new) VALUES ('delete', old.diff_id, old.title_orig, old.title_new); END; CREATE TRIGGER IF NOT EXISTS diffs_aupdate AFTER UPDATE ON diffs BEGIN INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new) VALUES ('delete', old.diff_id, old.title_orig, old.title_new); INSERT INTO diffs_fts (rowid, title_orig, title_new) VALUES (new.diff_id, new.title_orig, new.title_new); END; """ ) article_count = 0 def write_article(article, rc): rval = json.dumps(article["content"]) rc.set(article["rss_id"], rval, ex=REDIS_ARTICLE_EXPIRE_SEC) def process_diff(old, new, rss_id): diff = dmp.diff_main(old["title"], new["title"]) dmp.diff_cleanupSemantic(diff) html_diff = dmp.diff_html(diff) # print(old['link']) # print(diff) sql = "INSERT INTO diffs(article_id, feed_name, article_url, title_orig, title_new, diff_html, diff_time) VALUES (?,?,?,?,?,?,datetime('now', 'localtime'))" sql_data = ( new["article_id"], old["medium"], old["link"], old["title"], new["title"], html_diff, ) db.execute(sql, sql_data) db_con.commit() return True def process_item(article, rc): if rc.exists(article["rss_id"]): old = json.loads(rc.get(article["rss_id"])) new = article["content"] if old["title"] != new["title"]: # print('Article changed. World is fucked.') diff = process_diff(old, new, article["rss_id"]) write_article(article, rc) return True else: # Article is the same. All good! 


def create_article_id(uid, feed):
    # Create a unique ID from the RSS unique tag and the feed name to
    # reference the article in the database.
    id_string = str(uid) + str(feed)
    id_bytes = id_string.encode("utf-8")
    article_id = hashlib.sha256(id_bytes).hexdigest()
    return article_id


for feed in config["feeds"]:
    try:
        rss_source = str(feed["rss_source"])
        unique_tag = str(feed["unique_tag"])
        name = str(feed["name"])
        rss = feedparser.parse(rss_source)
        for item in rss["entries"]:
            try:
                rss_id = item[unique_tag]
                title = item["title"]
                article_id = create_article_id(rss_id, name)
                # description = item['description']
                # Don't store the description for now: we don't need it and it's big.
                published = time.strftime(
                    "%Y:%m:%d %H:%M:%S %Z %z", item["published_parsed"]
                )
                link = item["link"]
                article_data = {
                    "title": title,
                    "article_id": article_id,
                    # "description": description,
                    "published": published,
                    "link": link,
                    "medium": name,
                }
                article = {"rss_id": rss_id, "content": article_data}
                article_count += 1
                process_item(article, rc)
            except Exception as e:
                print("Parsing article failed")
                print(e)
                print(item)
    except Exception as e:
        print("Parsing feed failed.")
        print(e)
        print(feed)

print("Processed articles: " + str(article_count))
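

# A minimal sketch of how the trigram full-text index created above could be
# queried, e.g. from a future search UI. Hypothetical helper, not called by
# this script; note that fts5 trigram queries need at least three characters.
def search_diffs(query):
    sql = (
        "SELECT d.feed_name, d.diff_html, d.diff_time"
        " FROM diffs_fts JOIN diffs d ON d.diff_id = diffs_fts.rowid"
        " WHERE diffs_fts MATCH ? ORDER BY rank"
    )
    return db.execute(sql, (query,)).fetchall()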