#!/usr/bin/python3
import feedparser
import confuse
import redis
import time
import json
import sqlite3
import hashlib
from diff_match_patch import diff_match_patch

#
# Idea block:
#
# We could store a hash of the content alongside each article ID, so that we
# first check whether the hash changed and only then compare the actual strings.
# I'm not sure that would actually be more efficient, though.
#
# For now we only check titles; the article description can be added later.

CONFIG_FILE = "../data/config.yaml"
REDIS_ARTICLE_EXPIRE_SEC = 604800  # keep cached articles for 7 days

# Feed configuration
config = confuse.Configuration('headline', __name__)
config.set_file(CONFIG_FILE)

dmp = diff_match_patch()

# Redis caches the last seen version of each article, keyed by its RSS unique tag
rc = redis.Redis(host='redis', port=6379, db=0)

# SQLite stores the recorded title diffs; the FTS5 table mirrors `diffs`
# for full-text search and is kept in sync by the triggers below.
db_con = sqlite3.connect("../data/diffs.db")
db = db_con.cursor()
db.executescript("""
    PRAGMA journal_mode=WAL;

    CREATE TABLE IF NOT EXISTS diffs (
        diff_id INTEGER PRIMARY KEY,
        article_id TEXT,
        feed_name TEXT NOT NULL,
        article_url TEXT NOT NULL,
        title_orig TEXT NOT NULL,
        title_new TEXT NOT NULL,
        diff_html TEXT NOT NULL,
        diff_time TEXT
    );

    CREATE VIRTUAL TABLE IF NOT EXISTS diffs_fts USING fts5(
        title_orig,
        title_new,
        content="diffs",
        content_rowid="diff_id",
        tokenize="trigram case_sensitive 0"
    );

    -- Rebuild the search index; useful when creating the table,
    -- or when it is updated externally.
    INSERT INTO diffs_fts(diffs_fts) VALUES ('rebuild');

    CREATE TRIGGER IF NOT EXISTS diffs_ainsert AFTER INSERT ON diffs BEGIN
        INSERT INTO diffs_fts (rowid, title_orig, title_new)
        VALUES (new.diff_id, new.title_orig, new.title_new);
    END;

    CREATE TRIGGER IF NOT EXISTS diffs_adelete AFTER DELETE ON diffs BEGIN
        INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new)
        VALUES ('delete', old.diff_id, old.title_orig, old.title_new);
    END;

    CREATE TRIGGER IF NOT EXISTS diffs_aupdate AFTER UPDATE ON diffs BEGIN
        INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new)
        VALUES ('delete', old.diff_id, old.title_orig, old.title_new);
        INSERT INTO diffs_fts (rowid, title_orig, title_new)
        VALUES (new.diff_id, new.title_orig, new.title_new);
    END;
""")

article_count = 0


def write_article(article, rc):
    # Cache the latest version of the article in Redis under its RSS unique tag
    rval = json.dumps(article['content'])
    rc.set(article['rss_id'], rval, ex=REDIS_ARTICLE_EXPIRE_SEC)


def process_diff(old, new, rss_id):
    # Build a semantic diff of the old and new title and record it in SQLite
    diff = dmp.diff_main(old['title'], new['title'])
    dmp.diff_cleanupSemantic(diff)
    html_diff = dmp.diff_prettyHtml(diff)
    # print(old['link'])
    # print(diff)
    sql = "INSERT INTO diffs(article_id, feed_name, article_url, title_orig, title_new, diff_html, diff_time) VALUES (?,?,?,?,?,?,datetime('now', 'localtime'))"
    sql_data = (new['article_id'], old['medium'], old['link'], old['title'], new['title'], html_diff)
    db.execute(sql, sql_data)
    db_con.commit()
    return True


def process_item(article, rc):
    # Compare the incoming article with the cached version, if there is one
    if rc.exists(article['rss_id']):
        old = json.loads(rc.get(article['rss_id']))
        new = article['content']
        if old['title'] != new['title']:
            # print('Article changed. World is fucked.')
            diff = process_diff(old, new, article['rss_id'])
            write_article(article, rc)
            return True
        else:
            # Article is the same. All good!
            return True
    else:
        # Article is new, just create it and exit
        write_article(article, rc)


def create_article_id(uid, feed):
    # Create a unique ID from the RSS unique tag and the feed name
    # to reference the article in the database
    id_string = str(uid) + str(feed)
    id_bytes = id_string.encode('utf-8')
    article_id = hashlib.sha256(id_bytes).hexdigest()
    return article_id


# Walk every configured feed and process each of its entries
for feed in config['feeds']:
    try:
        rss_source = str(feed['rss_source'])
        unique_tag = str(feed['unique_tag'])
        name = str(feed['name'])

        rss = feedparser.parse(rss_source)

        for item in rss['entries']:
            try:
                rss_id = item[unique_tag]
                title = item['title']
                article_id = create_article_id(rss_id, name)
                # description = item['description']  # Don't store the description for now; we don't need it and it's big.
                published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed'])
                link = item['link']

                article_data = {
                    'title': title,
                    'article_id': article_id,
                    # 'description': description,
                    'published': published,
                    'link': link,
                    'medium': name
                }

                article = {
                    'rss_id': rss_id,
                    'content': article_data
                }

                article_count += 1
                process_item(article, rc)
            except Exception as e:
                print("Parsing article failed")
                print(e)
                print(item)
    except Exception as e:
        print("Parsing feed failed.")
        print(e)
        print(feed)

print("Processed articles: " + str(article_count))
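

# Minimal sketch of the "content hash" idea from the comment block at the top
# of this file. It is not wired into the pipeline; the helper name and the
# choice of hashed fields are assumptions, not part of the current design.
# The idea: store a SHA-256 digest of the compared fields next to the cached
# article, compare digests first in process_item(), and only fall back to the
# per-field string comparison when the digest differs.
def content_hash(article_data):
    # Hash only the title for now; the description could be appended later.
    payload = article_data['title']
    return hashlib.sha256(payload.encode('utf-8')).hexdigest()

# Hypothetical usage inside process_item():
#     if old.get('content_hash') == content_hash(new):
#         return True  # digests match, nothing to diff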