#!/usr/bin/python3
# headline/processor/app.py
#
# Poll configured RSS feeds, cache each article's content in Redis and
# record any title changes as HTML diffs in a local SQLite database.
import feedparser
import confuse
import redis
import time
import json
import sqlite3
import hashlib
from diff_match_patch import diff_match_patch

#
# Idea block:
#
# We could try attaching a content hash to every article ID, so that at first
# we only check whether the hash changed and only then compare the actual
# strings. Not sure it would really be more efficient, though.
#
# For now we only check titles, but we can gradually add the article
# description as well.

CONFIG_FILE = "../data/config.yaml"
# Cached articles expire after 7 days (in seconds).
REDIS_ARTICLE_EXPIRE_SEC = 604800
# --- Runtime setup: config, differ, Redis cache and SQLite diff store ---

config = confuse.Configuration('headline', __name__)
config.set_file(CONFIG_FILE)

dmp = diff_match_patch()

# Redis holds the last-seen content per article; SQLite stores the diffs.
rc = redis.Redis(host='redis', port=6379, db=0)
db_con = sqlite3.connect("../data/diffs.db")
db = db_con.cursor()
# Schema plus an FTS5 index kept in sync with `diffs` via triggers; the
# explicit 'rebuild' covers the first run and any external updates.
db.executescript("""
PRAGMA journal_mode=WAL;
CREATE TABLE IF NOT EXISTS diffs (
diff_id INTEGER PRIMARY KEY,
article_id TEXT,
feed_name TEXT NOT NULL,
article_url TEXT NOT NULL,
title_orig TEXT NOT NULL,
title_new TEXT NOT NULL,
diff_html TEXT NOT NULL,
diff_time TEXT
);
CREATE VIRTUAL TABLE IF NOT EXISTS diffs_fts USING fts5(
title_orig,
title_new,
content="diffs",
content_rowid="diff_id",
tokenize="trigram case_sensitive 0"
);
-- rebuild search index
-- useful when creating the table, or when it is externally updated
INSERT INTO diffs_fts(diffs_fts) VALUES ('rebuild');
CREATE TRIGGER IF NOT EXISTS diffs_ainsert AFTER INSERT ON diffs
BEGIN
INSERT INTO diffs_fts (rowid, title_orig, title_new)
VALUES (new.diff_id, new.title_orig, new.title_new);
END;
CREATE TRIGGER IF NOT EXISTS diffs_adelete AFTER DELETE ON diffs
BEGIN
INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new)
VALUES ('delete', old.diff_id, old.title_orig, old.title_new);
END;
CREATE TRIGGER IF NOT EXISTS diffs_aupdate AFTER UPDATE ON diffs
BEGIN
INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new)
VALUES ('delete', old.diff_id, old.title_orig, old.title_new);
INSERT INTO diffs_fts (rowid, title_orig, title_new)
VALUES (new.diff_id, new.title_orig, new.title_new);
END;
""")

article_count = 0
def write_article(article, rc):
    # Cache the article content in Redis, keyed by its RSS ID. The entry
    # expires automatically, so stale articles age out of the cache.
    serialized = json.dumps(article['content'])
    rc.set(article['rss_id'], serialized, ex=REDIS_ARTICLE_EXPIRE_SEC)
def process_diff(old, new, rss_id):
    """Record a title change by persisting an HTML diff to SQLite.

    old -- previously cached article content dict (title, link, medium, ...)
    new -- freshly fetched article content dict
    rss_id -- the article's RSS unique tag; currently unused, kept for
              interface stability

    Returns True.
    """
    diff = dmp.diff_main(old['title'], new['title'])
    # Collapse character-level noise into human-readable chunks.
    dmp.diff_cleanupSemantic(diff)
    html_diff = dmp.diff_prettyHtml(diff)

    sql = "INSERT INTO diffs(article_id, feed_name, article_url, title_orig, title_new, diff_html, diff_time) VALUES (?,?,?,?,?,?,datetime('now', 'localtime'))"
    sql_data = (new['article_id'], old['medium'], old['link'], old['title'], new['title'], html_diff)

    db.execute(sql, sql_data)
    db_con.commit()

    return True
def process_item(article, rc):
    """Compare a freshly fetched article against the cached copy in Redis.

    New articles are simply cached. For known articles, a changed title is
    diffed and recorded, and the cache is refreshed with the new content.

    Returns True when the article was already known (changed or not); None
    when it was new (matches the original behavior).
    """
    if rc.exists(article['rss_id']):
        old = json.loads(rc.get(article['rss_id']))
        new = article['content']
        if old['title'] != new['title']:
            process_diff(old, new, article['rss_id'])
            # Refresh the cache so the next run diffs against the new title.
            write_article(article, rc)
        # else: article is unchanged, nothing to do.
        return True
    # Article is new, just create it and exit.
    write_article(article, rc)
def create_article_id(uid, feed):
    """Derive a stable, unique ID to reference the article in the database.

    The article's RSS unique tag and the feed name are concatenated and
    hashed, so the same article always maps to the same ID.
    """
    raw = "{}{}".format(uid, feed).encode('utf-8')
    return hashlib.sha256(raw).hexdigest()
# Main loop: fetch every configured feed and process each of its entries.
# Failures are logged and skipped so one broken feed or malformed entry
# cannot kill the whole run.
for feed in config['feeds']:
    try:
        rss_source = str(feed['rss_source'])
        unique_tag = str(feed['unique_tag'])  # which RSS field uniquely identifies an entry
        name = str(feed['name'])
        rss = feedparser.parse(rss_source)
        for item in rss['entries']:
            try:
                rss_id = item[unique_tag]
                title = item['title']
                article_id = create_article_id(rss_id, name)
                # Don't store the description for now: we don't need it and it's big.
                published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed'])
                link = item['link']
                article_data = {
                    'title': title,
                    'article_id': article_id,
                    'published': published,
                    'link': link,
                    'medium': name,
                }
                article = {
                    'rss_id': rss_id,
                    'content': article_data,
                }
                article_count += 1
                process_item(article, rc)
            except Exception as e:
                # Best-effort: log the bad entry and continue with the feed.
                print("Parsing article failed")
                print(e)
                print(item)
    except Exception as e:
        # Best-effort: log the bad feed and continue with the next one.
        print("Parsing feed failed.")
        print(e)
        print(feed)

print("Processed articles: " + str(article_count))