#!/usr/bin/python3

import feedparser
import confuse
import redis
import time
import json
import sqlite3
import hashlib

from diff_match_patch import diff_match_patch


class DiffThing(diff_match_patch):
    def diff_html(self, diffs):
        """Like diff_prettyHtml, but without inline style attributes
        (makes it easier to style it with CSS).
        """
        html = []
        for op, data in diffs:
            text = (
                data.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\n", "&para;<br>")
            )
            if op == self.DIFF_INSERT:
                html.append("<ins>%s</ins>" % text)
            elif op == self.DIFF_DELETE:
                html.append("<del>%s</del>" % text)
            elif op == self.DIFF_EQUAL:
                html.append("<span>%s</span>" % text)
        return "".join(html)


#
# Idea block:
#
# We could try adding a content hash to each article ID as well, checking
# first only whether the hash changed and comparing the string contents only
# afterwards. But I don't know whether it would actually be more efficient.
# A minimal sketch of that fast path follows below.
#
# For now we only check titles, but over time we can add the article
# description as well.
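
# A minimal sketch of the hash fast path described above, assuming we would
# store the fingerprint next to the article in Redis. content_fingerprint is
# a hypothetical helper; it is not wired into process_item yet.
def content_fingerprint(content):
    # Hash the fields we care about; missing keys fall back to empty strings
    # so the fingerprint is always computable.
    raw = content.get("title", "") + content.get("description", "")
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()
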
CONFIG_FILE = "../data/config.yaml"
REDIS_ARTICLE_EXPIRE_SEC = 604800  # 7 days
config = confuse.Configuration("headline", __name__)
config.set_file(CONFIG_FILE)
dmp = DiffThing()
rc = redis.Redis(host="redis", port=6379, db=0)
db_con = sqlite3.connect("../data/diffs.db")
db = db_con.cursor()
db.executescript(
    """
    PRAGMA journal_mode=WAL;

    CREATE TABLE IF NOT EXISTS diffs (
        diff_id INTEGER PRIMARY KEY,
        article_id TEXT,
        feed_name TEXT NOT NULL,
        article_url TEXT NOT NULL,
        title_orig TEXT NOT NULL,
        title_new TEXT NOT NULL,
        diff_html TEXT NOT NULL,
        diff_time TEXT
    );

    CREATE VIRTUAL TABLE IF NOT EXISTS diffs_fts USING fts5(
        title_orig,
        title_new,
        content="diffs",
        content_rowid="diff_id",
        tokenize="trigram case_sensitive 0"
    );

    -- Rebuild the search index: useful when creating the table,
    -- or when it is externally updated.
    INSERT INTO diffs_fts(diffs_fts) VALUES ('rebuild');

    CREATE TRIGGER IF NOT EXISTS diffs_ainsert AFTER INSERT ON diffs
    BEGIN
        INSERT INTO diffs_fts (rowid, title_orig, title_new)
        VALUES (new.diff_id, new.title_orig, new.title_new);
    END;

    CREATE TRIGGER IF NOT EXISTS diffs_adelete AFTER DELETE ON diffs
    BEGIN
        INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new)
        VALUES ('delete', old.diff_id, old.title_orig, old.title_new);
    END;

    CREATE TRIGGER IF NOT EXISTS diffs_aupdate AFTER UPDATE ON diffs
    BEGIN
        INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new)
        VALUES ('delete', old.diff_id, old.title_orig, old.title_new);
        INSERT INTO diffs_fts (rowid, title_orig, title_new)
        VALUES (new.diff_id, new.title_orig, new.title_new);
    END;
    """
)
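
# A minimal sketch of querying the FTS5 index defined above; search_diffs is
# a hypothetical helper (nothing in this script calls it). MATCH runs the
# query against the trigram-tokenized titles, the join back to diffs recovers
# the full rows, and bm25() sorts best matches first.
def search_diffs(query, limit=20):
    sql = (
        "SELECT diffs.* FROM diffs_fts"
        " JOIN diffs ON diffs.diff_id = diffs_fts.rowid"
        " WHERE diffs_fts MATCH ?"
        " ORDER BY bm25(diffs_fts) LIMIT ?"
    )
    return db.execute(sql, (query, limit)).fetchall()
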
article_count = 0


def write_article(article, rc):
    rval = json.dumps(article["content"])
    rc.set(article["rss_id"], rval, ex=REDIS_ARTICLE_EXPIRE_SEC)


def process_diff(old, new, rss_id):
    diff = dmp.diff_main(old["title"], new["title"])
    dmp.diff_cleanupSemantic(diff)
    html_diff = dmp.diff_html(diff)
    # print(old['link'])
    # print(diff)

    sql = "INSERT INTO diffs(article_id, feed_name, article_url, title_orig, title_new, diff_html, diff_time) VALUES (?,?,?,?,?,?,datetime('now', 'localtime'))"
    sql_data = (
        new["article_id"],
        old["medium"],
        old["link"],
        old["title"],
        new["title"],
        html_diff,
    )
    db.execute(sql, sql_data)
    db_con.commit()

    return True


def process_item(article, rc):
    if rc.exists(article["rss_id"]):
        old = json.loads(rc.get(article["rss_id"]))
        new = article["content"]
        if old["title"] != new["title"]:
            # print('Article changed. World is fucked.')
            process_diff(old, new, article["rss_id"])
            write_article(article, rc)
            return True
        else:
            # Article is the same. All good!
            return True
    else:
        # Article is new, just create it and exit
        write_article(article, rc)


def create_article_id(uid, feed):
    # Create a unique ID from the RSS unique tag and the feed name,
    # used to reference the article in the database.
    id_string = str(uid) + str(feed)
    id_bytes = id_string.encode("utf-8")
    article_id = hashlib.sha256(id_bytes).hexdigest()
    return article_id


for feed in config["feeds"]:
    try:
        rss_source = str(feed["rss_source"])
        unique_tag = str(feed["unique_tag"])
        name = str(feed["name"])

        rss = feedparser.parse(rss_source)

        for item in rss["entries"]:
            try:
                rss_id = item[unique_tag]
                title = item["title"]
                article_id = create_article_id(rss_id, name)
                # description = item['description']  # Don't store the description for now; we don't need it and it's big.
                published = time.strftime(
                    "%Y:%m:%d %H:%M:%S %Z %z", item["published_parsed"]
                )
                link = item["link"]
                article_data = {
                    "title": title,
                    "article_id": article_id,
                    # 'description': description,
                    "published": published,
                    "link": link,
                    "medium": name,
                }
                article = {"rss_id": rss_id, "content": article_data}
                article_count += 1
                process_item(article, rc)
            except Exception as e:
                print("Parsing article failed")
                print(e)
                print(item)
    except Exception as e:
        print("Parsing feed failed.")
        print(e)
        print(feed)
print("Processed articles: " + str(article_count))