#!/usr/bin/python3
# headline/processor/app.py
#
# Poll configured RSS feeds, cache each article's content in Redis and
# record any title changes as HTML diffs in a local SQLite database.
import feedparser
import confuse
import redis
import time
import json
import sqlite3
import hashlib
from diff_match_patch import diff_match_patch

#
# Idea block:
#
# We could try attaching a content hash to every article ID, so that at first
# we only check whether the hash changed and only then compare the actual
# strings. Not sure it would really be more efficient, though.
#
# For now we only check titles, but we can gradually add the article
# description as well.

CONFIG_FILE = "../data/config.yaml"
# Cached articles expire after 7 days (in seconds).
REDIS_ARTICLE_EXPIRE_SEC = 604800
# --- Runtime setup: config, differ, Redis cache and SQLite diff store ---

config = confuse.Configuration('headline', __name__)
config.set_file(CONFIG_FILE)

dmp = diff_match_patch()

# Redis holds the last-seen content per article; SQLite stores the diffs.
rc = redis.Redis(host='redis', port=6379, db=0)
db_con = sqlite3.connect("../data/diffs.db")
db = db_con.cursor()
# Schema plus an FTS5 index kept in sync with `diffs` via triggers; the
# explicit 'rebuild' covers the first run and any external updates.
db.executescript("""
PRAGMA journal_mode=WAL;
CREATE TABLE IF NOT EXISTS diffs (
diff_id INTEGER PRIMARY KEY,
article_id TEXT,
feed_name TEXT NOT NULL,
article_url TEXT NOT NULL,
title_orig TEXT NOT NULL,
title_new TEXT NOT NULL,
diff_html TEXT NOT NULL,
diff_time TEXT
);
CREATE VIRTUAL TABLE IF NOT EXISTS diffs_fts USING fts5(
title_orig,
title_new,
content="diffs",
content_rowid="diff_id",
tokenize="trigram case_sensitive 0"
);
-- rebuild search index
-- useful when creating the table, or when it is externally updated
INSERT INTO diffs_fts(diffs_fts) VALUES ('rebuild');
CREATE TRIGGER IF NOT EXISTS diffs_ainsert AFTER INSERT ON diffs
BEGIN
INSERT INTO diffs_fts (rowid, title_orig, title_new)
VALUES (new.diff_id, new.title_orig, new.title_new);
END;
CREATE TRIGGER IF NOT EXISTS diffs_adelete AFTER DELETE ON diffs
BEGIN
INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new)
VALUES ('delete', old.diff_id, old.title_orig, old.title_new);
END;
CREATE TRIGGER IF NOT EXISTS diffs_aupdate AFTER UPDATE ON diffs
BEGIN
INSERT INTO diffs_fts (diffs_fts, rowid, title_orig, title_new)
VALUES ('delete', old.diff_id, old.title_orig, old.title_new);
INSERT INTO diffs_fts (rowid, title_orig, title_new)
VALUES (new.diff_id, new.title_orig, new.title_new);
END;
""")

article_count = 0
def write_article(article, rc):
    # Cache the article content in Redis, keyed by its RSS ID. The entry
    # expires automatically, so stale articles age out of the cache.
    serialized = json.dumps(article['content'])
    rc.set(article['rss_id'], serialized, ex=REDIS_ARTICLE_EXPIRE_SEC)
def process_diff(old, new, rss_id):
    """Record a title change by persisting an HTML diff to SQLite.

    old -- previously cached article content dict (title, link, medium, ...)
    new -- freshly fetched article content dict
    rss_id -- the article's RSS unique tag; currently unused, kept for
              interface stability

    Returns True.
    """
    diff = dmp.diff_main(old['title'], new['title'])
    # Collapse character-level noise into human-readable chunks.
    dmp.diff_cleanupSemantic(diff)
    html_diff = dmp.diff_prettyHtml(diff)

    sql = "INSERT INTO diffs(article_id, feed_name, article_url, title_orig, title_new, diff_html, diff_time) VALUES (?,?,?,?,?,?,datetime('now', 'localtime'))"
    sql_data = (new['article_id'], old['medium'], old['link'], old['title'], new['title'], html_diff)

    db.execute(sql, sql_data)
    db_con.commit()

    return True
def process_item(article, rc):
    """Compare a freshly fetched article against the cached copy in Redis.

    New articles are simply cached. For known articles, a changed title is
    diffed and recorded, and the cache is refreshed with the new content.

    Returns True when the article was already known (changed or not); None
    when it was new (matches the original behavior).
    """
    if rc.exists(article['rss_id']):
        old = json.loads(rc.get(article['rss_id']))
        new = article['content']
        if old['title'] != new['title']:
            process_diff(old, new, article['rss_id'])
            # Refresh the cache so the next run diffs against the new title.
            write_article(article, rc)
        # else: article is unchanged, nothing to do.
        return True
    # Article is new, just create it and exit.
    write_article(article, rc)
def create_article_id(uid, feed):
    """Derive a stable, unique ID to reference the article in the database.

    The article's RSS unique tag and the feed name are concatenated and
    hashed, so the same article always maps to the same ID.
    """
    raw = "{}{}".format(uid, feed).encode('utf-8')
    return hashlib.sha256(raw).hexdigest()
# Main loop: fetch every configured feed and process each of its entries.
# Failures are logged and skipped so one broken feed or malformed entry
# cannot kill the whole run.
for feed in config['feeds']:
    try:
        rss_source = str(feed['rss_source'])
        unique_tag = str(feed['unique_tag'])  # which RSS field uniquely identifies an entry
        name = str(feed['name'])
        rss = feedparser.parse(rss_source)
        for item in rss['entries']:
            try:
                rss_id = item[unique_tag]
                title = item['title']
                article_id = create_article_id(rss_id, name)
                # Don't store the description for now: we don't need it and it's big.
                published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed'])
                link = item['link']
                article_data = {
                    'title': title,
                    'article_id': article_id,
                    'published': published,
                    'link': link,
                    'medium': name,
                }
                article = {
                    'rss_id': rss_id,
                    'content': article_data,
                }
                article_count += 1
                process_item(article, rc)
            except Exception as e:
                # Best-effort: log the bad entry and continue with the feed.
                print("Parsing article failed")
                print(e)
                print(item)
    except Exception as e:
        # Best-effort: log the bad feed and continue with the next one.
        print("Parsing feed failed.")
        print(e)
        print(feed)

print("Processed articles: " + str(article_count))