From d30baa49f6e5b5a7f28950888ac97385c55dcf69 Mon Sep 17 00:00:00 2001
From: mdivecky <matej@divecky.com>
Date: Sat, 9 Jul 2022 19:47:11 +0200
Subject: [PATCH] Simple version with weird html diff

---
 README.md        |  1 +
 config.yaml      |  4 +++
 headline.py      | 73 ++++++++++++++++++++++++++++++++++++++++++++----
 requirements.txt |  5 +++-
 4 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index e69de29..8558eda 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1 @@
+Zatím je potřeba v debianu instalovat wkhtmltopdf balíček
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
index fd812cd..68a3244 100644
--- a/config.yaml
+++ b/config.yaml
@@ -10,3 +10,7 @@ feeds:
   - name: "aktualne.cz"
     rss_source: "https://www.aktualne.cz/rss/"
     unique_tag: "guid"
+
+  - name: "novinky.cz"
+    rss_source: "https://www.novinky.cz/rss"
+    unique_tag: "guid"
diff --git a/headline.py b/headline.py
index f8af816..3857648 100644
--- a/headline.py
+++ b/headline.py
@@ -2,14 +2,71 @@
 
 import feedparser
 import confuse
+import redis
+import time
+import json
+import imgkit
+
+from diff_match_patch import diff_match_patch
+import difflib
+
 from pprint import pprint
+import hashlib
 
 
+#
+# Idea block:
+#
+#	We could also store a hash of the content alongside each article ID, checking only for a hash change first and comparing the strings themselves only afterwards.
+#	But I am not sure it would actually be more efficient.
+#
+#	For now we only check headlines, but the article description can be added later.
 
 config = confuse.Configuration('headline', __name__)
 config.set_file('config.yaml')
 
+dmp = diff_match_patch()  # one diff engine instance, used by process_diff and process_item
 
+rc = redis.Redis(host='localhost', port=6379, db=0)  # client handed to process_item for every article
+
+image_options = {  # NOTE(review): presumably meant for imgkit's options= argument - confirm
+	'width': '450',
+}
+
+
+def write_article(article, rc):
+	"""Serialize article['content'] to JSON and store it via rc.set under article['rss_id']."""
+	rc.set(article['rss_id'], json.dumps(article['content']))
+
+
+def process_diff(diff, article):
+	"""Clean up diff, render it as HTML into the image <md5 of rss_id>.jpg, and return True."""
+	dmp.diff_cleanupSemantic(diff)
+	target = hashlib.md5(article['rss_id'].encode()).hexdigest() + ".jpg"
+	imgkit.from_string(dmp.diff_prettyHtml(diff), target, options=image_options)  # module-level options, not a duplicated literal
+	return True
+
+
+def process_item(article, rc):
+	"""Store a new article, or diff and re-store one whose title changed.
+
+	A changed title is rendered via process_diff and then persisted, so the
+	same change is not reported again on the next run. Always returns True.
+	"""
+	stored = rc.get(article['rss_id'])  # single round-trip; None when the key is absent
+	if stored is None:
+		write_article(article, rc)
+		return True
+	old_title, new_title = json.loads(stored)['title'], article['content']['title']
+	if old_title != new_title:
+		print('Article changed. Fuck the world.')
+		process_diff(dmp.diff_main(old_title, new_title), article)
+		write_article(article, rc)
+	return True
+
+
+
+article_count = 0
 for feed in config['feeds']:
 	rss_source = str(feed['rss_source'])
 	unique_tag = str(feed['unique_tag'])
@@ -21,16 +78,20 @@ for feed in config['feeds']:
 		rss_id = item[unique_tag]
 		title = item['title']
 		description = item['description']
-		published = item['published_parsed']
+		published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed'])
 		link = item['link']
-		article = {
-			'rss_id'	:	rss_id,
+		article_data = {
 			'title'		:	title,
 			'description':	description,
 			'published'	:	published,
 			'link'		:	link,
 			'medium'	:	name
 		}
-		pprint(article)
-		break
-
+		article = {
+			'rss_id'	: rss_id, 
+			'content'	: article_data
+		}
+		article_count += 1
+		process_item(article, rc)
+print("Processed articles:")
+print(article_count)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 4304fb8..fb3153f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 feedparser
-confuse
\ No newline at end of file
+confuse
+redis
+diff-match-patch
+imgkit
\ No newline at end of file