Simple version with weird html diff

2025-01-31 11:53:35 +01:00 · 2022-07-09 19:47:11 +02:00 · 2022-07-09 19:47:11 +02:00 · d30baa49f6
commit d30baa49f6
parent 3274f82616
4 changed files with 76 additions and 7 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1 @@
+Zatím je potřeba v debianu instalovat wkhtmltopdf balíček
--- a/config.yaml
+++ b/config.yaml
@ -10,3 +10,7 @@ feeds:
  - name: "aktualne.cz"
    rss_source: "https://www.aktualne.cz/rss/"
    unique_tag: "guid"
+
+  - name: "novinky.cz"
+    rss_source: "https://www.novinky.cz/rss"
+    unique_tag: "guid"
--- a/headline.py
+++ b/headline.py
@ -2,14 +2,71 @@

 import feedparser
 import confuse
+import redis
+import time
+import json
+import imgkit
+
+from diff_match_patch import diff_match_patch
+import difflib
+
 from pprint import pprint
+import hashlib


+#
+# Idea block:
+#
+#	We could also store a content hash alongside each article ID, checking only for a hash change first and comparing the string contents only afterwards.
+#	But I'm not sure whether that would actually be more efficient.
+#
+#	For now we only check headlines, but later we can add the article description as well.

 config = confuse.Configuration('headline', __name__)
 config.set_file('config.yaml')

+# Shared diff_match_patch instance used by the functions below to compute
+# and render title diffs.
+dmp = diff_match_patch()

+# Redis client for localhost:6379, database 0; handed to process_item for
+# every article in the feed loop.
+rc = redis.Redis(host='localhost', port=6379, db=0)
+
+# Image rendering options (width '450').
+# NOTE(review): presumably meant to be passed to imgkit - confirm.
+image_options = {
+	'width': '450',
+}
+
+
+def write_article(article, rc):
+	"""Serialize article['content'] to JSON and store it via rc.set under article['rss_id']."""
+	serialized_content = json.dumps(article['content'])
+	rc.set(article['rss_id'], serialized_content)
+
+
+def process_diff(diff, article):
+	"""Render a title diff to a JPEG file named after the article's RSS id.
+
+	diff is the diff list produced by dmp.diff_main; it is passed through
+	dmp.diff_cleanupSemantic before being converted to HTML. The image is
+	written to a relative path "<md5 hex of rss_id>.jpg".
+
+	Always returns True.
+	"""
+	dmp.diff_cleanupSemantic(diff)
+	html_diff = dmp.diff_prettyHtml(diff)
+	# Hash the id so the filename does not depend on which characters the
+	# feed uses in its unique tag.
+	filename = hashlib.md5(article['rss_id'].encode()).hexdigest() + ".jpg"
+	# Use the shared module-level options rather than a duplicated literal,
+	# so the rendering settings live in one place.
+	imgkit.from_string(html_diff, filename, options=image_options)
+	return True
+
+
+def process_item(article, rc):
+	"""Compare an article with its stored version and record any title change.
+
+	article is a dict with 'rss_id' (the storage key) and 'content' (a dict
+	that includes 'title'). rc is the client used to read and write the
+	stored JSON.
+
+	- Unknown article: it is stored and nothing else happens.
+	- Known article with a different title: a diff image is rendered via
+	  process_diff and the stored version is replaced with the new one.
+	- Known article with the same title: nothing is written.
+
+	Always returns True.
+	"""
+	# A single get() replaces exists() + get(): one round trip, and no window
+	# in which the key can disappear between the two calls.
+	stored = rc.get(article['rss_id'])
+	if stored is None:
+		# Article is new, just create it and exit
+		write_article(article, rc)
+		return True
+
+	old = json.loads(stored)
+	new = article['content']
+	if old['title'] != new['title']:
+		print('Article changed. Fuck the world.')
+		diff = dmp.diff_main(old['title'], new['title'])
+		process_diff(diff, article)
+		# Persist the new version; otherwise the same change would be
+		# detected and rendered again on every subsequent run.
+		write_article(article, rc)
+	# Unchanged articles need no action.
+	return True
+
+
+
+article_count = 0
 for feed in config['feeds']:
 	rss_source = str(feed['rss_source'])
 	unique_tag = str(feed['unique_tag'])
@ -21,16 +78,20 @@ for feed in config['feeds']:
 		rss_id = item[unique_tag]
 		title = item['title']
 		description = item['description']
-		published = item['published_parsed']
+		published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed'])
 		link = item['link']
-		article = {
-			'rss_id'	:	rss_id,
+		article_data = {
 			'title'		:	title,
 			'description':	description,
 			'published'	:	published,
 			'link'		:	link,
 			'medium'	:	name
 		}
-		pprint(article)
-		break
-
+		article = {
+			'rss_id'	: rss_id, 
+			'content'	: article_data
+		}
+		article_count += 1
+		process_item(article, rc)
+print("Processed articles:")
+print(article_count)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,5 @@
 feedparser
-confuse
+confuse
+redis
+diff-match-patch
+imgkit
				`@ -0,0 +1 @@`
				`Zatím je potřeba v debianu instalovat wkhtmltopdf balíček`