Simple version with weird html diff

2025-01-31 11:53:35 +01:00 · 2022-07-09 19:47:11 +02:00 · 2022-07-09 19:47:11 +02:00 · d30baa49f6
commit d30baa49f6
parent 3274f82616
4 changed files with 76 additions and 7 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1 @@
 Zatím je potřeba v debianu instalovat wkhtmltopdf balíček
--- a/config.yaml
+++ b/config.yaml
@ -10,3 +10,7 @@ feeds:
  - name: "aktualne.cz"
    rss_source: "https://www.aktualne.cz/rss/"
    unique_tag: "guid"
  - name: "novinky.cz"
    rss_source: "https://www.novinky.cz/rss"
    unique_tag: "guid"
--- a/headline.py
+++ b/headline.py
@ -2,14 +2,71 @@
 import feedparser
 import confuse
 import redis
 import time
 import json
 import imgkit
 from diff_match_patch import diff_match_patch
 import difflib
 from pprint import pprint
 import hashlib
 #
 # Idea block:
 #
 #	We could store a hash of the content alongside every article ID, so that we
 #	first check only whether the hash changed and compare the strings afterwards.
 #	Not sure whether that would really be more efficient, though.
 #
 #	For now only headlines are checked, but the article description can be added later.
 # Application configuration, read from config.yaml.
 config = confuse.Configuration('headline', __name__)
 config.set_file('config.yaml')
 # Shared diff_match_patch instance used to compute and render title diffs.
 dmp = diff_match_patch()
 # Redis connection (local instance, database 0) holding the last seen version of each article.
 rc = redis.Redis(host='localhost', port=6379, db=0)
 # NOTE(review): presumably the options meant for imgkit rendering — confirm against process_diff.
 image_options = {
 	'width': '450',
 }
 def write_article(article, rc):
 	"""Store an article's content in Redis as JSON, keyed by its RSS id.
 
 	article: dict with an 'rss_id' key and a JSON-serialisable 'content' value.
 	rc: client object exposing set(key, value).
 	"""
 	serialized = json.dumps(article['content'])
 	key = article['rss_id']
 	rc.set(key, serialized)
 def process_diff(diff, article):
 	"""Render a title diff to a JPEG image file.
 
 	diff: diff list as returned by dmp.diff_main(); it is cleaned up in place.
 	article: dict whose 'rss_id' is hashed (MD5) to build the output file name.
 
 	Returns True once imgkit has been asked to write the image.
 	"""
 	# Semantic cleanup mutates the diff list so the rendered diff is easier to read.
 	dmp.diff_cleanupSemantic(diff)
 	html_diff = dmp.diff_prettyHtml(diff)
 	# The RSS id may be a URL, so hash it to obtain a filesystem-safe name.
 	filename = hashlib.md5(article['rss_id'].encode()).hexdigest() + ".jpg"
 	# Reuse the module-level image options instead of duplicating them, and tell
 	# wkhtmltoimage the input is UTF-8: the HTML fragment carries no charset
 	# declaration, so non-ASCII (Czech) titles would otherwise risk being garbled.
 	options = {**image_options, 'encoding': 'UTF-8'}
 	imgkit.from_string(html_diff, filename, options=options)
 	return True
 def process_item(article, rc):
 	"""Compare an article with its stored version and react to title changes.
 
 	article: dict with 'rss_id' and 'content' (the content dict has a 'title').
 	rc: Redis client used to read and write the stored article.
 
 	A new article is simply stored. When the title differs from the stored
 	one, a diff image is rendered and the stored version is replaced, so the
 	same change is not reported again on the next run.
 
 	Returns True in every case.
 	"""
 	# A single GET replaces EXISTS + GET: one round-trip, and no window in which
 	# the key can vanish between the two calls.
 	stored = rc.get(article['rss_id'])
 	if stored is None:
 		# Article is new, just create it and exit
 		write_article(article, rc)
 		return True
 	old = json.loads(stored)
 	new = article['content']
 	if old['title'] == new['title']:
 		# Article is the same. All good!
 		return True
 	print('Article changed. Fuck the world.')
 	diff = dmp.diff_main(old['title'], new['title'])
 	process_diff(diff, article)
 	# Persist the new version; otherwise every later run would detect the same
 	# change again and regenerate the image.
 	write_article(article, rc)
 	return True
 article_count = 0
 for feed in config['feeds']:
 	rss_source = str(feed['rss_source'])
 	unique_tag = str(feed['unique_tag'])
@ -21,16 +78,20 @@ for feed in config['feeds']:
 		rss_id = item[unique_tag]
 		title = item['title']
 		description = item['description']
-		published = item['published_parsed']
+		published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed'])
 		link = item['link']
-		article = {
+		article_data = {
 			'rss_id'	:	rss_id,
 			'title'		:	title,
 			'description':	description,
 			'published'	:	published,
 			'link'		:	link,
 			'medium'	:	name
 		}
-		pprint(article)
+		article = {
-		break
+			'rss_id'	: rss_id, 
-
+			'content'	: article_data
 		}
 		article_count += 1
 		process_item(article, rc)
 print("Processed articles:")
 print(article_count)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,5 @@
 feedparser
-confuse
+confuse
 redis
 diff-match-patch
 imgkit
		`@ -0,0 +1 @@`
							`Zatím je potřeba v debianu instalovat wkhtmltopdf balíček`