From d30baa49f6e5b5a7f28950888ac97385c55dcf69 Mon Sep 17 00:00:00 2001 From: mdivecky Date: Sat, 9 Jul 2022 19:47:11 +0200 Subject: [PATCH] Simple version with weird html diff --- README.md | 1 + config.yaml | 4 +++ headline.py | 73 ++++++++++++++++++++++++++++++++++++++++++++---- requirements.txt | 5 +++- 4 files changed, 76 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e69de29..8558eda 100644 --- a/README.md +++ b/README.md @@ -0,0 +1 @@ +Zatím je potřeba v debianu instalovat wkhtmltopdf balíček \ No newline at end of file diff --git a/config.yaml b/config.yaml index fd812cd..68a3244 100644 --- a/config.yaml +++ b/config.yaml @@ -10,3 +10,7 @@ feeds: - name: "aktualne.cz" rss_source: "https://www.aktualne.cz/rss/" unique_tag: "guid" + + - name: "novinky.cz" + rss_source: "https://www.novinky.cz/rss" + unique_tag: "guid" diff --git a/headline.py b/headline.py index f8af816..3857648 100644 --- a/headline.py +++ b/headline.py @@ -2,14 +2,71 @@ import feedparser import confuse +import redis +import time +import json +import imgkit + +from diff_match_patch import diff_match_patch +import difflib + from pprint import pprint +import hashlib +# +# Idea block: +# +# Můžeme zkusit ke každému ID článku přidat taky hash obsahu, s tím že v začátku budeme kontrolovat jenom změnu hashe a až pak obsah stringů. +# Ale nevím jestli to bude reálně efektivnější +# +# Teď budeme kontrolovat jen titulky, ale postupně můžeme přidat i description článku config = confuse.Configuration('headline', __name__) config.set_file('config.yaml') +dmp = diff_match_patch() +rc = redis.Redis(host='localhost', port=6379, db=0) + +image_options = { + 'width': '450', +} + + +def write_article(article, rc): + rval = json.dumps(article['content']) + rc.set(article['rss_id'], rval) + + +def process_diff(diff, article): + dmp.diff_cleanupSemantic(diff) + html_diff = dmp.diff_prettyHtml(diff) + filename = hashlib.md5(article['rss_id'].encode()).hexdigest() + ".jpg" + image = imgkit.from_string(html_diff, filename, options = {'width': '450'}) + return(True) + + +def process_item(article, rc): + if rc.exists(article['rss_id']): + old = json.loads(rc.get(article['rss_id'])) + new = article['content'] + if old['title'] != new['title']: + print('Article changed. Fuck the world.') + diff = dmp.diff_main(old['title'], new['title']) + process_diff(diff, article) + #write_article(article_rc) + return(True) + else: + # Article is the same. All good! + return(True) + else: + # Article is new, just create it and exit + write_article(article, rc) + + + +article_count = 0 for feed in config['feeds']: rss_source = str(feed['rss_source']) unique_tag = str(feed['unique_tag']) @@ -21,16 +78,20 @@ for feed in config['feeds']: rss_id = item[unique_tag] title = item['title'] description = item['description'] - published = item['published_parsed'] + published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed']) link = item['link'] - article = { - 'rss_id' : rss_id, + article_data = { 'title' : title, 'description': description, 'published' : published, 'link' : link, 'medium' : name } - pprint(article) - break - + article = { + 'rss_id' : rss_id, + 'content' : article_data + } + article_count += 1 + process_item(article, rc) +print("Processed articles:") +print(article_count) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4304fb8..fb3153f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ feedparser -confuse \ No newline at end of file +confuse +redis +diff-match-patch +imgkit \ No newline at end of file