Simple version with weird html diff

This commit is contained in:
mdivecky 2022-07-09 19:47:11 +02:00
parent 3274f82616
commit d30baa49f6
4 changed files with 76 additions and 7 deletions

View file

@ -0,0 +1 @@
Zatím je potřeba v debianu instalovat wkhtmltopdf balíček

View file

@ -10,3 +10,7 @@ feeds:
- name: "aktualne.cz" - name: "aktualne.cz"
rss_source: "https://www.aktualne.cz/rss/" rss_source: "https://www.aktualne.cz/rss/"
unique_tag: "guid" unique_tag: "guid"
- name: "novinky.cz"
rss_source: "https://www.novinky.cz/rss"
unique_tag: "guid"

View file

@ -2,14 +2,71 @@
import feedparser import feedparser
import confuse import confuse
import redis
import time
import json
import imgkit
from diff_match_patch import diff_match_patch
import difflib
from pprint import pprint from pprint import pprint
import hashlib
#
# Idea block:
#
# We could also store a hash of the content next to each article ID, checking only the hash first and comparing the strings only when it differs.
# But I am not sure whether that would actually be more efficient.
#
# For now we only check the headlines, but we can gradually add the article description too.
config = confuse.Configuration('headline', __name__) config = confuse.Configuration('headline', __name__)
config.set_file('config.yaml') config.set_file('config.yaml')
# Shared diff_match_patch instance; the functions below use it to compute,
# clean up and render title diffs.
dmp = diff_match_patch()
# Redis client configured for localhost:6379, database 0. Articles are stored
# there as JSON strings keyed by their RSS id.
rc = redis.Redis(host='localhost', port=6379, db=0)
# Image rendering options (output width), presumably meant for imgkit;
# NOTE(review): confirm the unit against the wkhtmltoimage documentation.
image_options = {
'width': '450',
}
def write_article(article, rc):
    """Store an article's content as JSON under its RSS id.

    Args:
        article: Mapping with a 'content' entry (any JSON-serializable value)
            and an 'rss_id' entry that is used as the storage key.
        rc: Client object exposing ``set(key, value)``; the script passes a
            ``redis.Redis`` instance.
    """
    serialized = json.dumps(article['content'])
    storage_key = article['rss_id']
    rc.set(storage_key, serialized)
def process_diff(diff, article):
    """Render a title diff to a JPEG file named after the article's RSS id.

    Args:
        diff: Diff list as returned by ``dmp.diff_main``. It is cleaned up in
            place before being rendered.
        article: Mapping with an 'rss_id' string entry.

    Returns:
        True, always.
    """
    # Merge character-level edits into more readable chunks (mutates `diff`).
    dmp.diff_cleanupSemantic(diff)
    html_diff = dmp.diff_prettyHtml(diff)
    # The MD5 digest only turns the RSS id (often a URL) into a fixed-length,
    # filesystem-safe file name; it has no security role here.
    filename = hashlib.md5(article['rss_id'].encode()).hexdigest() + ".jpg"
    # Use the module-level rendering options instead of repeating the literal,
    # so the image width is configured in one place.
    imgkit.from_string(html_diff, filename, options=image_options)
    return True
def process_item(article, rc):
    """Compare an article's title with the stored version and handle changes.

    A previously unseen article is stored. A known article whose title is
    unchanged is left alone. A known article whose title changed gets its
    diff rendered via ``process_diff`` and the stored copy is then replaced.

    Args:
        article: Mapping with 'rss_id' (storage key) and 'content' (mapping
            that contains at least a 'title' entry).
        rc: Client object exposing ``get(key)`` and ``set(key, value)``; the
            script passes a ``redis.Redis`` instance.

    Returns:
        True, always.
    """
    # A single GET replaces EXISTS + GET: one round trip fewer, and no window
    # in which the key can disappear between the two calls and make
    # json.loads() fail on None.
    stored = rc.get(article['rss_id'])
    if stored is None:
        # Article is new, just create it and exit
        write_article(article, rc)
        return True

    old = json.loads(stored)
    new = article['content']
    if old['title'] == new['title']:
        # Article is the same. All good!
        return True

    print('Article changed. Fuck the world.')
    diff = dmp.diff_main(old['title'], new['title'])
    process_diff(diff, article)
    # Persist the new version; otherwise the same change would be detected
    # and rendered again on every subsequent run.
    write_article(article, rc)
    return True
article_count = 0
for feed in config['feeds']: for feed in config['feeds']:
rss_source = str(feed['rss_source']) rss_source = str(feed['rss_source'])
unique_tag = str(feed['unique_tag']) unique_tag = str(feed['unique_tag'])
@ -21,16 +78,20 @@ for feed in config['feeds']:
rss_id = item[unique_tag] rss_id = item[unique_tag]
title = item['title'] title = item['title']
description = item['description'] description = item['description']
published = item['published_parsed'] published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed'])
link = item['link'] link = item['link']
article = { article_data = {
'rss_id' : rss_id,
'title' : title, 'title' : title,
'description': description, 'description': description,
'published' : published, 'published' : published,
'link' : link, 'link' : link,
'medium' : name 'medium' : name
} }
pprint(article) article = {
break 'rss_id' : rss_id,
'content' : article_data
}
article_count += 1
process_item(article, rc)
print("Processed articles:")
print(article_count)

View file

@ -1,2 +1,5 @@
feedparser feedparser
confuse confuse
redis
diff-match-patch
imgkit