mirror of
https://git.nolog.cz/NoLog.cz/headline.git
synced 2025-01-31 11:53:35 +01:00
Simple version with weird html diff
This commit is contained in:
parent
3274f82616
commit
d30baa49f6
4 changed files with 76 additions and 7 deletions
|
@ -0,0 +1 @@
|
||||||
|
Zatím je potřeba v Debianu ručně nainstalovat balíček wkhtmltopdf (vyžaduje ho imgkit pro generování obrázků).
|
|
@ -10,3 +10,7 @@ feeds:
|
||||||
- name: "aktualne.cz"
|
- name: "aktualne.cz"
|
||||||
rss_source: "https://www.aktualne.cz/rss/"
|
rss_source: "https://www.aktualne.cz/rss/"
|
||||||
unique_tag: "guid"
|
unique_tag: "guid"
|
||||||
|
|
||||||
|
- name: "novinky.cz"
|
||||||
|
rss_source: "https://www.novinky.cz/rss"
|
||||||
|
unique_tag: "guid"
|
||||||
|
|
73
headline.py
73
headline.py
|
@ -2,14 +2,71 @@
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
import confuse
|
import confuse
|
||||||
|
import redis
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import imgkit
|
||||||
|
|
||||||
|
from diff_match_patch import diff_match_patch
|
||||||
|
import difflib
|
||||||
|
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Idea block:
|
||||||
|
#
|
||||||
|
# Můžeme zkusit ke každému ID článku přidat taky hash obsahu, s tím že v začátku budeme kontrolovat jenom změnu hashe a až pak obsah stringů.
|
||||||
|
# Ale nevím jestli to bude reálně efektivnější
|
||||||
|
#
|
||||||
|
# Teď budeme kontrolovat jen titulky, ale postupně můžeme přidat i description článku
|
||||||
|
|
||||||
config = confuse.Configuration('headline', __name__)
|
config = confuse.Configuration('headline', __name__)
|
||||||
config.set_file('config.yaml')
|
config.set_file('config.yaml')
|
||||||
|
|
||||||
|
# Shared diff-match-patch instance used to compute and render title diffs.
dmp = diff_match_patch()

# Redis client for the local server (database 0); passed to the
# article-processing functions as their storage backend.
rc = redis.Redis(host='localhost', port=6379, db=0)

# Image rendering options (width given as a string, the same form that
# process_diff hands to imgkit).
image_options = {'width': '450'}
|
||||||
|
|
||||||
|
|
||||||
|
def write_article(article, rc):
    """Store an article's content under its RSS id.

    ``article['content']`` is serialized with ``json.dumps`` and written
    with ``rc.set`` using ``article['rss_id']`` as the key. ``rc`` only
    needs to provide a ``set(key, value)`` method. Nothing is returned.
    """
    serialized = json.dumps(article['content'])
    key = article['rss_id']
    rc.set(key, serialized)
|
||||||
|
|
||||||
|
|
||||||
|
def process_diff(diff, article):
    """Render a diff as an image file named after the article's id.

    Args:
        diff: diff list as returned by ``dmp.diff_main``. It is passed to
            ``dmp.diff_cleanupSemantic`` first, whose return value is not
            used, so the list itself is what gets rendered afterwards.
        article: mapping whose ``'rss_id'`` string determines the output
            file name.

    Returns:
        Always ``True``.
    """
    dmp.diff_cleanupSemantic(diff)
    html_diff = dmp.diff_prettyHtml(diff)
    # The MD5 hex digest only turns the id (often a URL) into a fixed-length
    # file name; the path is relative, with a ".jpg" suffix.
    filename = hashlib.md5(article['rss_id'].encode()).hexdigest() + ".jpg"
    # Use the module-level image_options rather than repeating the literal,
    # so the rendering settings live in one place.
    imgkit.from_string(html_diff, filename, options=image_options)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def process_item(article, rc):
    """Compare an article with its stored version and record any change.

    Args:
        article: mapping with ``'rss_id'`` (the storage key) and
            ``'content'`` (a mapping that contains at least ``'title'``).
        rc: storage client; ``get`` is called here and ``set`` is called
            through ``write_article``.

    Returns:
        Always ``True``, for new, unchanged and changed articles alike.
    """
    # One GET instead of EXISTS followed by GET: a single round trip, and no
    # window in which the key can vanish between the two calls (which would
    # have fed None into json.loads).
    stored = rc.get(article['rss_id'])
    if stored is None:
        # Article is new, just create it and exit
        write_article(article, rc)
        return True

    old = json.loads(stored)
    new = article['content']
    if old['title'] == new['title']:
        # Article is the same. All good!
        return True

    print('Article changed. Fuck the world.')
    diff = dmp.diff_main(old['title'], new['title'])
    process_diff(diff, article)
    # Persist the new version; otherwise the same change would be detected
    # and rendered again on every subsequent run.
    write_article(article, rc)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
article_count = 0
|
||||||
for feed in config['feeds']:
|
for feed in config['feeds']:
|
||||||
rss_source = str(feed['rss_source'])
|
rss_source = str(feed['rss_source'])
|
||||||
unique_tag = str(feed['unique_tag'])
|
unique_tag = str(feed['unique_tag'])
|
||||||
|
@ -21,16 +78,20 @@ for feed in config['feeds']:
|
||||||
rss_id = item[unique_tag]
|
rss_id = item[unique_tag]
|
||||||
title = item['title']
|
title = item['title']
|
||||||
description = item['description']
|
description = item['description']
|
||||||
published = item['published_parsed']
|
published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed'])
|
||||||
link = item['link']
|
link = item['link']
|
||||||
article = {
|
article_data = {
|
||||||
'rss_id' : rss_id,
|
|
||||||
'title' : title,
|
'title' : title,
|
||||||
'description': description,
|
'description': description,
|
||||||
'published' : published,
|
'published' : published,
|
||||||
'link' : link,
|
'link' : link,
|
||||||
'medium' : name
|
'medium' : name
|
||||||
}
|
}
|
||||||
pprint(article)
|
article = {
|
||||||
break
|
'rss_id' : rss_id,
|
||||||
|
'content' : article_data
|
||||||
|
}
|
||||||
|
article_count += 1
|
||||||
|
process_item(article, rc)
|
||||||
|
print("Processed articles:")
|
||||||
|
print(article_count)
|
|
@ -1,2 +1,5 @@
|
||||||
feedparser
|
feedparser
|
||||||
confuse
|
confuse
|
||||||
|
redis
|
||||||
|
diff-match-patch
|
||||||
|
imgkit
|
Loading…
Reference in a new issue