mirror of
https://git.nolog.cz/NoLog.cz/headline.git
synced 2025-01-31 11:53:35 +01:00
108 lines
No EOL
2.6 KiB
Python
108 lines
No EOL
2.6 KiB
Python
#!/usr/bin/python3
|
|
|
|
import feedparser
|
|
import confuse
|
|
import redis
|
|
import time
|
|
import json
|
|
import sqlite3
|
|
|
|
from diff_match_patch import diff_match_patch
|
|
|
|
|
|
#
# Idea block:
#
# We could also store a hash of the content alongside each article ID, so that we first check only whether the hash changed and compare the actual strings only after that.
# But I'm not sure whether that would really be more efficient.
#
# For now we only check headlines, but over time we can add the article description as well.
|
|
|
|
# Application configuration for 'headline', with config.yaml added as a source.
config = confuse.Configuration('headline', __name__)
config.set_file('config.yaml')

# Shared diff engine used to compare old and new headlines.
dmp = diff_match_patch()

# Redis holds the last seen content of every article, keyed by its feed ID.
rc = redis.Redis(host='redis', port=6379, db=0)

# SQLite keeps the permanent log of detected headline changes.
# NOTE(review): the path is relative, so it is resolved against the current
# working directory of the process -- confirm how the script is launched.
db_con = sqlite3.connect("../data/diffs.db")
db = db_con.cursor()

# Create the diffs table on first run; a no-op when it already exists.
db.execute("""CREATE TABLE IF NOT EXISTS diffs (
    diff_id INTEGER PRIMARY KEY,
    feed_name TEXT NOT NULL,
    article_url TEXT NOT NULL,
    title_orig TEXT NOT NULL,
    title_new TEXT NOT NULL,
    diff_html TEXT NOT NULL,
    diff_time TEXT
    );""")

# Running total of feed entries handled during this run.
article_count = 0
|
|
|
|
def write_article(article, rc):
    """Store an article's content in Redis as JSON under its feed ID.

    Args:
        article: Mapping with an 'rss_id' key (used as the Redis key) and a
            'content' key (serialized with ``json.dumps``).
        rc: Redis-like client exposing ``set(key, value)``.
    """
    rc.set(article['rss_id'], json.dumps(article['content']))
|
|
|
|
def process_diff(old, new, rss_id):
    """Record a headline change as a new row in the ``diffs`` table.

    Diffs the old title against the new one, renders the result as HTML and
    inserts it through the module-level SQLite cursor, committing right away.

    Args:
        old: Previously stored article content; its 'medium', 'link' and
            'title' keys are read.
        new: Freshly fetched article content; only its 'title' key is read.
        rss_id: Not used by this function; kept for call compatibility.

    Returns:
        Always True.
    """
    title_diff = dmp.diff_main(old['title'], new['title'])
    # Works on the diff list in place; the return value is not needed.
    dmp.diff_cleanupSemantic(title_diff)
    rendered = dmp.diff_prettyHtml(title_diff)

    db.execute(
        "INSERT INTO diffs(feed_name, article_url, title_orig, title_new, diff_html, diff_time) VALUES (?,?,?,?,?,datetime('now', 'localtime'))",
        (old['medium'], old['link'], old['title'], new['title'], rendered),
    )
    db_con.commit()

    return True
|
|
|
|
|
|
def process_item(article, rc):
    """Store a feed article and record a diff when its title has changed.

    The stored copy is fetched with a single ``rc.get`` call. A missing key
    comes back as None, so there is no separate existence check that could
    race with the key disappearing between the two calls.

    Args:
        article: Mapping with an 'rss_id' key and a 'content' mapping that
            contains at least 'title'.
        rc: Redis-like client exposing ``get(key)`` and ``set(key, value)``.

    Returns:
        True when the article was already stored (whether or not its title
        changed); None when the article is new and has just been stored.
    """
    stored = rc.get(article['rss_id'])
    if stored is None:
        # Article is new, just create it and exit.
        write_article(article, rc)
        return None

    old = json.loads(stored)
    new = article['content']
    if old['title'] != new['title']:
        # Title changed: log the diff, then make the new content the baseline.
        process_diff(old, new, article['rss_id'])
        write_article(article, rc)

    # Known article, changed or not.
    return True
|
|
|
|
|
|
|
|
|
|
# Fetch every configured feed and run each of its entries through process_item.
for feed in config['feeds']:
    rss_source = str(feed['rss_source'])
    # Name of the entry field that uniquely identifies an article in this feed.
    unique_tag = str(feed['unique_tag'])
    name = str(feed['name'])

    rss = feedparser.parse(rss_source)

    for item in rss['entries']:
        # Description and publication date are optional feed elements; a
        # missing one must not abort the whole run, so default to ''.
        published_parsed = item.get('published_parsed')
        if published_parsed:
            # NOTE(review): %Z/%z are rendered by time.strftime from the
            # struct_time as given -- confirm the timezone shown is intended.
            published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', published_parsed)
        else:
            published = ''

        article = {
            'rss_id': item[unique_tag],
            'content': {
                'title': item['title'],
                'description': item.get('description', ''),
                'published': published,
                'link': item['link'],
                'medium': name,
            },
        }
        article_count += 1
        process_item(article, rc)

# article_count is an int, so it must be converted before concatenation.
print("Processed articles: " + str(article_count))