From 6cccea04fb813933959c721812dfcc6a6c8230e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ondr=CC=8Cej=20Ny=CC=81vlt?= Date: Mon, 21 Aug 2023 10:38:13 +0200 Subject: [PATCH] Migrate code style to Black --- .editorconfig | 2 +- .vscode/settings.json | 4 + misc/article_id_generator.py | 34 +++----- processor/app.py | 161 +++++++++++++++++++---------------- view/app.py | 153 ++++++++++++++++++--------------- 5 files changed, 189 insertions(+), 165 deletions(-) diff --git a/.editorconfig b/.editorconfig index 6757a0e..1303c1e 100644 --- a/.editorconfig +++ b/.editorconfig @@ -7,7 +7,7 @@ insert_final_newline = true trim_trailing_whitespace = true [*.py] -indent_style = tab +indent_style = space indent_size = 4 [*.{html,css}] diff --git a/.vscode/settings.json b/.vscode/settings.json index c659b65..60edef8 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,9 @@ { "files.associations": { "*.html": "jinja-html" + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true } } diff --git a/misc/article_id_generator.py b/misc/article_id_generator.py index ab9e445..e886ef1 100644 --- a/misc/article_id_generator.py +++ b/misc/article_id_generator.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # -# Create a UID of the article in old articles where we don't have RSS UID and where we can't generate the article_id on the fly. +# Create a UID of the article in old articles where we don't have RSS UID and where we can't generate the article_id on the fly. # It takes a while, but it's a one-shot. # @@ -12,33 +12,27 @@ db_con = sqlite3.connect("../data/diffs.db") db = db_con.cursor() - - - - - def create_article_id(uid, feed): - # Create a fake unique ID from RSS unique tag and feed name to reference the article in database - id_string = str(uid) + str(feed) - id_bytes = id_string.encode('utf-8') - article_id = hashlib.sha256(id_bytes).hexdigest() - return(article_id) + # Create a fake unique ID from RSS unique tag and feed name to reference the article in database + id_string = str(uid) + str(feed) + id_bytes = id_string.encode("utf-8") + article_id = hashlib.sha256(id_bytes).hexdigest() + return article_id def update_diff(diff_id, article_id): - sql = "UPDATE diffs SET article_id = ? WHERE diff_id = ?" - sql_data = (article_id, diff_id) - db.execute(sql, sql_data) - db_con.commit() - + sql = "UPDATE diffs SET article_id = ? WHERE diff_id = ?" + sql_data = (article_id, diff_id) + db.execute(sql, sql_data) + db_con.commit() db.execute( - "SELECT * FROM diffs WHERE NOT 'article_id' ORDER BY diff_id DESC ", + "SELECT * FROM diffs WHERE NOT 'article_id' ORDER BY diff_id DESC ", ) diffs = db.fetchall() for diff in diffs: - article_id = create_article_id(diff[1], diff[2]) - update_diff(diff[0], article_id) - print(article_id) \ No newline at end of file + article_id = create_article_id(diff[1], diff[2]) + update_diff(diff[0], article_id) + print(article_id) diff --git a/processor/app.py b/processor/app.py index c553bb6..292b110 100644 --- a/processor/app.py +++ b/processor/app.py @@ -14,25 +14,26 @@ from diff_match_patch import diff_match_patch # # Idea block: # -# Můžeme zkusit ke každému ID článku přidat taky hash obsahu, s tím že v začátku budeme kontrolovat jenom změnu hashe a až pak obsah stringů. -# Ale nevím jestli to bude reálně efektivnější +# Můžeme zkusit ke každému ID článku přidat taky hash obsahu, s tím že v začátku budeme kontrolovat jenom změnu hashe a až pak obsah stringů. +# Ale nevím jestli to bude reálně efektivnější # -# Teď budeme kontrolovat jen titulky, ale postupně můžeme přidat i description článku +# Teď budeme kontrolovat jen titulky, ale postupně můžeme přidat i description článku CONFIG_FILE = "../data/config.yaml" REDIS_ARTICLE_EXPIRE_SEC = 604800 -config = confuse.Configuration('headline', __name__) +config = confuse.Configuration("headline", __name__) config.set_file(CONFIG_FILE) dmp = diff_match_patch() -rc = redis.Redis(host='redis', port=6379, db=0) +rc = redis.Redis(host="redis", port=6379, db=0) db_con = sqlite3.connect("../data/diffs.db") db = db_con.cursor() -db.executescript(""" +db.executescript( + """ PRAGMA journal_mode=WAL; CREATE TABLE IF NOT EXISTS diffs ( @@ -74,90 +75,100 @@ CREATE TRIGGER IF NOT EXISTS diffs_aupdate AFTER UPDATE ON diffs INSERT INTO diffs_fts (rowid, title_orig, title_new) VALUES (new.diff_id, new.title_orig, new.title_new); END; -""") +""" +) article_count = 0 + def write_article(article, rc): - rval = json.dumps(article['content']) - rc.set(article['rss_id'], rval, ex=REDIS_ARTICLE_EXPIRE_SEC) + rval = json.dumps(article["content"]) + rc.set(article["rss_id"], rval, ex=REDIS_ARTICLE_EXPIRE_SEC) + def process_diff(old, new, rss_id): - diff = dmp.diff_main(old['title'], new['title']) - dmp.diff_cleanupSemantic(diff) - html_diff = dmp.diff_prettyHtml(diff) - # print(old['link']) - # print(diff) + diff = dmp.diff_main(old["title"], new["title"]) + dmp.diff_cleanupSemantic(diff) + html_diff = dmp.diff_prettyHtml(diff) + # print(old['link']) + # print(diff) - sql = "INSERT INTO diffs(article_id, feed_name, article_url, title_orig, title_new, diff_html, diff_time) VALUES (?,?,?,?,?,?,datetime('now', 'localtime'))" - sql_data = (new['article_id'], old['medium'], old['link'], old['title'], new['title'], html_diff) - db.execute(sql, sql_data) - db_con.commit() + sql = "INSERT INTO diffs(article_id, feed_name, article_url, title_orig, title_new, diff_html, diff_time) VALUES (?,?,?,?,?,?,datetime('now', 'localtime'))" + sql_data = ( + new["article_id"], + old["medium"], + old["link"], + old["title"], + new["title"], + html_diff, + ) + db.execute(sql, sql_data) + db_con.commit() - return(True) + return True def process_item(article, rc): - if rc.exists(article['rss_id']): - old = json.loads(rc.get(article['rss_id'])) - new = article['content'] - if old['title'] != new['title']: - # print('Article changed. World is fucked.') - diff = process_diff(old, new, article['rss_id']) - write_article(article, rc) - return(True) - else: - # Article is the same. All good! - return(True) - else: - # Article is new, just create it and exit - write_article(article, rc) + if rc.exists(article["rss_id"]): + old = json.loads(rc.get(article["rss_id"])) + new = article["content"] + if old["title"] != new["title"]: + # print('Article changed. World is fucked.') + diff = process_diff(old, new, article["rss_id"]) + write_article(article, rc) + return True + else: + # Article is the same. All good! + return True + else: + # Article is new, just create it and exit + write_article(article, rc) + def create_article_id(uid, feed): - # Create a unique ID from RSS unique tag and feed name to reference the article in database - id_string = str(uid) + str(feed) - id_bytes = id_string.encode('utf-8') - article_id = hashlib.sha256(id_bytes).hexdigest() - return(article_id) + # Create a unique ID from RSS unique tag and feed name to reference the article in database + id_string = str(uid) + str(feed) + id_bytes = id_string.encode("utf-8") + article_id = hashlib.sha256(id_bytes).hexdigest() + return article_id -for feed in config['feeds']: - try: - rss_source = str(feed['rss_source']) - unique_tag = str(feed['unique_tag']) - name = str(feed['name']) +for feed in config["feeds"]: + try: + rss_source = str(feed["rss_source"]) + unique_tag = str(feed["unique_tag"]) + name = str(feed["name"]) - rss = feedparser.parse(rss_source) + rss = feedparser.parse(rss_source) - for item in rss['entries']: - try: - rss_id = item[unique_tag] - title = item['title'] - article_id = create_article_id(rss_id, name) - #description = item['description'] ## Don't store description for now, as we don't need it and it's big. - published = time.strftime('%Y:%m:%d %H:%M:%S %Z %z', item['published_parsed']) - link = item['link'] - article_data = { - 'title' : title, - 'article_id': article_id, - #'description': description, - 'published' : published, - 'link' : link, - 'medium' : name - } - article = { - 'rss_id' : rss_id, - 'content' : article_data - } - article_count += 1 - process_item(article, rc) - except Exception as e: - print("Parsing article failed") - print(e) - print(item) - except Exception as e: - print("Parsing feed failed.") - print(e) - print(feed) - pass + for item in rss["entries"]: + try: + rss_id = item[unique_tag] + title = item["title"] + article_id = create_article_id(rss_id, name) + # description = item['description'] ## Don't store description for now, as we don't need it and it's big. + published = time.strftime( + "%Y:%m:%d %H:%M:%S %Z %z", item["published_parsed"] + ) + link = item["link"] + article_data = { + "title": title, + "article_id": article_id, + #'description': description, + "published": published, + "link": link, + "medium": name, + } + article = {"rss_id": rss_id, "content": article_data} + article_count += 1 + process_item(article, rc) + except Exception as e: + print("Parsing article failed") + print(e) + print(item) + except Exception as e: + print("Parsing feed failed.") + print(e) + print(feed) + pass print("Processed articles: " + str(article_count)) diff --git a/view/app.py b/view/app.py index f8c5836..28cd3dd 100644 --- a/view/app.py +++ b/view/app.py @@ -10,7 +10,7 @@ import re DATABASE = "../data/diffs.db" CONFIG_FILE = "../data/config.yaml" -config = confuse.Configuration('headline', __name__) +config = confuse.Configuration("headline", __name__) config.set_file(CONFIG_FILE) @@ -18,104 +18,119 @@ app = Flask(__name__) def get_db(): - db = getattr(g, '_database', None) - if db is None: - db = g._database = sqlite3.connect(DATABASE) - db.row_factory = sqlite3.Row - return db + db = getattr(g, "_database", None) + if db is None: + db = g._database = sqlite3.connect(DATABASE) + db.row_factory = sqlite3.Row + return db @app.teardown_appcontext def close_connection(exception): - db = getattr(g, '_database', None) - if db is not None: - db.close() + db = getattr(g, "_database", None) + if db is not None: + db.close() def websearch_to_fts_query(search: str): - """ - Converts web searches into fts queries: - 'this is "a test"' -> '"this" OR "is" OR "a test"' - """ - return ' OR '.join(['"'+m.group(0)+'"' for m in re.finditer(r'(?<=")[^"]+(?=")|[^\s"]+', search)]) + """ + Converts web searches into fts queries: + 'this is "a test"' -> '"this" OR "is" OR "a test"' + """ + return " OR ".join( + [ + '"' + m.group(0) + '"' + for m in re.finditer(r'(?<=")[^"]+(?=")|[^\s"]+', search) + ] + ) -@app.route('/') +@app.route("/") def index(): - db = get_db().cursor() + db = get_db().cursor() - search = request.args.get("search", type=str, default="") - query = websearch_to_fts_query(search) if search else None + search = request.args.get("search", type=str, default="") + query = websearch_to_fts_query(search) if search else None - # View options - expand_diffs = request.args.get("expand_diffs") is not None + # View options + expand_diffs = request.args.get("expand_diffs") is not None - db.execute(f"SELECT count(*) FROM diffs{'_fts(?)' if query else ''}", (query,) if query else ()) + db.execute( + f"SELECT count(*) FROM diffs{'_fts(?)' if query else ''}", + (query,) if query else (), + ) - diff_count = db.fetchall()[0][0] + diff_count = db.fetchall()[0][0] + # flask-paginate + page = request.args.get(get_page_parameter(), type=int, default=1) - #flask-paginate - page = request.args.get(get_page_parameter(), type=int, default=1) + pagination = Pagination( + page=page, total=diff_count, record_name="diffs", css_framework="bootstrap5" + ) - pagination = Pagination(page=page, total=diff_count, record_name='diffs', css_framework='bootstrap5') + page_skip = pagination.skip + per_page = pagination.per_page + if query: + db.execute( + "SELECT * FROM diffs JOIN (SELECT rowid FROM diffs_fts(?)) filter ON filter.rowid = diffs.diff_id ORDER BY diff_id DESC LIMIT ? OFFSET ?", + (query, per_page, page_skip), + ) + else: + db.execute( + "SELECT * FROM diffs ORDER BY diff_id DESC LIMIT ? OFFSET ?", + (per_page, page_skip), + ) + diffs = db.fetchall() - - page_skip = pagination.skip - per_page = pagination.per_page - if query: - db.execute( - "SELECT * FROM diffs JOIN (SELECT rowid FROM diffs_fts(?)) filter ON filter.rowid = diffs.diff_id ORDER BY diff_id DESC LIMIT ? OFFSET ?", - (query,per_page,page_skip) - ) - else: - db.execute( - "SELECT * FROM diffs ORDER BY diff_id DESC LIMIT ? OFFSET ?", - (per_page,page_skip) - ) - diffs = db.fetchall() - - return render_template('./index.html', - diffs=diffs, - page=page, - pagination=pagination, - diff_count = diff_count, - search=search, - expand_diffs=expand_diffs, - ) + return render_template( + "./index.html", + diffs=diffs, + page=page, + pagination=pagination, + diff_count=diff_count, + search=search, + expand_diffs=expand_diffs, + ) @app.route("/article/") def article_detail(article_id: str): - db = get_db().cursor() - db.execute("SELECT * FROM diffs WHERE article_id = ?", (article_id,)) - result = db.fetchall() - article_url = result[0]['article_url'] - # TODO: Handle if nothing is found and return 404 in that case. - return render_template("article_detail.html", article_id=article_id, article_url=article_url, diffs=result ) + db = get_db().cursor() + db.execute("SELECT * FROM diffs WHERE article_id = ?", (article_id,)) + result = db.fetchall() + article_url = result[0]["article_url"] + # TODO: Handle if nothing is found and return 404 in that case. + return render_template( + "article_detail.html", + article_id=article_id, + article_url=article_url, + diffs=result, + ) -@app.route('/about') +@app.route("/about") def about(): - return render_template('about.html') + return render_template("about.html") -@app.route('/feeds') +@app.route("/feeds") def feed_list(): - feeds = [] - for conf in config['feeds']: - feed = { - 'rss_source' : str(conf['rss_source']), - 'unique_tag' : str(conf['unique_tag']), - 'feed_name' : str(conf['name']) - } - feeds.append(feed) - return render_template('feeds.html', feeds=feeds) + feeds = [] + for conf in config["feeds"]: + feed = { + "rss_source": str(conf["rss_source"]), + "unique_tag": str(conf["unique_tag"]), + "feed_name": str(conf["name"]), + } + feeds.append(feed) + return render_template("feeds.html", feeds=feeds) -@app.route('/robots.txt') +@app.route("/robots.txt") def static_from_root(): - return send_from_directory(app.static_folder, request.path[1:]) + return send_from_directory(app.static_folder or "static", request.path[1:]) + if __name__ == "__main__": - app.run(host="0.0.0.0") + app.run(host="0.0.0.0")