mirror of
https://git.nolog.cz/NoLog.cz/headline.git
synced 2025-01-31 03:43:35 +01:00
Migrate code style to Black
This commit is contained in:
parent
986dd93550
commit
6cccea04fb
5 changed files with 189 additions and 165 deletions
|
@ -7,7 +7,7 @@ insert_final_newline = true
|
|||
trim_trailing_whitespace = true
|
||||
|
||||
[*.py]
|
||||
indent_style = tab
|
||||
indent_style = space
|
||||
indent_size = 4
|
||||
|
||||
[*.{html,css}]
|
||||
|
|
4
.vscode/settings.json
vendored
4
.vscode/settings.json
vendored
|
@ -1,5 +1,9 @@
|
|||
{
|
||||
"files.associations": {
|
||||
"*.html": "jinja-html"
|
||||
},
|
||||
"[python]": {
|
||||
"editor.defaultFormatter": "ms-python.black-formatter",
|
||||
"editor.formatOnSave": true
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,33 +12,27 @@ db_con = sqlite3.connect("../data/diffs.db")
|
|||
db = db_con.cursor()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def create_article_id(uid: object, feed: object) -> str:
    """Return a stand-in unique article ID derived from an RSS tag and a feed name.

    The two values are converted with ``str()``, concatenated (uid first),
    UTF-8 encoded and hashed with SHA-256.

    Args:
        uid: Value of the feed's unique RSS tag for the article.
        feed: Name of the feed the article belongs to.

    Returns:
        The hexadecimal SHA-256 digest used to reference the article in the
        database.
    """
    id_string = str(uid) + str(feed)
    id_bytes = id_string.encode("utf-8")
    return hashlib.sha256(id_bytes).hexdigest()
|
||||
|
||||
|
||||
def update_diff(diff_id, article_id):
    """Store ``article_id`` on the ``diffs`` row identified by ``diff_id``.

    Uses the module-level cursor ``db`` and commits on ``db_con`` right away,
    so every call is its own transaction.
    """
    sql = "UPDATE diffs SET article_id = ? WHERE diff_id = ?"
    sql_data = (article_id, diff_id)
    db.execute(sql, sql_data)
    db_con.commit()
|
||||
|
||||
|
||||
# Backfill: give every diff row that has no article_id yet a generated one.
#
# The previous filter, NOT 'article_id', negated a *string literal* rather
# than the column, which is true for every row, so each run rewrote all IDs.
# NOTE(review): assumes "missing" means NULL or empty string - confirm against
# how the article_id column was added to the table.
db.execute(
    "SELECT * FROM diffs "
    "WHERE article_id IS NULL OR article_id = '' "
    "ORDER BY diff_id DESC",
)
diffs = db.fetchall()

for diff in diffs:
    # Column 0 is used as the diff_id; columns 1 and 2 are the inputs for the
    # hash. NOTE(review): the table schema is not visible here - verify that
    # these positions hold the intended values.
    article_id = create_article_id(diff[1], diff[2])
    update_diff(diff[0], article_id)
    print(article_id)
|
||||
|
|
161
processor/app.py
161
processor/app.py
|
@ -14,25 +14,26 @@ from diff_match_patch import diff_match_patch
|
|||
#
# Idea block:
#
# We could also attach a content hash to every article ID, so that we first
# check only whether the hash changed and compare the strings afterwards.
# Not sure that would actually be more efficient, though.
#
# For now only titles are checked; the article description can be added later.

CONFIG_FILE = "../data/config.yaml"
REDIS_ARTICLE_EXPIRE_SEC = 604800  # 7 days, expressed in seconds

config = confuse.Configuration("headline", __name__)
config.set_file(CONFIG_FILE)

dmp = diff_match_patch()

rc = redis.Redis(host="redis", port=6379, db=0)

db_con = sqlite3.connect("../data/diffs.db")
db = db_con.cursor()
|
||||
|
||||
db.executescript("""
|
||||
db.executescript(
|
||||
"""
|
||||
PRAGMA journal_mode=WAL;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS diffs (
|
||||
|
@ -74,90 +75,100 @@ CREATE TRIGGER IF NOT EXISTS diffs_aupdate AFTER UPDATE ON diffs
|
|||
INSERT INTO diffs_fts (rowid, title_orig, title_new)
|
||||
VALUES (new.diff_id, new.title_orig, new.title_new);
|
||||
END;
|
||||
""")
|
||||
"""
|
||||
)
|
||||
article_count = 0
|
||||
|
||||
|
||||
def write_article(article, rc):
    """Cache an article's content in Redis under its RSS ID.

    Args:
        article: Mapping with ``"rss_id"`` (used as the key) and ``"content"``
            (stored as a JSON string).
        rc: Redis client; the entry is written with
            ``REDIS_ARTICLE_EXPIRE_SEC`` as its expiry.
    """
    rval = json.dumps(article["content"])
    rc.set(article["rss_id"], rval, ex=REDIS_ARTICLE_EXPIRE_SEC)
|
||||
|
||||
|
||||
def process_diff(old, new, rss_id):
    """Record a headline change in the ``diffs`` table.

    Builds an HTML diff between ``old["title"]`` and ``new["title"]`` with
    diff-match-patch and inserts one row, committing immediately.

    Args:
        old: Previously cached article content; supplies the feed name
            (``"medium"``), ``"link"`` and the original ``"title"``.
        new: Freshly fetched article content; supplies ``"article_id"`` and
            the new ``"title"``.
        rss_id: Not used by this function; kept so existing callers keep
            working.

    Returns:
        Always ``True``.
    """
    diff = dmp.diff_main(old["title"], new["title"])
    dmp.diff_cleanupSemantic(diff)
    html_diff = dmp.diff_prettyHtml(diff)

    sql = (
        "INSERT INTO diffs(article_id, feed_name, article_url, title_orig, "
        "title_new, diff_html, diff_time) "
        "VALUES (?,?,?,?,?,?,datetime('now', 'localtime'))"
    )
    sql_data = (
        new["article_id"],
        old["medium"],
        old["link"],
        old["title"],
        new["title"],
        html_diff,
    )
    db.execute(sql, sql_data)
    db_con.commit()

    return True
|
||||
|
||||
|
||||
def process_item(article, rc):
    """Compare an article with its cached copy and record a title change.

    Args:
        article: Mapping with ``"rss_id"`` (the cache key) and ``"content"``
            (a mapping that contains at least ``"title"``).
        rc: Redis client holding the cached articles.

    Returns:
        Always ``True``.
    """
    # One GET instead of EXISTS followed by GET: the key can expire between
    # the two calls, which would hand None to json.loads.
    cached = rc.get(article["rss_id"])
    if cached is None:
        # Article is new, just create it and exit.
        write_article(article, rc)
        return True

    old = json.loads(cached)
    new = article["content"]
    if old["title"] != new["title"]:
        # Headline changed: store the diff, then refresh the cached copy.
        process_diff(old, new, article["rss_id"])
        write_article(article, rc)

    # Unchanged articles are left alone; their cache entry is not rewritten.
    return True
|
||||
|
||||
|
||||
def create_article_id(uid: object, feed: object) -> str:
    """Return a unique article ID derived from an RSS unique tag and a feed name.

    The two values are converted with ``str()``, concatenated (uid first),
    UTF-8 encoded and hashed with SHA-256.

    Args:
        uid: Value of the feed's unique RSS tag for the article.
        feed: Name of the feed the article belongs to.

    Returns:
        The hexadecimal SHA-256 digest used to reference the article in the
        database.
    """
    id_string = str(uid) + str(feed)
    id_bytes = id_string.encode("utf-8")
    return hashlib.sha256(id_bytes).hexdigest()
|
||||
|
||||
|
||||
# Poll every configured feed and run each entry through process_item().
# Failures are reported and skipped at two levels, so one broken entry or one
# broken feed does not stop the rest of the run.
for feed in config["feeds"]:
    try:
        rss_source = str(feed["rss_source"])
        unique_tag = str(feed["unique_tag"])
        name = str(feed["name"])

        rss = feedparser.parse(rss_source)

        for item in rss["entries"]:
            try:
                rss_id = item[unique_tag]
                title = item["title"]
                article_id = create_article_id(rss_id, name)
                # The description is deliberately not stored for now: it is
                # not needed and it is big.
                published = time.strftime(
                    "%Y:%m:%d %H:%M:%S %Z %z", item["published_parsed"]
                )
                link = item["link"]
                article_data = {
                    "title": title,
                    "article_id": article_id,
                    "published": published,
                    "link": link,
                    "medium": name,
                }
                article = {"rss_id": rss_id, "content": article_data}
                # Counted before processing, so entries whose processing
                # fails are still included in the total.
                article_count += 1
                process_item(article, rc)
            except Exception as e:
                print("Parsing article failed")
                print(e)
                print(item)
    except Exception as e:
        print("Parsing feed failed.")
        print(e)
        print(feed)

print("Processed articles: " + str(article_count))
|
||||
|
|
153
view/app.py
153
view/app.py
|
@ -10,7 +10,7 @@ import re
|
|||
# Paths are relative to the working directory the app is started from.
DATABASE = "../data/diffs.db"
CONFIG_FILE = "../data/config.yaml"

config = confuse.Configuration("headline", __name__)
config.set_file(CONFIG_FILE)
|
||||
|
||||
|
||||
|
@ -18,104 +18,119 @@ app = Flask(__name__)
|
|||
|
||||
|
||||
def get_db():
    """Return the SQLite connection cached on ``flask.g``, opening it on first use.

    A newly opened connection gets ``sqlite3.Row`` as its row factory, so
    result columns can be read by name.
    """
    db = getattr(g, "_database", None)
    if db is None:
        db = g._database = sqlite3.connect(DATABASE)
        db.row_factory = sqlite3.Row
    return db
|
||||
|
||||
|
||||
@app.teardown_appcontext
def close_connection(exception):
    """Close the connection cached on ``flask.g``, if one was opened.

    Registered as an app-context teardown handler; ``exception`` is supplied
    by Flask and is not used.
    """
    db = getattr(g, "_database", None)
    if db is not None:
        db.close()
|
||||
|
||||
|
||||
def websearch_to_fts_query(search: str) -> str:
    """Convert a web-style search string into an FTS query.

    Double-quoted phrases are kept together, every other whitespace-separated
    word becomes its own term, and all terms are quoted and joined with OR:

        'this is "a test"' -> '"this" OR "is" OR "a test"'

    Words that sit between two quoted phrases stay separate terms. An
    unmatched quote character is dropped, and a search that contains no
    terms yields an empty string.
    """
    # Match a complete "phrase" first (group 1), otherwise a bare word
    # (group 2). Consuming the quotes as part of the phrase match keeps the
    # text between one closing quote and the next opening quote from being
    # read as a phrase of its own.
    terms = [
        m.group(1) or m.group(2)
        for m in re.finditer(r'"([^"]+)"|([^\s"]+)', search)
    ]
    return " OR ".join('"' + term + '"' for term in terms)
|
||||
|
||||
|
||||
@app.route("/")
def index():
    """Render the paginated list of diffs, optionally filtered by a search.

    Query parameters read from the request:
        search: Free-text search, converted with ``websearch_to_fts_query``
            and matched against ``diffs_fts``.
        expand_diffs: View option; enabled whenever the parameter is present.
        The page parameter named by flask-paginate's ``get_page_parameter()``.
    """
    db = get_db().cursor()

    search = request.args.get("search", type=str, default="")
    query = websearch_to_fts_query(search) if search else None

    # View options
    expand_diffs = request.args.get("expand_diffs") is not None

    # Total number of matching rows, needed by the paginator.
    if query:
        db.execute("SELECT count(*) FROM diffs_fts(?)", (query,))
    else:
        db.execute("SELECT count(*) FROM diffs")
    diff_count = db.fetchone()[0]

    # flask-paginate
    page = request.args.get(get_page_parameter(), type=int, default=1)
    pagination = Pagination(
        page=page, total=diff_count, record_name="diffs", css_framework="bootstrap5"
    )

    page_skip = pagination.skip
    per_page = pagination.per_page
    if query:
        db.execute(
            "SELECT * FROM diffs JOIN (SELECT rowid FROM diffs_fts(?)) filter ON filter.rowid = diffs.diff_id ORDER BY diff_id DESC LIMIT ? OFFSET ?",
            (query, per_page, page_skip),
        )
    else:
        db.execute(
            "SELECT * FROM diffs ORDER BY diff_id DESC LIMIT ? OFFSET ?",
            (per_page, page_skip),
        )
    diffs = db.fetchall()

    return render_template(
        "./index.html",
        diffs=diffs,
        page=page,
        pagination=pagination,
        diff_count=diff_count,
        search=search,
        expand_diffs=expand_diffs,
    )
|
||||
|
||||
|
||||
@app.route("/article/<path:article_id>")
def article_detail(article_id: str):
    """Render every recorded diff for one article.

    Args:
        article_id: Article identifier taken from the URL path.

    Responds with HTTP 404 when no diff is stored for ``article_id``.
    """
    # Imported locally so this function does not depend on the module's
    # import block, which already pulls the other Flask names it uses.
    from flask import abort

    db = get_db().cursor()
    db.execute("SELECT * FROM diffs WHERE article_id = ?", (article_id,))
    result = db.fetchall()
    if not result:
        # Unknown article: indexing result[0] would raise IndexError and
        # surface as a 500.
        abort(404)

    # The article URL is read from the first returned row.
    article_url = result[0]["article_url"]
    return render_template(
        "article_detail.html",
        article_id=article_id,
        article_url=article_url,
        diffs=result,
    )
|
||||
|
||||
|
||||
@app.route("/about")
def about():
    """Render the static "about" page."""
    return render_template("about.html")
|
||||
|
||||
|
||||
@app.route("/feeds")
def feed_list():
    """Render the list of feeds defined under ``feeds`` in the configuration."""
    # Each config entry is converted to plain strings for the template.
    feeds = [
        {
            "rss_source": str(conf["rss_source"]),
            "unique_tag": str(conf["unique_tag"]),
            "feed_name": str(conf["name"]),
        }
        for conf in config["feeds"]
    ]
    return render_template("feeds.html", feeds=feeds)
|
||||
|
||||
|
||||
@app.route("/robots.txt")
def static_from_root():
    """Serve ``robots.txt`` from the static folder at the site root.

    The file name is the request path without its leading slash.
    """
    # Fall back to "static" when app.static_folder is None.
    return send_from_directory(app.static_folder or "static", request.path[1:])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Listen on all interfaces when the module is run directly.
    app.run(host="0.0.0.0")
|
||||
|
|
Loading…
Reference in a new issue