From 918271926afa4047a3d6b03de354102860627642 Mon Sep 17 00:00:00 2001 From: Alek Ratzloff Date: Sat, 29 Jul 2023 21:02:18 -0700 Subject: [PATCH] Add duplicate checks for the "pull" command This checks the received JSON against the following columns: * now * time * md5 * com * sub * board and will add the post to the database only if no matching row already exists. Signed-off-by: Alek Ratzloff --- chanbans/pull.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/chanbans/pull.py b/chanbans/pull.py index 7686895..752e9dc 100644 --- a/chanbans/pull.py +++ b/chanbans/pull.py @@ -1,4 +1,5 @@ import asyncio +from collections import defaultdict import json import logging from pathlib import Path @@ -74,8 +75,7 @@ async def pull(): for key, value in zip(labels, [c for c in row if c != "\n"]) if key != "time" } - post = posts[cols["post"]["data-pid"]] - cols["post"] = post + post = defaultdict(lambda: None, posts[cols["post"]["data-pid"]]) if "thumb" in post: thumb_path = Path(THUMBS_DIR, f"{post['thumb']}s.jpg") @@ -86,7 +86,32 @@ async def pull(): # Try to create post in database try: with db: - post = cols["post"] + + # Check the last N bans for the given board. If the following columns are equal: + # * now + # * time + # * md5 + # * com + # * sub + # * board + # Then we consider it to be a duplicate. + result = db.execute( + """ + select id, now, time, md5, com, sub, board + from bans + where + now = :now + and time = :time + and md5 = :md5 + and com = :com + and sub = :sub + and board = :board + """, + post, + ) + if _row := result.fetchone(): + log.debug("duplicate found, skipping - %s", _row["id"]) + continue post["action"] = cols["action"] # post['board'] = cols['board'] @@ -167,6 +192,6 @@ async def pull(): msg = str(ex) if "UNIQUE" not in msg: log.exception("error inserting data") - raise SystemExit() + log.info("Continuing") # Finish off thumbnail jobs await asyncio.gather(*download_jobs)