Add duplicate checks for the "pull" command

This checks the received JSON against following columns:
* now
* time
* md5
* com
* sub
* board
and will add them to the database if they don't exist.

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2023-07-29 21:02:18 -07:00
parent e956dc460d
commit 918271926a

View File

@@ -1,4 +1,5 @@
import asyncio import asyncio
from collections import defaultdict
import json import json
import logging import logging
from pathlib import Path from pathlib import Path
@@ -74,8 +75,7 @@ async def pull():
for key, value in zip(labels, [c for c in row if c != "\n"]) for key, value in zip(labels, [c for c in row if c != "\n"])
if key != "time" if key != "time"
} }
post = posts[cols["post"]["data-pid"]] post = defaultdict(lambda: None, posts[cols["post"]["data-pid"]])
cols["post"] = post
if "thumb" in post: if "thumb" in post:
thumb_path = Path(THUMBS_DIR, f"{post['thumb']}s.jpg") thumb_path = Path(THUMBS_DIR, f"{post['thumb']}s.jpg")
@@ -86,7 +86,32 @@ async def pull():
# Try to create post in database # Try to create post in database
try: try:
with db: with db:
post = cols["post"]
# Check the last N bans for the given board. If the following columns are equal:
# * now
# * time
# * md5
# * com
# * sub
# * board
# Then we consider it to be a duplicate.
result = db.execute(
"""
select id, now, time, md5, com, sub, board
from bans
where
now = :now
and time = :time
and md5 = :md5
and com = :com
and sub = :sub
and board = :board
""",
post,
)
if _row := result.fetchone():
log.debug("duplicate found, skipping - %s", _row["id"])
continue
post["action"] = cols["action"] post["action"] = cols["action"]
# post['board'] = cols['board'] # post['board'] = cols['board']
@@ -167,6 +192,6 @@ async def pull():
msg = str(ex) msg = str(ex)
if "UNIQUE" not in msg: if "UNIQUE" not in msg:
log.exception("error inserting data") log.exception("error inserting data")
raise SystemExit() log.info("Continuing")
# Finish off thumbnail jobs # Finish off thumbnail jobs
await asyncio.gather(*download_jobs) await asyncio.gather(*download_jobs)