Add duplicate checks for the "pull" command
This checks each post in the received JSON against the following columns: * now * time * md5 * com * sub * board. A post is added to the database only if no existing row matches on all of these columns; otherwise it is skipped as a duplicate. Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
from collections import defaultdict
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
@@ -74,8 +75,7 @@ async def pull():
|
||||
for key, value in zip(labels, [c for c in row if c != "\n"])
|
||||
if key != "time"
|
||||
}
|
||||
post = posts[cols["post"]["data-pid"]]
|
||||
cols["post"] = post
|
||||
post = defaultdict(lambda: None, posts[cols["post"]["data-pid"]])
|
||||
|
||||
if "thumb" in post:
|
||||
thumb_path = Path(THUMBS_DIR, f"{post['thumb']}s.jpg")
|
||||
@@ -86,7 +86,32 @@ async def pull():
|
||||
# Try to create post in database
|
||||
try:
|
||||
with db:
|
||||
post = cols["post"]
|
||||
|
||||
# Look for an existing row in the bans table. If all of the following columns are equal:
|
||||
# * now
|
||||
# * time
|
||||
# * md5
|
||||
# * com
|
||||
# * sub
|
||||
# * board
|
||||
# Then we consider it to be a duplicate.
|
||||
result = db.execute(
|
||||
"""
|
||||
select id, now, time, md5, com, sub, board
|
||||
from bans
|
||||
where
|
||||
now = :now
|
||||
and time = :time
|
||||
and md5 = :md5
|
||||
and com = :com
|
||||
and sub = :sub
|
||||
and board = :board
|
||||
""",
|
||||
post,
|
||||
)
|
||||
if _row := result.fetchone():
|
||||
log.debug("duplicate found, skipping - %s", _row["id"])
|
||||
continue
|
||||
|
||||
post["action"] = cols["action"]
|
||||
# post['board'] = cols['board']
|
||||
@@ -167,6 +192,6 @@ async def pull():
|
||||
msg = str(ex)
|
||||
if "UNIQUE" not in msg:
|
||||
log.exception("error inserting data")
|
||||
raise SystemExit()
|
||||
log.info("Continuing")
|
||||
# Finish off thumbnail jobs
|
||||
await asyncio.gather(*download_jobs)
|
||||
|
||||
Reference in New Issue
Block a user