Add thumbnail downloading

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2023-02-19 01:03:16 -08:00
parent 478fc187dc
commit 26a17f7bbc

View File

@@ -14,6 +14,8 @@ from bs4 import BeautifulSoup as Soup
# Page requested by get_bans_html().
BANS_URL = "https://4chan.org/bans"
# Captures whatever follows `var postPreviews = ` on that line of the fetched
# HTML; main() drops the final character of the capture and parses it as JSON.
PREVIEW_RE = re.compile(r"var postPreviews = (.+)")
# Database file name -- presumably what get_db() opens; confirm there.
DB_PATH = "bans.db"
# Directory in which downloaded thumbnail images are stored.
THUMBS_DIR = Path("thumbs")
# Directory handed to the file_cache decorator for the cached bans HTML.
CACHE_DIR = Path("bans")
def get_db():
@@ -38,14 +40,14 @@ def file_cache(
def decorator(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
async def wrapper(*args, **kwargs):
d = Path(directory)
d.mkdir(parents=True, exist_ok=True)
path = Path(directory, time.strftime(format) + suffix)
if path.exists():
return path.read_text()
else:
text = func(*args, **kwargs)
text = await func(*args, **kwargs)
if deduplicate:
# find the most recent file in the path
newest_path = newest_in_dir(directory)
@@ -64,13 +66,30 @@ def file_cache(
return decorator
@file_cache(directory="bans", suffix=".html")
def get_bans_html() -> str:
r = httpx.get(BANS_URL)
@file_cache(directory=CACHE_DIR, suffix=".html")
async def get_bans_html() -> str:
    """Request ``BANS_URL`` and return the response body as text.

    The function is wrapped by ``file_cache`` configured with ``CACHE_DIR``
    and a ``.html`` suffix, so callers go through that decorator.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(BANS_URL)
    return response.text
def main():
async def get_thumb(thumb_path: Union[str, Path], post: dict) -> Optional[bytes]:
    """Return the thumbnail bytes for ``post``, downloading them if needed.

    Args:
        thumb_path: Where the thumbnail is (or will be) stored on disk. Both
            ``str`` and ``Path`` are accepted.
        post: Post dictionary. A thumbnail is only looked up when it has a
            ``"thumb"`` key; ``"board"`` is then read to build the URL.

    Returns:
        The image bytes, read from ``thumb_path`` when that file already
        exists and otherwise fetched over HTTP and written to ``thumb_path``.
        ``None`` when the post has no ``"thumb"`` key or the download did not
        answer with HTTP 200.
    """
    if "thumb" not in post:
        return None

    # The signature allows plain strings, so normalise before using the
    # Path-only methods below.
    thumb_path = Path(thumb_path)
    if thumb_path.exists():
        return thumb_path.read_bytes()

    url = f"https://i.4cdn.org/bans/thumb/{post['board']}/{post['thumb']}s.jpg"
    print("Downloading", url)
    async with httpx.AsyncClient() as c:
        r = await c.get(url)

    # Never cache an error body as if it were an image: the existence check
    # above would keep serving it on every later call.
    if r.status_code != 200:
        print("Failed to download", url, r.status_code)
        return None

    # Create the directory the file is actually written into.
    thumb_path.parent.mkdir(parents=True, exist_ok=True)
    thumb_path.write_bytes(r.content)
    return r.content
async def main():
# Ensure DB tables
db = get_db()
db.executescript(
@@ -81,13 +100,14 @@ def main():
board varchar(10),
length varchar(10),
post text unique,
thumb_path text,
reason varchar(200)
);
"""
)
# Get HTML
html = get_bans_html()
html = await get_bans_html()
# Get post JSON
m = PREVIEW_RE.search(html)
posts = json.loads(m[1][:-1])
@@ -100,6 +120,7 @@ def main():
cur = db.cursor()
download_jobs = []
for i, row in enumerate(rows):
# Labels
# ['board', 'action', 'length', 'post', 'reason', 'time']
@@ -108,16 +129,31 @@ def main():
for key, value in zip(labels, [c for c in row if c != "\n"])
if key != "time"
}
cols["post"] = posts[cols["post"]["data-pid"]]
post = posts[cols["post"]["data-pid"]]
cols["post"] = post
if 'thumb' in post:
thumb_path = Path(THUMBS_DIR, f"{post['thumb']}s.jpg")
download_jobs += [get_thumb(thumb_path, post)]
# Try to create post in database
try:
with db:
curs = db.execute(
"insert into bans (action, board, length, post, reason) values(?, ?, ?, ?, ?)",
(cols['action'], cols['board'], cols['length'], json.dumps(cols['post']), cols['reason']),
"insert into bans (action, board, length, post, thumb_path, reason) values(?, ?, ?, ?, ?)",
(
cols["action"],
cols["board"],
cols["length"],
json.dumps(cols["post"]),
cols["thumb_path"],
cols["reason"],
),
)
except:
pass
# Finish off thumbnail jobs
await asyncio.gather(*download_jobs)
main()
# Script entry point: drive the async main() coroutine to completion.
asyncio.run(main())