diff --git a/chanbans/__main__.py b/chanbans/__main__.py
index 08dfe22..b720b6d 100644
--- a/chanbans/__main__.py
+++ b/chanbans/__main__.py
@@ -14,6 +14,8 @@ from bs4 import BeautifulSoup as Soup
 BANS_URL = "https://4chan.org/bans"
 PREVIEW_RE = re.compile(r"var postPreviews = (.+)")
 DB_PATH = "bans.db"
+THUMBS_DIR = Path("thumbs")
+CACHE_DIR = Path("bans")
 
 
 def get_db():
@@ -38,14 +40,14 @@ def file_cache(
 
     def decorator(func):
         @functools.wraps(func)
-        def wrapper(*args, **kwargs):
+        async def wrapper(*args, **kwargs):
             d = Path(directory)
             d.mkdir(parents=True, exist_ok=True)
             path = Path(directory, time.strftime(format) + suffix)
             if path.exists():
                 return path.read_text()
             else:
-                text = func(*args, **kwargs)
+                text = await func(*args, **kwargs)
                 if deduplicate:
                     # find the most recent file in the path
                     newest_path = newest_in_dir(directory)
@@ -64,13 +66,38 @@ def file_cache(
         return decorator
 
 
-@file_cache(directory="bans", suffix=".html")
-def get_bans_html() -> str:
-    r = httpx.get(BANS_URL)
+@file_cache(directory=CACHE_DIR, suffix=".html")
+async def get_bans_html() -> str:
+    async with httpx.AsyncClient() as c:
+        r = await c.get(BANS_URL)
     return r.text
 
 
-def main():
+async def get_thumb(thumb_path: Union[str, Path], post: dict) -> Optional[bytes]:
+    """Return the thumbnail bytes for *post*, downloading and caching them if needed.
+
+    Returns None when the post has no "thumb" key or the download does not succeed.
+    """
+    if "thumb" not in post:
+        return None
+    # The annotation allows plain strings, so normalise before using Path methods.
+    thumb_path = Path(thumb_path)
+    if thumb_path.exists():
+        return thumb_path.read_bytes()
+    url = f"https://i.4cdn.org/bans/thumb/{post['board']}/{post['thumb']}s.jpg"
+    print("Downloading", url)
+    async with httpx.AsyncClient() as c:
+        r = await c.get(url)
+    if r.status_code != 200:
+        # Do not cache an error response body as if it were a JPEG.
+        return None
+    # Create the directory the file is actually written to.
+    thumb_path.parent.mkdir(parents=True, exist_ok=True)
+    thumb_path.write_bytes(r.content)
+    return r.content
+
+
+async def main():
     # Ensure DB tables
     db = get_db()
     db.executescript(
@@ -81,13 +108,14 @@ def main():
             board varchar(10),
             length varchar(10),
             post text unique,
+            thumb_path text,
             reason varchar(200)
         );
         """
     )
 
     # Get HTML
-    html = get_bans_html()
+    html = await get_bans_html()
     # Get post JSON
     m = PREVIEW_RE.search(html)
     posts = json.loads(m[1][:-1])
@@ -100,6 +120,7 @@ def main():
 
     cur = db.cursor()
 
+    download_jobs = []
     for i, row in enumerate(rows):
         # Labels
         # ['board', 'action', 'length', 'post', 'reason', 'time']
@@ -108,16 +129,35 @@ def main():
             for key, value in zip(labels, [c for c in row if c != "\n"])
             if key != "time"
         }
-        cols["post"] = posts[cols["post"]["data-pid"]]
+        post = posts[cols["post"]["data-pid"]]
+        cols["post"] = post
+
+        # Rows whose post has no "thumb" key get NULL in the thumb_path column.
+        cols["thumb_path"] = None
+        if "thumb" in post:
+            thumb_path = Path(THUMBS_DIR, f"{post['thumb']}s.jpg")
+            # Store the path as text; a Path object is not a bindable SQL parameter.
+            cols["thumb_path"] = str(thumb_path)
+            download_jobs.append(get_thumb(thumb_path, post))
+
         # Try to create post in database
         try:
             with db:
                 curs = db.execute(
-                    "insert into bans (action, board, length, post, reason) values(?, ?, ?, ?, ?)",
-                    (cols['action'], cols['board'], cols['length'], json.dumps(cols['post']), cols['reason']),
+                    "insert into bans (action, board, length, post, thumb_path, reason) values(?, ?, ?, ?, ?, ?)",
+                    (
+                        cols["action"],
+                        cols["board"],
+                        cols["length"],
+                        json.dumps(cols["post"]),
+                        cols["thumb_path"],
+                        cols["reason"],
+                    ),
                 )
         except:
             pass
+    # Finish off thumbnail jobs
+    await asyncio.gather(*download_jobs)
 
 
-main()
+asyncio.run(main())