Add thumbnail downloading
Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
@@ -14,6 +14,8 @@ from bs4 import BeautifulSoup as Soup
|
||||
BANS_URL = "https://4chan.org/bans"
|
||||
PREVIEW_RE = re.compile(r"var postPreviews = (.+)")
|
||||
DB_PATH = "bans.db"
|
||||
THUMBS_DIR = Path("thumbs")
|
||||
CACHE_DIR = Path("bans")
|
||||
|
||||
|
||||
def get_db():
|
||||
@@ -38,14 +40,14 @@ def file_cache(
|
||||
|
||||
def decorator(func):
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
async def wrapper(*args, **kwargs):
|
||||
d = Path(directory)
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
path = Path(directory, time.strftime(format) + suffix)
|
||||
if path.exists():
|
||||
return path.read_text()
|
||||
else:
|
||||
text = func(*args, **kwargs)
|
||||
text = await func(*args, **kwargs)
|
||||
if deduplicate:
|
||||
# find the most recent file in the path
|
||||
newest_path = newest_in_dir(directory)
|
||||
@@ -64,13 +66,30 @@ def file_cache(
|
||||
return decorator
|
||||
|
||||
|
||||
@file_cache(directory=CACHE_DIR, suffix=".html")
async def get_bans_html() -> str:
    """Fetch the bans page at ``BANS_URL`` and return its HTML text.

    The ``file_cache`` decorator wraps this coroutine: when a cache file for
    the current time slot already exists its text is returned instead, and
    this function is only awaited on a cache miss.

    Raises:
        httpx.HTTPStatusError: if the server answers with a 4xx/5xx status,
            so that an error page is never handed back as if it were the
            bans listing.
    """
    async with httpx.AsyncClient() as c:
        r = await c.get(BANS_URL)
    # Fail here rather than later, when the preview regex finds nothing in an
    # error page.
    r.raise_for_status()
    return r.text
|
||||
|
||||
|
||||
def main():
|
||||
async def get_thumb(thumb_path: Union[str, Path], post: dict) -> Optional[bytes]:
    """Return the thumbnail bytes for *post*, downloading them on a cache miss.

    Args:
        thumb_path: Where the thumbnail is (or will be) stored on disk. May be
            given as a string or a ``Path``.
        post: Post record; the thumbnail URL is built from its ``"board"`` and
            ``"thumb"`` entries.

    Returns:
        The thumbnail bytes, read from ``thumb_path`` when that file already
        exists and otherwise downloaded and saved there first. ``None`` when
        the post has no ``"thumb"`` entry or the download does not answer
        with HTTP 200.
    """
    if "thumb" not in post:
        return None

    # The signature admits plain strings, so normalise before using Path methods.
    thumb_path = Path(thumb_path)
    if thumb_path.exists():
        return thumb_path.read_bytes()

    url = f"https://i.4cdn.org/bans/thumb/{post['board']}/{post['thumb']}s.jpg"
    print("Downloading", url)
    async with httpx.AsyncClient() as c:
        r = await c.get(url)

    # Do not cache an error or redirect body as if it were a JPEG; a later run
    # can retry because nothing is written to disk.
    if r.status_code != 200:
        print("Failed to download", url, "- HTTP status", r.status_code)
        return None

    # Create the directory the file is actually written to.
    thumb_path.parent.mkdir(parents=True, exist_ok=True)
    thumb_path.write_bytes(r.content)
    return r.content
|
||||
|
||||
|
||||
async def main():
|
||||
# Ensure DB tables
|
||||
db = get_db()
|
||||
db.executescript(
|
||||
@@ -81,13 +100,14 @@ def main():
|
||||
board varchar(10),
|
||||
length varchar(10),
|
||||
post text unique,
|
||||
thumb_path text,
|
||||
reason varchar(200)
|
||||
);
|
||||
"""
|
||||
)
|
||||
|
||||
# Get HTML
|
||||
html = get_bans_html()
|
||||
html = await get_bans_html()
|
||||
# Get post JSON
|
||||
m = PREVIEW_RE.search(html)
|
||||
posts = json.loads(m[1][:-1])
|
||||
@@ -100,6 +120,7 @@ def main():
|
||||
|
||||
cur = db.cursor()
|
||||
|
||||
download_jobs = []
|
||||
for i, row in enumerate(rows):
|
||||
# Labels
|
||||
# ['board', 'action', 'length', 'post', 'reason', 'time']
|
||||
@@ -108,16 +129,31 @@ def main():
|
||||
for key, value in zip(labels, [c for c in row if c != "\n"])
|
||||
if key != "time"
|
||||
}
|
||||
cols["post"] = posts[cols["post"]["data-pid"]]
|
||||
post = posts[cols["post"]["data-pid"]]
|
||||
cols["post"] = post
|
||||
|
||||
if 'thumb' in post:
|
||||
thumb_path = Path(THUMBS_DIR, f"{post['thumb']}s.jpg")
|
||||
download_jobs += [get_thumb(thumb_path, post)]
|
||||
|
||||
# Try to create post in database
|
||||
try:
|
||||
with db:
|
||||
curs = db.execute(
|
||||
"insert into bans (action, board, length, post, reason) values(?, ?, ?, ?, ?)",
|
||||
(cols['action'], cols['board'], cols['length'], json.dumps(cols['post']), cols['reason']),
|
||||
"insert into bans (action, board, length, post, thumb_path, reason) values(?, ?, ?, ?, ?)",
|
||||
(
|
||||
cols["action"],
|
||||
cols["board"],
|
||||
cols["length"],
|
||||
json.dumps(cols["post"]),
|
||||
cols["thumb_path"],
|
||||
cols["reason"],
|
||||
),
|
||||
)
|
||||
except:
|
||||
pass
|
||||
# Finish off thumbnail jobs
|
||||
await asyncio.gather(*download_jobs)
|
||||
|
||||
|
||||
main()
|
||||
asyncio.run(main())
|
||||
|
||||
Reference in New Issue
Block a user