Add thumbnail downloading
Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
@@ -14,6 +14,8 @@ from bs4 import BeautifulSoup as Soup
|
|||||||
BANS_URL = "https://4chan.org/bans"
|
BANS_URL = "https://4chan.org/bans"
|
||||||
PREVIEW_RE = re.compile(r"var postPreviews = (.+)")
|
PREVIEW_RE = re.compile(r"var postPreviews = (.+)")
|
||||||
DB_PATH = "bans.db"
|
DB_PATH = "bans.db"
|
||||||
|
THUMBS_DIR = Path("thumbs")
|
||||||
|
CACHE_DIR = Path("bans")
|
||||||
|
|
||||||
|
|
||||||
def get_db():
|
def get_db():
|
||||||
@@ -38,14 +40,14 @@ def file_cache(
|
|||||||
|
|
||||||
def decorator(func):
|
def decorator(func):
|
||||||
@functools.wraps(func)
|
@functools.wraps(func)
|
||||||
def wrapper(*args, **kwargs):
|
async def wrapper(*args, **kwargs):
|
||||||
d = Path(directory)
|
d = Path(directory)
|
||||||
d.mkdir(parents=True, exist_ok=True)
|
d.mkdir(parents=True, exist_ok=True)
|
||||||
path = Path(directory, time.strftime(format) + suffix)
|
path = Path(directory, time.strftime(format) + suffix)
|
||||||
if path.exists():
|
if path.exists():
|
||||||
return path.read_text()
|
return path.read_text()
|
||||||
else:
|
else:
|
||||||
text = func(*args, **kwargs)
|
text = await func(*args, **kwargs)
|
||||||
if deduplicate:
|
if deduplicate:
|
||||||
# find the most recent file in the path
|
# find the most recent file in the path
|
||||||
newest_path = newest_in_dir(directory)
|
newest_path = newest_in_dir(directory)
|
||||||
@@ -64,13 +66,30 @@ def file_cache(
|
|||||||
return decorator
|
return decorator
|
||||||
|
|
||||||
|
|
||||||
@file_cache(directory=CACHE_DIR, suffix=".html")
async def get_bans_html() -> str:
    """Fetch the page at ``BANS_URL`` and return its body as text.

    The request is made with a short-lived ``httpx.AsyncClient``; the
    ``file_cache`` decorator wraps this coroutine, so callers must await it.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(BANS_URL)
    body = response.text
    return body
|
||||||
|
|
||||||
|
|
||||||
def main():
|
async def get_thumb(thumb_path: Union[str, Path], post: dict) -> Optional[bytes]:
    """Return the thumbnail bytes for ``post``, downloading them on a cache miss.

    Args:
        thumb_path: Local file used as the on-disk cache for this thumbnail.
            Either a string or a ``Path`` is accepted.
        post: Post record; its ``"thumb"`` and ``"board"`` entries are used to
            build the download URL.

    Returns:
        The image bytes, or ``None`` when the post has no ``"thumb"`` entry or
        the server did not answer with HTTP 200.
    """
    if "thumb" not in post:
        return None

    # The signature allows plain strings, so normalise before using Path methods.
    thumb_path = Path(thumb_path)
    if thumb_path.exists():
        return thumb_path.read_bytes()

    url = f"https://i.4cdn.org/bans/thumb/{post['board']}/{post['thumb']}s.jpg"
    print("Downloading", url)
    async with httpx.AsyncClient() as c:
        r = await c.get(url)

    if r.status_code != 200:
        # Never persist an error body: it would be served from the cache as if
        # it were the image on every later call.
        return None

    # Create the directory the file is actually written to, which may differ
    # from the default thumbnail directory when a caller passes its own path.
    thumb_path.parent.mkdir(parents=True, exist_ok=True)
    thumb_path.write_bytes(r.content)
    return r.content
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
# Ensure DB tables
|
# Ensure DB tables
|
||||||
db = get_db()
|
db = get_db()
|
||||||
db.executescript(
|
db.executescript(
|
||||||
@@ -81,13 +100,14 @@ def main():
|
|||||||
board varchar(10),
|
board varchar(10),
|
||||||
length varchar(10),
|
length varchar(10),
|
||||||
post text unique,
|
post text unique,
|
||||||
|
thumb_path text,
|
||||||
reason varchar(200)
|
reason varchar(200)
|
||||||
);
|
);
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get HTML
|
# Get HTML
|
||||||
html = get_bans_html()
|
html = await get_bans_html()
|
||||||
# Get post JSON
|
# Get post JSON
|
||||||
m = PREVIEW_RE.search(html)
|
m = PREVIEW_RE.search(html)
|
||||||
posts = json.loads(m[1][:-1])
|
posts = json.loads(m[1][:-1])
|
||||||
@@ -100,6 +120,7 @@ def main():
|
|||||||
|
|
||||||
cur = db.cursor()
|
cur = db.cursor()
|
||||||
|
|
||||||
|
download_jobs = []
|
||||||
for i, row in enumerate(rows):
|
for i, row in enumerate(rows):
|
||||||
# Labels
|
# Labels
|
||||||
# ['board', 'action', 'length', 'post', 'reason', 'time']
|
# ['board', 'action', 'length', 'post', 'reason', 'time']
|
||||||
@@ -108,16 +129,31 @@ def main():
|
|||||||
for key, value in zip(labels, [c for c in row if c != "\n"])
|
for key, value in zip(labels, [c for c in row if c != "\n"])
|
||||||
if key != "time"
|
if key != "time"
|
||||||
}
|
}
|
||||||
cols["post"] = posts[cols["post"]["data-pid"]]
|
post = posts[cols["post"]["data-pid"]]
|
||||||
|
cols["post"] = post
|
||||||
|
|
||||||
|
if 'thumb' in post:
|
||||||
|
thumb_path = Path(THUMBS_DIR, f"{post['thumb']}s.jpg")
|
||||||
|
download_jobs += [get_thumb(thumb_path, post)]
|
||||||
|
|
||||||
# Try to create post in database
|
# Try to create post in database
|
||||||
try:
|
try:
|
||||||
with db:
|
with db:
|
||||||
curs = db.execute(
|
curs = db.execute(
|
||||||
"insert into bans (action, board, length, post, reason) values(?, ?, ?, ?, ?)",
|
"insert into bans (action, board, length, post, thumb_path, reason) values(?, ?, ?, ?, ?)",
|
||||||
(cols['action'], cols['board'], cols['length'], json.dumps(cols['post']), cols['reason']),
|
(
|
||||||
|
cols["action"],
|
||||||
|
cols["board"],
|
||||||
|
cols["length"],
|
||||||
|
json.dumps(cols["post"]),
|
||||||
|
cols["thumb_path"],
|
||||||
|
cols["reason"],
|
||||||
|
),
|
||||||
)
|
)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
# Finish off thumbnail jobs
|
||||||
|
await asyncio.gather(*download_jobs)
|
||||||
|
|
||||||
|
|
||||||
main()
|
asyncio.run(main())
|
||||||
|
|||||||
Reference in New Issue
Block a user