Add cookies and headers config to pull.py

Hopefully this works around the Cloudflare captcha. Ugh.

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2025-02-10 21:44:45 -08:00
parent 6aea52a29f
commit 77981bfe95

View File

@@ -1,10 +1,10 @@
import asyncio
from collections import defaultdict
import json
import logging
from pathlib import Path
import re
from typing import Optional, Union
from collections import defaultdict
from pathlib import Path
from typing import Any, Optional, Union
import aiohttp
from bs4 import BeautifulSoup as Soup
@@ -14,7 +14,6 @@ from .db import get_db
from .files import file_cache
from .hist import generate_histogram_svg
log = logging.getLogger(__name__)
@@ -22,10 +21,23 @@ BANS_URL = "https://4chan.org/bans"
PREVIEW_RE = re.compile(r"var postPreviews = (.+)")
def get_pull_config() -> dict[str, Any]:
    """Load the optional pull configuration from ``./config.json``.

    The path is relative, so it is resolved against the current working
    directory. Callers in this module read the ``"headers"`` and
    ``"cookies"`` keys from the result.

    Returns:
        The parsed JSON content of the file, or an empty dict when the
        file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    path = Path("./config.json")
    try:
        # Open directly instead of checking exists() first: this avoids a
        # race where the file disappears between the check and the open.
        # JSON is UTF-8 by specification, so do not rely on the locale default.
        with path.open(encoding="utf-8") as fp:
            return json.load(fp)
    except FileNotFoundError:
        log.debug("no %s, using empty config", path)
        return {}
# Fetch BANS_URL and return the response body as text, sending the headers and
# cookies found in get_pull_config().
# The function is wrapped by file_cache(directory=config.CACHE_DIR, suffix=".html");
# presumably that stores the returned HTML on disk - confirm in .files.
@file_cache(directory=config.CACHE_DIR, suffix=".html")
async def get_bans_html() -> str:
# NOTE(review): the next two lines look like the pre-change request (no cookies,
# no headers) kept visible by this diff view; the lines after them appear to be
# their replacement - confirm against the actual file.
async with aiohttp.ClientSession() as session:
async with session.get(BANS_URL) as resp:
# Inside this function the local name `config` shadows the module-level `config`
# that the decorator above references.
config = get_pull_config()
# Missing "headers" / "cookies" keys fall back to empty dicts.
headers: dict[str, str] = config.get("headers", {})
cookies: dict[str, str] = config.get("cookies", {})
# Cookies are set on the session; headers are passed on the individual request.
async with aiohttp.ClientSession(cookies=cookies) as session:
async with session.get(BANS_URL, headers=headers) as resp:
return await resp.text()
@@ -33,14 +45,18 @@ async def get_thumb(thumb_path: Union[str, Path], post: dict) -> Optional[bytes]
if "thumb" not in post:
return None
config = get_pull_config()
headers: dict[str, str] = config.get("headers", {})
cookies: dict[str, str] = config.get("cookies", {})
thumb_path = Path(thumb_path)
if thumb_path.exists():
return thumb_path.read_bytes()
else:
url = f"https://i.4cdn.org/bans/thumb/{post['board']}/{post['thumb']}s.jpg"
log.info("Downloading %s", url)
async with aiohttp.ClientSession() as session:
async with session.get(url) as resp:
async with aiohttp.ClientSession(cookies=cookies) as session:
async with session.get(url, headers=headers) as resp:
thumb_path.parent.mkdir(parents=True, exist_ok=True)
content = await resp.read()
thumb_path.write_bytes(content)