Add cookies and headers config to pull.py

Hopefully this works around the Cloudflare CAPTCHA. Ugh.

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2025-02-10 21:44:45 -08:00
parent 6aea52a29f
commit 77981bfe95

View File

@@ -1,10 +1,10 @@
import asyncio import asyncio
from collections import defaultdict
import json import json
import logging import logging
from pathlib import Path
import re import re
from typing import Optional, Union from collections import defaultdict
from pathlib import Path
from typing import Any, Optional, Union
import aiohttp import aiohttp
from bs4 import BeautifulSoup as Soup from bs4 import BeautifulSoup as Soup
@@ -14,7 +14,6 @@ from .db import get_db
from .files import file_cache from .files import file_cache
from .hist import generate_histogram_svg from .hist import generate_histogram_svg
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@@ -22,10 +21,23 @@ BANS_URL = "https://4chan.org/bans"
PREVIEW_RE = re.compile(r"var postPreviews = (.+)") PREVIEW_RE = re.compile(r"var postPreviews = (.+)")
def get_pull_config() -> dict[str, Any]:
    """Load the optional pull configuration from ``./config.json``.

    The path is relative to the current working directory. Callers in this
    module read the ``"headers"`` and ``"cookies"`` keys from the result.

    Returns:
        The decoded JSON document, or an empty dict when the file does not
        exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        OSError: If the file exists but cannot be read.
    """
    path = Path("./config.json")
    try:
        # Decode as UTF-8 explicitly instead of relying on the platform's
        # locale encoding, which varies between systems.
        with open(path, encoding="utf-8") as fp:
            return json.load(fp)
    except FileNotFoundError:
        # A missing config is the normal "nothing configured" case. Opening
        # first and handling the failure avoids an exists()/open() race.
        log.debug("no %s, using empty config", path)
        return {}
@file_cache(directory=config.CACHE_DIR, suffix=".html")
async def get_bans_html() -> str:
    """Request ``BANS_URL`` with a GET and return the response body as text.

    The ``"cookies"`` and ``"headers"`` entries of ``get_pull_config()`` are
    attached to the session and the request respectively; each falls back to
    an empty dict when the key is absent.
    """
    # Named pull_config so it does not shadow the module-level ``config``
    # referenced by the decorator above.
    pull_config = get_pull_config()
    request_cookies: dict[str, str] = pull_config.get("cookies", {})
    request_headers: dict[str, str] = pull_config.get("headers", {})

    async with aiohttp.ClientSession(cookies=request_cookies) as session:
        async with session.get(BANS_URL, headers=request_headers) as resp:
            body = await resp.text()
    return body
@@ -33,14 +45,18 @@ async def get_thumb(thumb_path: Union[str, Path], post: dict) -> Optional[bytes]
if "thumb" not in post: if "thumb" not in post:
return None return None
config = get_pull_config()
headers: dict[str, str] = config.get("headers", {})
cookies: dict[str, str] = config.get("cookies", {})
thumb_path = Path(thumb_path) thumb_path = Path(thumb_path)
if thumb_path.exists(): if thumb_path.exists():
return thumb_path.read_bytes() return thumb_path.read_bytes()
else: else:
url = f"https://i.4cdn.org/bans/thumb/{post['board']}/{post['thumb']}s.jpg" url = f"https://i.4cdn.org/bans/thumb/{post['board']}/{post['thumb']}s.jpg"
log.info("Downloading %s", url) log.info("Downloading %s", url)
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession(cookies=cookies) as session:
async with session.get(url) as resp: async with session.get(url, headers=headers) as resp:
thumb_path.parent.mkdir(parents=True, exist_ok=True) thumb_path.parent.mkdir(parents=True, exist_ok=True)
content = await resp.read() content = await resp.read()
thumb_path.write_bytes(content) thumb_path.write_bytes(content)