#!/usr/bin/env python3
"""Fetches all Achewood comics and metadata.

Downloads the archive listing, every comic's HTML page, and every comic
image, caching each piece under DATA_DIR so repeated runs only fetch what
is missing.  The alt text scraped from each comic page is written out to
a single JSON file at the end.
"""

from pathlib import Path
import urllib.request as request
import sys
import json
from html.parser import HTMLParser
import re
from dataclasses import dataclass
from typing import Any, Dict, Optional
import time

################################################################################
# Config options - probably don't change these
################################################################################

# The number of seconds to wait between consecutive downloads
RATE_LIMIT = 0.5

# The directory to save cached data in
DATA_DIR = Path("data")

################################################################################
# Constants - don't change these
################################################################################

ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"
IMAGE_DIR = DATA_DIR / "images"
COMIC_DIR = DATA_DIR / "comics"
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"
ARCHIVE_CACHE = DATA_DIR / "archive.html"

# Archive links look like "index.php?date=mmddyyyy" (US date order).
# BUG FIX: the three named groups had lost their names ("(?P\d\d)" is a
# regex syntax error, so re.compile raised at import).  ArchiveParser
# reads link["month"], link["day"], link["year"], so the groups must be
# named, in month-day-year order to match the URL format.
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)

# Coerce in case the config value above is ever edited to an int
RATE_LIMIT = float(RATE_LIMIT)

################################################################################
# Classes
################################################################################


@dataclass
class Date:
    "Simple date class"

    # All three components are zero-padded strings, e.g. "2003", "05", "04"
    year: str
    month: str
    day: str

    @property
    def iso(self):
        "Date as yyyymmdd — sorts chronologically, used for cache filenames"
        return f"{self.year}{self.month}{self.day}"

    @property
    def us(self):
        "Date as mmddyyyy — the format achewood.com uses in its URLs"
        return f"{self.month}{self.day}{self.year}"


@dataclass
class Comic:
    "Comic metadata"

    date: Date

    @property
    def alt_text(self) -> str:
        "Title-text of the comic image, scraped from the comic's HTML page"
        comic_html = self.get_comic_html()
        parser = ComicAltParser()
        parser.feed(comic_html)
        return parser.alt_text

    @property
    def url(self) -> str:
        "Public page URL for this comic"
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        "Direct URL of the comic strip image"
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        "Local cache path for the comic image"
        return IMAGE_DIR / (self.date.iso + ".gif")

    @property
    def comic_url(self) -> str:
        "Same as .url; kept as a separate property for symmetry with comic_path"
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def comic_path(self) -> Path:
        "Local cache path for the comic's HTML page"
        return COMIC_DIR / (self.date.iso + ".html")

    def get_comic_html(self) -> str:
        "Gets the HTML for this comic and caches it"
        if self.comic_path.is_file():
            with open(self.comic_path, encoding="utf-8") as fp:
                return fp.read()
        else:
            print(f"Downloading HTML for comic date {self.date.iso}")
            comic_html = download_html(self.comic_url)
            with open(self.comic_path, "w", encoding="utf-8") as fp:
                fp.write(comic_html)
            return comic_html

    def get_comic_image(self) -> bytes:
        "Gets the image bytes for this comic and caches it"
        if self.image_path.is_file():
            with open(self.image_path, "rb") as fp:
                return fp.read()
        else:
            print(f"Downloading image for comic date {self.date.iso}")
            image = download(self.image_url)
            with open(self.image_path, "wb") as fp:
                fp.write(image)
            return image


class ArchiveParser(HTMLParser):
    "Achewood archive HTML parser"

    def __init__(self):
        # Every comic Date found in the archive page, in document order
        self.dates = []
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag != "a" or "href" not in attrs:
            return
        # Only <a> tags whose href matches the comic-link pattern count
        if link := LINK_RE.fullmatch(attrs["href"]):
            self.dates += [
                Date(year=link["year"], month=link["month"], day=link["day"])
            ]


class ComicAltParser(HTMLParser):
    "Extracts the title (alt-text) of the comic <img> from a comic page"

    def __init__(self):
        # The last <img title="..."> value seen, or None if none yet
        self.alt_text = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag != "img" or "title" not in attrs:
            return
        # A comic page should only have one titled image; warn if not
        if self.alt_text is not None:
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", attrs["title"])
        self.alt_text = attrs["title"]


################################################################################
# Utility functions
################################################################################

# global variable keeping track of the time that the last download was completed
last_download = 0.0


def download(url: str) -> bytes:
    """Downloads url and returns the raw bytes, rate-limited by RATE_LIMIT.

    Raises if the server returns a non-200 status.
    """
    global last_download
    # Sleep if needed to rate-limit all downloads
    next_download = last_download + RATE_LIMIT
    now = time.time()
    delta = next_download - now
    if delta > 0.0:
        time.sleep(delta)
    with request.urlopen(url) as f:
        if f.status != 200:
            raise Exception(f"URL {url} returned non-200 status {f.status}")
        # update the download global
        last_download = time.time()
        return f.read()


def download_html(url: str) -> str:
    "Downloads url and decodes the body as UTF-8 text"
    return download(url).decode("utf-8")


def get_archive_html() -> str:
    "Gets the archive page HTML, using/refreshing the on-disk cache"
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        with open(ARCHIVE_CACHE, "r", encoding="utf-8") as fp:
            archive_html = fp.read()
    else:
        print("Downloading archive")
        archive_html = download_html(ARCHIVE_URL)
        with open(ARCHIVE_CACHE, "w", encoding="utf-8") as fp:
            fp.write(archive_html)
    return archive_html


################################################################################
# main
################################################################################


def main():
    "Fetches everything: archive listing, all comic pages, all images"
    # Make data, comic, and image directory
    try:
        DATA_DIR.mkdir(exist_ok=True)
        IMAGE_DIR.mkdir(exist_ok=True)
        COMIC_DIR.mkdir(exist_ok=True)
    except PermissionError:
        print("ERROR: could not create data, image, or comic cache directory")
        sys.exit(1)

    # Fetch all comic links if needed
    archive_html = get_archive_html()
    parser = ArchiveParser()
    parser.feed(archive_html)

    # All dates of comics
    dates = parser.dates

    alt_text = {}
    for date in dates:
        comic = Comic(date=date)
        alt_text[date.iso] = comic.alt_text
        # Also download the comic image
        comic.get_comic_image()

    with open(ALT_TEXT_JSON, "w") as fp:
        json.dump(alt_text, fp, indent=4)

    print("Done")


# Guarding the entry point makes the module importable (e.g. for testing)
# without kicking off directory creation and a full site crawl.
if __name__ == "__main__":
    main()