Add gitignore, fetch script, and postprocess script

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2022-04-27 15:00:10 -07:00
commit 28faa3e0b1
3 changed files with 460 additions and 0 deletions

225
fetch-achewood.py Executable file
View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
# Fetches all Achewood comics and metadata.
from pathlib import Path
import urllib.request as request
import sys
import json
from html.parser import HTMLParser
import re
from dataclasses import dataclass
from typing import Any, Dict, Optional
import time
################################################################################
# Config options - probably don't change these
################################################################################
# The number of seconds to wait between consecutive downloads
RATE_LIMIT = 0.5
# The directory to save cached data in
DATA_DIR = Path("data")
################################################################################
# Constants - don't change these
################################################################################
# Base site URL and the archive listing page that links every strip.
ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"
# On-disk cache layout, all rooted under DATA_DIR.
IMAGE_DIR = DATA_DIR / "images"
COMIC_DIR = DATA_DIR / "comics"
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"
ARCHIVE_CACHE = DATA_DIR / "archive.html"
# Matches archive links of the form "index.php?date=MMDDYYYY" and captures
# the individual date components by name.
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)
# NOTE: the previous `RATE_LIMIT = float(RATE_LIMIT)` re-cast was removed;
# RATE_LIMIT is already a float literal, so the cast was a no-op.
################################################################################
# Classes
################################################################################
@dataclass
class Date:
    """A calendar date kept as zero-padded string components."""

    # All three fields are strings exactly as captured from the archive
    # links: four-digit year, two-digit month, two-digit day.
    year: str
    month: str
    day: str

    @property
    def iso(self):
        """ISO-style ordering: YYYYMMDD."""
        return "".join((self.year, self.month, self.day))

    @property
    def us(self):
        """US-style ordering: MMDDYYYY (the format achewood.com URLs use)."""
        return "".join((self.month, self.day, self.year))
@dataclass
class Comic:
    """Metadata plus cached HTML/image assets for a single comic strip."""

    # Publication date of the strip; drives all URL and cache-path derivation.
    date: Date

    @property
    def url(self) -> str:
        """Canonical page URL for this strip."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def comic_url(self) -> str:
        """Page URL fetched for the strip's HTML (same value as `url`)."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        """URL of the strip's GIF image."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache location for the strip's image."""
        return IMAGE_DIR / f"{self.date.iso}.gif"

    @property
    def comic_path(self) -> Path:
        """Local cache location for the strip's HTML."""
        return COMIC_DIR / f"{self.date.iso}.html"

    @property
    def alt_text(self) -> str:
        """Extract the title-text from the strip's page (downloads if uncached)."""
        extractor = ComicAltParser()
        extractor.feed(self.get_comic_html())
        return extractor.alt_text

    def get_comic_html(self) -> str:
        """Return the strip's page HTML, caching it on disk after first fetch."""
        if self.comic_path.is_file():
            with open(self.comic_path) as fp:
                return fp.read()
        print(f"Downloading HTML for comic date {self.date.iso}")
        page_html = download_html(self.comic_url)
        with open(self.comic_path, 'w') as fp:
            fp.write(page_html)
        return page_html

    def get_comic_image(self) -> bytes:
        """Return the strip's image bytes, caching them on disk after first fetch."""
        if self.image_path.is_file():
            with open(self.image_path, "rb") as fp:
                return fp.read()
        print(f"Downloading image for comic date {self.date.iso}")
        payload = download(self.image_url)
        with open(self.image_path, 'wb') as fp:
            fp.write(payload)
        return payload
class ArchiveParser(HTMLParser):
    """Collects every comic date linked from the Achewood archive page."""

    def __init__(self):
        # Dates harvested from <a href="index.php?date=..."> links, in
        # document order.
        self.dates = []
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if tag != "a" or "href" not in attributes:
            return
        match = LINK_RE.fullmatch(attributes["href"])
        if match is not None:
            self.dates.append(
                Date(year=match["year"], month=match["month"], day=match["day"])
            )
class ComicAltParser(HTMLParser):
    """Pulls the title-text (alt text) out of a comic page's <img> tag."""

    def __init__(self):
        # Most recently seen img title attribute, or None if none seen yet.
        self.alt_text = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if tag != "img" or "title" not in attributes:
            return
        title = attributes["title"]
        # Pages are expected to have one titled <img>; warn if a second
        # candidate overwrites the first.
        if self.alt_text is not None:
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", title)
        self.alt_text = title
################################################################################
# Utility functions
################################################################################
# Module-level timestamp (time.time()) of the most recent completed download;
# used to space requests RATE_LIMIT seconds apart.
last_download = 0.0
def download(url: str) -> bytes:
    """Fetch *url* and return the raw response body.

    Sleeps as needed so consecutive downloads are at least RATE_LIMIT
    seconds apart; raises on a non-200 HTTP status.
    """
    global last_download
    # Wait out whatever remains of the rate-limit window.
    remaining = (last_download + RATE_LIMIT) - time.time()
    if remaining > 0.0:
        time.sleep(remaining)
    with request.urlopen(url) as response:
        if response.status != 200:
            raise Exception(f"URL {url} returned non-200 status {response.status}")
        # Stamp completion time for the next caller's rate-limit check.
        last_download = time.time()
        return response.read()
def download_html(url: str) -> str:
    """Fetch *url* (rate-limited via `download`) and decode the body as UTF-8."""
    body = download(url)
    return body.decode("utf-8")
def get_archive_html() -> str:
    """Return the archive listing HTML, preferring the on-disk cache."""
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        with open(ARCHIVE_CACHE, "r") as fp:
            return fp.read()
    print("Downloading archive")
    listing = download_html(ARCHIVE_URL)
    # Cache for subsequent runs so we only hit the site once.
    with open(ARCHIVE_CACHE, "w") as fp:
        fp.write(listing)
    return listing
################################################################################
# main
################################################################################
# Make data, comic, and image directory
# (DATA_DIR must be created first: IMAGE_DIR/COMIC_DIR are its children and
# mkdir is called without parents=True.)
try:
    DATA_DIR.mkdir(exist_ok=True)
    IMAGE_DIR.mkdir(exist_ok=True)
    COMIC_DIR.mkdir(exist_ok=True)
except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)
# Fetch all comic links if needed
archive_html = get_archive_html()
parser = ArchiveParser()
parser.feed(archive_html)
# All dates of comics
dates = parser.dates
# Maps ISO date string (YYYYMMDD) -> the strip's title text.
alt_text = {}
for date in dates:
    comic = Comic(date=date)
    # Accessing alt_text downloads (or loads the cached) page HTML.
    alt_text[date.iso] = comic.alt_text
    # Also download the comic image
    comic.get_comic_image()
# Persist all collected title text in one JSON file.
with open(ALT_TEXT_JSON, 'w') as fp:
    json.dump(alt_text, fp, indent=4)
print("Done")