226 lines
6.3 KiB
Python
226 lines
6.3 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
# Fetches all Achewood comics and metadata.
|
||
|
|
from pathlib import Path
|
||
|
|
import urllib.request as request
|
||
|
|
import sys
|
||
|
|
import json
|
||
|
|
from html.parser import HTMLParser
|
||
|
|
import re
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from typing import Any, Dict, Optional
|
||
|
|
import time
|
||
|
|
|
||
|
|
|
||
|
|
################################################################################
# Config options - probably don't change these
################################################################################

# The number of seconds to wait between consecutive downloads (an int is
# fine too; it is normalized to float in the constants section below)
RATE_LIMIT = 0.5

# The directory to save cached data in
DATA_DIR = Path("data")

################################################################################
# Constants - don't change these
################################################################################

# Base URL of the site, and the archive page listing every comic
ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"

# Cache locations, all derived from DATA_DIR
IMAGE_DIR = DATA_DIR / "images"
COMIC_DIR = DATA_DIR / "comics"
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"
ARCHIVE_CACHE = DATA_DIR / "archive.html"

# Matches archive links of the form "index.php?date=MMDDYYYY" and captures
# the month, day, and year components as named groups
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)
# Normalize the user-set rate limit so later arithmetic always sees a float,
# even if the config above is edited to an int
RATE_LIMIT = float(RATE_LIMIT)
|
||
|
|
|
||
|
|
################################################################################
|
||
|
|
# Classes
|
||
|
|
################################################################################
|
||
|
|
@dataclass
class Date:
    """A comic's publication date, held as zero-padded strings."""

    year: str
    month: str
    day: str

    @property
    def iso(self):
        """The date formatted YYYYMMDD (ISO-8601 basic ordering)."""
        return "".join((self.year, self.month, self.day))

    @property
    def us(self):
        """The date formatted MMDDYYYY, as used in achewood.com URLs."""
        return "".join((self.month, self.day, self.year))
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class Comic:
    """Metadata and cached-download accessors for a single comic.

    Attributes:
        date: the comic's publication date
    """

    date: Date

    @property
    def alt_text(self) -> str:
        """The comic image's title ("alt") text, parsed from the comic page."""
        comic_html = self.get_comic_html()
        parser = ComicAltParser()
        parser.feed(comic_html)
        return parser.alt_text

    @property
    def url(self) -> str:
        # Same value as comic_url; kept so existing callers of either name
        # continue to work.
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        """URL of the comic's strip image."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache path for the strip image."""
        return IMAGE_DIR / (self.date.iso + ".gif")

    @property
    def comic_url(self) -> str:
        """URL of the comic's HTML page."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def comic_path(self) -> Path:
        """Local cache path for the comic's HTML page."""
        return COMIC_DIR / (self.date.iso + ".html")

    def get_comic_html(self) -> str:
        """Gets the HTML for this comic and caches it.

        The cache file is read and written as UTF-8 explicitly:
        download_html() decodes the response as UTF-8, so relying on the
        locale-default encoding here could corrupt the round trip on
        platforms where that default is not UTF-8.
        """
        if self.comic_path.is_file():
            with open(self.comic_path, encoding="utf-8") as fp:
                return fp.read()
        else:
            print(f"Downloading HTML for comic date {self.date.iso}")
            comic_html = download_html(self.comic_url)
            with open(self.comic_path, 'w', encoding="utf-8") as fp:
                fp.write(comic_html)
            return comic_html

    def get_comic_image(self) -> bytes:
        """Gets the image bytes for this comic and caches it."""
        if self.image_path.is_file():
            with open(self.image_path, "rb") as fp:
                return fp.read()
        else:
            print(f"Downloading image for comic date {self.date.iso}")
            image = download(self.image_url)
            with open(self.image_path, 'wb') as fp:
                fp.write(image)
            return image
|
||
|
|
|
||
|
|
|
||
|
|
class ArchiveParser(HTMLParser):
    """Collects comic Dates from anchor tags on the archive listing page."""

    def __init__(self):
        super().__init__()
        # All Date objects found so far, in document order.
        self.dates = []

    def handle_starttag(self, tag, attrs):
        if tag != "a":
            return
        href = dict(attrs).get("href")
        if href is None:
            return
        match = LINK_RE.fullmatch(href)
        if match is not None:
            self.dates.append(
                Date(year=match["year"], month=match["month"], day=match["day"])
            )
|
||
|
|
|
||
|
|
|
||
|
|
class ComicAltParser(HTMLParser):
    """Extracts a comic page's title ("alt") text from its <img> tag."""

    def __init__(self):
        super().__init__()
        # The most recently seen <img> title attribute, or None if no
        # titled image has been encountered yet.
        self.alt_text = None

    def handle_starttag(self, tag, attrs):
        if tag != "img":
            return
        attr_map = dict(attrs)
        if "title" not in attr_map:
            return
        title = attr_map["title"]
        if self.alt_text is not None:
            # A page is expected to have a single titled image; warn (but
            # still overwrite) when that assumption breaks.
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", title)
        self.alt_text = title
|
||
|
|
|
||
|
|
|
||
|
|
################################################################################
|
||
|
|
# Utility functions
|
||
|
|
################################################################################
|
||
|
|
|
||
|
|
|
||
|
|
# global variable keeping track of the time that the last download was
# completed (a time.monotonic() timestamp; 0.0 means "no download yet")
last_download = 0.0

def download(url: str) -> bytes:
    """Download *url* and return the raw response body.

    Consecutive calls are spaced at least RATE_LIMIT seconds apart,
    sleeping first if needed.

    Raises:
        Exception: if the server answers with a non-200 status.
    """
    global last_download
    # Sleep if needed to rate-limit all downloads.  time.monotonic() is used
    # instead of time.time() so the interval math is immune to wall-clock
    # adjustments (NTP corrections, DST, manual changes).
    next_download = last_download + RATE_LIMIT
    now = time.monotonic()
    delta = next_download - now
    if delta > 0.0:
        time.sleep(delta)

    with request.urlopen(url) as f:
        # urlopen already raises HTTPError for most error statuses; this
        # check guards any non-200 response that slips through.
        if f.status != 200:
            raise Exception(f"URL {url} returned non-200 status {f.status}")
        # update the download global
        last_download = time.monotonic()
        return f.read()
|
||
|
|
|
||
|
|
|
||
|
|
def download_html(url: str) -> str:
    """Download *url* and decode the response body as UTF-8 text."""
    raw = download(url)
    return raw.decode("utf-8")
|
||
|
|
|
||
|
|
|
||
|
|
def get_archive_html() -> str:
    """Return the archive listing HTML, downloading and caching on a miss.

    The cache is read and written as UTF-8 explicitly to match
    download_html(), which decodes the response as UTF-8; the
    locale-default encoding could otherwise corrupt the round trip on
    platforms where that default is not UTF-8.
    """
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        with open(ARCHIVE_CACHE, "r", encoding="utf-8") as fp:
            archive_html = fp.read()
    else:
        print("Downloading archive")
        archive_html = download_html(ARCHIVE_URL)
        with open(ARCHIVE_CACHE, "w", encoding="utf-8") as fp:
            fp.write(archive_html)
    return archive_html
|
||
|
|
|
||
|
|
|
||
|
|
################################################################################
|
||
|
|
# main
|
||
|
|
################################################################################
|
||
|
|
|
||
|
|
# Make data, comic, and image directory
try:
    # Parent first: IMAGE_DIR and COMIC_DIR live inside DATA_DIR.
    for cache_dir in (DATA_DIR, IMAGE_DIR, COMIC_DIR):
        cache_dir.mkdir(exist_ok=True)
except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)

# Fetch all comic links if needed
archive_html = get_archive_html()
parser = ArchiveParser()
parser.feed(archive_html)

# All dates of comics
dates = parser.dates

# Maps ISO date string -> title text; serialized to ALT_TEXT_JSON below.
alt_text = {}

for date in dates:
    comic = Comic(date=date)
    alt_text[date.iso] = comic.alt_text
    # Also download the comic image
    comic.get_comic_image()

with open(ALT_TEXT_JSON, 'w') as fp:
    json.dump(alt_text, fp, indent=4)

print("Done")
|