226 lines
6.3 KiB
Python
226 lines
6.3 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
# Fetches all Achewood comics and metadata.
|
||
|
|
from pathlib import Path
|
||
|
|
import urllib.request as request
|
||
|
|
import sys
|
||
|
|
import json
|
||
|
|
from html.parser import HTMLParser
|
||
|
|
import re
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from typing import Any, Dict, Optional
|
||
|
|
import time
|
||
|
|
|
||
|
|
|
||
|
|
################################################################################
# Config options - probably don't change these
################################################################################

# The number of seconds to wait between consecutive downloads (an int is
# fine too; it is normalized to float in the constants section below)
RATE_LIMIT = 0.5

# The directory to save cached data in
DATA_DIR = Path("data")

################################################################################
# Constants - don't change these
################################################################################

# Base URL of the site, and the archive page listing every comic
ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"

# Cache locations, all derived from DATA_DIR
IMAGE_DIR = DATA_DIR / "images"
COMIC_DIR = DATA_DIR / "comics"
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"
ARCHIVE_CACHE = DATA_DIR / "archive.html"

# Matches archive links of the form "index.php?date=MMDDYYYY" and captures
# the month, day, and year components as named groups
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)
# Normalize the user-set rate limit so later arithmetic always sees a float,
# even if the config above is edited to an int
RATE_LIMIT = float(RATE_LIMIT)
|
||
|
|
|
||
|
|
################################################################################
|
||
|
|
# Classes
|
||
|
|
################################################################################
|
||
|
|
@dataclass
class Date:
    """A comic's publication date, held as zero-padded strings."""

    year: str
    month: str
    day: str

    @property
    def iso(self):
        """The date formatted YYYYMMDD (ISO-8601 basic ordering)."""
        return "".join((self.year, self.month, self.day))

    @property
    def us(self):
        """The date formatted MMDDYYYY, as used in achewood.com URLs."""
        return "".join((self.month, self.day, self.year))
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class Comic:
    """Metadata and cached-download accessors for a single comic.

    Attributes:
        date: the comic's publication date
    """

    date: Date

    @property
    def alt_text(self) -> str:
        """The comic image's title ("alt") text, parsed from the comic page."""
        comic_html = self.get_comic_html()
        parser = ComicAltParser()
        parser.feed(comic_html)
        return parser.alt_text

    @property
    def url(self) -> str:
        # Same value as comic_url; kept so existing callers of either name
        # continue to work.
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        """URL of the comic's strip image."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache path for the strip image."""
        return IMAGE_DIR / (self.date.iso + ".gif")

    @property
    def comic_url(self) -> str:
        """URL of the comic's HTML page."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def comic_path(self) -> Path:
        """Local cache path for the comic's HTML page."""
        return COMIC_DIR / (self.date.iso + ".html")

    def get_comic_html(self) -> str:
        """Gets the HTML for this comic and caches it.

        The cache file is read and written as UTF-8 explicitly:
        download_html() decodes the response as UTF-8, so relying on the
        locale-default encoding here could corrupt the round trip on
        platforms where that default is not UTF-8.
        """
        if self.comic_path.is_file():
            with open(self.comic_path, encoding="utf-8") as fp:
                return fp.read()
        else:
            print(f"Downloading HTML for comic date {self.date.iso}")
            comic_html = download_html(self.comic_url)
            with open(self.comic_path, 'w', encoding="utf-8") as fp:
                fp.write(comic_html)
            return comic_html

    def get_comic_image(self) -> bytes:
        """Gets the image bytes for this comic and caches it."""
        if self.image_path.is_file():
            with open(self.image_path, "rb") as fp:
                return fp.read()
        else:
            print(f"Downloading image for comic date {self.date.iso}")
            image = download(self.image_url)
            with open(self.image_path, 'wb') as fp:
                fp.write(image)
            return image
|
||
|
|
|
||
|
|
|
||
|
|
class ArchiveParser(HTMLParser):
    """Collects comic Dates from anchor tags on the archive listing page."""

    def __init__(self):
        super().__init__()
        # All Date objects found so far, in document order.
        self.dates = []

    def handle_starttag(self, tag, attrs):
        if tag != "a":
            return
        href = dict(attrs).get("href")
        if href is None:
            return
        match = LINK_RE.fullmatch(href)
        if match is not None:
            self.dates.append(
                Date(year=match["year"], month=match["month"], day=match["day"])
            )
|
||
|
|
|
||
|
|
|
||
|
|
class ComicAltParser(HTMLParser):
    """Extracts a comic page's title ("alt") text from its <img> tag."""

    def __init__(self):
        super().__init__()
        # The most recently seen <img> title attribute, or None if no
        # titled image has been encountered yet.
        self.alt_text = None

    def handle_starttag(self, tag, attrs):
        if tag != "img":
            return
        attr_map = dict(attrs)
        if "title" not in attr_map:
            return
        title = attr_map["title"]
        if self.alt_text is not None:
            # A page is expected to have a single titled image; warn (but
            # still overwrite) when that assumption breaks.
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", title)
        self.alt_text = title
|
||
|
|
|
||
|
|
|
||
|
|
################################################################################
|
||
|
|
# Utility functions
|
||
|
|
################################################################################
|
||
|
|
|
||
|
|
|
||
|
|
# global variable keeping track of the time that the last download was
# completed (a time.monotonic() timestamp; 0.0 means "no download yet")
last_download = 0.0

def download(url: str) -> bytes:
    """Download *url* and return the raw response body.

    Consecutive calls are spaced at least RATE_LIMIT seconds apart,
    sleeping first if needed.

    Raises:
        Exception: if the server answers with a non-200 status.
    """
    global last_download
    # Sleep if needed to rate-limit all downloads.  time.monotonic() is used
    # instead of time.time() so the interval math is immune to wall-clock
    # adjustments (NTP corrections, DST, manual changes).
    next_download = last_download + RATE_LIMIT
    now = time.monotonic()
    delta = next_download - now
    if delta > 0.0:
        time.sleep(delta)

    with request.urlopen(url) as f:
        # urlopen already raises HTTPError for most error statuses; this
        # check guards any non-200 response that slips through.
        if f.status != 200:
            raise Exception(f"URL {url} returned non-200 status {f.status}")
        # update the download global
        last_download = time.monotonic()
        return f.read()
|
||
|
|
|
||
|
|
|
||
|
|
def download_html(url: str) -> str:
    """Download *url* and decode the response body as UTF-8 text."""
    raw = download(url)
    return raw.decode("utf-8")
|
||
|
|
|
||
|
|
|
||
|
|
def get_archive_html() -> str:
    """Return the archive listing HTML, downloading and caching on a miss.

    The cache is read and written as UTF-8 explicitly to match
    download_html(), which decodes the response as UTF-8; the
    locale-default encoding could otherwise corrupt the round trip on
    platforms where that default is not UTF-8.
    """
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        with open(ARCHIVE_CACHE, "r", encoding="utf-8") as fp:
            archive_html = fp.read()
    else:
        print("Downloading archive")
        archive_html = download_html(ARCHIVE_URL)
        with open(ARCHIVE_CACHE, "w", encoding="utf-8") as fp:
            fp.write(archive_html)
    return archive_html
|
||
|
|
|
||
|
|
|
||
|
|
################################################################################
|
||
|
|
# main
|
||
|
|
################################################################################
|
||
|
|
|
||
|
|
# Make data, comic, and image directory
try:
    # Parent first: IMAGE_DIR and COMIC_DIR live inside DATA_DIR.
    for cache_dir in (DATA_DIR, IMAGE_DIR, COMIC_DIR):
        cache_dir.mkdir(exist_ok=True)
except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)

# Fetch all comic links if needed
archive_html = get_archive_html()
parser = ArchiveParser()
parser.feed(archive_html)

# All dates of comics
dates = parser.dates

# Maps ISO date string -> title text; serialized to ALT_TEXT_JSON below.
alt_text = {}

for date in dates:
    comic = Comic(date=date)
    alt_text[date.iso] = comic.alt_text
    # Also download the comic image
    comic.get_comic_image()

with open(ALT_TEXT_JSON, 'w') as fp:
    json.dump(alt_text, fp, indent=4)

print("Done")
|