Add gitignore, fetch script, and postprocess script

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2022-04-27 15:00:10 -07:00
commit 28faa3e0b1
3 changed files with 460 additions and 0 deletions

225
fetch-achewood.py Executable file
View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
# Fetches all Achewood comics and metadata.
from pathlib import Path
import urllib.request as request
import sys
import json
from html.parser import HTMLParser
import re
from dataclasses import dataclass
from typing import Any, Dict, Optional
import time
################################################################################
# Config options - probably don't change these
################################################################################
# The number of seconds to wait between consecutive downloads
RATE_LIMIT = 0.5
# The directory to save cached data in
DATA_DIR = Path("data")
################################################################################
# Constants - don't change these
################################################################################
# Base site URL and the archive listing page that links every strip.
ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"
# On-disk cache layout, all rooted under DATA_DIR.
IMAGE_DIR = DATA_DIR / "images"
COMIC_DIR = DATA_DIR / "comics"
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"
ARCHIVE_CACHE = DATA_DIR / "archive.html"
# Matches archive links of the form "index.php?date=MMDDYYYY" and captures
# the individual date components by name.
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)
# NOTE: the previous `RATE_LIMIT = float(RATE_LIMIT)` re-cast was removed;
# RATE_LIMIT is already a float literal, so the cast was a no-op.
################################################################################
# Classes
################################################################################
@dataclass
class Date:
    """A calendar date kept as zero-padded string components."""

    # All three fields are strings exactly as captured from the archive
    # links: four-digit year, two-digit month, two-digit day.
    year: str
    month: str
    day: str

    @property
    def iso(self):
        """ISO-style ordering: YYYYMMDD."""
        return "".join((self.year, self.month, self.day))

    @property
    def us(self):
        """US-style ordering: MMDDYYYY (the format achewood.com URLs use)."""
        return "".join((self.month, self.day, self.year))
@dataclass
class Comic:
    """Metadata plus cached HTML/image assets for a single comic strip."""

    # Publication date of the strip; drives all URL and cache-path derivation.
    date: Date

    @property
    def url(self) -> str:
        """Canonical page URL for this strip."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def comic_url(self) -> str:
        """Page URL fetched for the strip's HTML (same value as `url`)."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        """URL of the strip's GIF image."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache location for the strip's image."""
        return IMAGE_DIR / f"{self.date.iso}.gif"

    @property
    def comic_path(self) -> Path:
        """Local cache location for the strip's HTML."""
        return COMIC_DIR / f"{self.date.iso}.html"

    @property
    def alt_text(self) -> str:
        """Extract the title-text from the strip's page (downloads if uncached)."""
        extractor = ComicAltParser()
        extractor.feed(self.get_comic_html())
        return extractor.alt_text

    def get_comic_html(self) -> str:
        """Return the strip's page HTML, caching it on disk after first fetch."""
        if self.comic_path.is_file():
            with open(self.comic_path) as fp:
                return fp.read()
        print(f"Downloading HTML for comic date {self.date.iso}")
        page_html = download_html(self.comic_url)
        with open(self.comic_path, 'w') as fp:
            fp.write(page_html)
        return page_html

    def get_comic_image(self) -> bytes:
        """Return the strip's image bytes, caching them on disk after first fetch."""
        if self.image_path.is_file():
            with open(self.image_path, "rb") as fp:
                return fp.read()
        print(f"Downloading image for comic date {self.date.iso}")
        payload = download(self.image_url)
        with open(self.image_path, 'wb') as fp:
            fp.write(payload)
        return payload
class ArchiveParser(HTMLParser):
    """Collects every comic date linked from the Achewood archive page."""

    def __init__(self):
        # Dates harvested from <a href="index.php?date=..."> links, in
        # document order.
        self.dates = []
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if tag != "a" or "href" not in attributes:
            return
        match = LINK_RE.fullmatch(attributes["href"])
        if match is not None:
            self.dates.append(
                Date(year=match["year"], month=match["month"], day=match["day"])
            )
class ComicAltParser(HTMLParser):
    """Pulls the title-text (alt text) out of a comic page's <img> tag."""

    def __init__(self):
        # Most recently seen img title attribute, or None if none seen yet.
        self.alt_text = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if tag != "img" or "title" not in attributes:
            return
        title = attributes["title"]
        # Pages are expected to have one titled <img>; warn if a second
        # candidate overwrites the first.
        if self.alt_text is not None:
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", title)
        self.alt_text = title
################################################################################
# Utility functions
################################################################################
# Module-level timestamp (time.time()) of the most recent completed download;
# used to space requests RATE_LIMIT seconds apart.
last_download = 0.0
def download(url: str) -> bytes:
    """Fetch *url* and return the raw response body.

    Sleeps as needed so consecutive downloads are at least RATE_LIMIT
    seconds apart; raises on a non-200 HTTP status.
    """
    global last_download
    # Wait out whatever remains of the rate-limit window.
    remaining = (last_download + RATE_LIMIT) - time.time()
    if remaining > 0.0:
        time.sleep(remaining)
    with request.urlopen(url) as response:
        if response.status != 200:
            raise Exception(f"URL {url} returned non-200 status {response.status}")
        # Stamp completion time for the next caller's rate-limit check.
        last_download = time.time()
        return response.read()
def download_html(url: str) -> str:
    """Fetch *url* (rate-limited via `download`) and decode the body as UTF-8."""
    body = download(url)
    return body.decode("utf-8")
def get_archive_html() -> str:
    """Return the archive listing HTML, preferring the on-disk cache."""
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        with open(ARCHIVE_CACHE, "r") as fp:
            return fp.read()
    print("Downloading archive")
    listing = download_html(ARCHIVE_URL)
    # Cache for subsequent runs so we only hit the site once.
    with open(ARCHIVE_CACHE, "w") as fp:
        fp.write(listing)
    return listing
################################################################################
# main
################################################################################
# Make data, comic, and image directory
# (DATA_DIR must be created first: IMAGE_DIR/COMIC_DIR are its children and
# mkdir is called without parents=True.)
try:
    DATA_DIR.mkdir(exist_ok=True)
    IMAGE_DIR.mkdir(exist_ok=True)
    COMIC_DIR.mkdir(exist_ok=True)
except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)
# Fetch all comic links if needed
archive_html = get_archive_html()
parser = ArchiveParser()
parser.feed(archive_html)
# All dates of comics
dates = parser.dates
# Maps ISO date string (YYYYMMDD) -> the strip's title text.
alt_text = {}
for date in dates:
    comic = Comic(date=date)
    # Accessing alt_text downloads (or loads the cached) page HTML.
    alt_text[date.iso] = comic.alt_text
    # Also download the comic image
    comic.get_comic_image()
# Persist all collected title text in one JSON file.
with open(ALT_TEXT_JSON, 'w') as fp:
    json.dump(alt_text, fp, indent=4)
print("Done")