Add gitignore, fetch script, and postprocess script
Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
225
fetch-achewood.py
Executable file
225
fetch-achewood.py
Executable file
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python3
|
||||
# Fetches all Achewood comics and metadata.
|
||||
from pathlib import Path
|
||||
import urllib.request as request
|
||||
import sys
|
||||
import json
|
||||
from html.parser import HTMLParser
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Optional
|
||||
import time
|
||||
|
||||
|
||||
################################################################################
# Config options - probably don't change these
################################################################################

# The number of seconds to wait between consecutive downloads
RATE_LIMIT = 0.5
# The directory to save cached data in
DATA_DIR = Path("data")


################################################################################
# Constants - don't change these
################################################################################

# Base site URL and the archive listing page that links to every comic.
ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"

# Cache layout under DATA_DIR.
IMAGE_DIR = DATA_DIR / "images"
COMIC_DIR = DATA_DIR / "comics"
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"
ARCHIVE_CACHE = DATA_DIR / "archive.html"

# Matches archive links of the form "index.php?date=MMDDYYYY".
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)
# Coerce in case the config value above was edited to an int.
RATE_LIMIT = float(RATE_LIMIT)
|
||||
|
||||
################################################################################
|
||||
# Classes
|
||||
################################################################################
|
||||
@dataclass
class Date:
    """A comic publication date held as zero-padded strings."""

    year: str
    month: str
    day: str

    @property
    def iso(self):
        """The date as YYYYMMDD, e.g. "20040511"."""
        return self.year + self.month + self.day

    @property
    def us(self):
        """The date as MMDDYYYY, the format achewood.com URLs use."""
        return self.month + self.day + self.year
|
||||
|
||||
|
||||
@dataclass
class Comic:
    """Comic metadata plus cached access to its page HTML and image.

    Fix: cached HTML is now read and written with an explicit UTF-8 encoding.
    The download path decodes the body as UTF-8 (see download_html), but the
    original cache reads/writes used the platform default encoding, which
    corrupts non-ASCII alt text on non-UTF-8 locales.
    """

    # The publication date of this comic.
    date: Date

    @property
    def alt_text(self) -> str:
        """The title ("alt") text parsed from the comic page HTML.

        NOTE: re-parses the (cached) page on every access.
        """
        parser = ComicAltParser()
        parser.feed(self.get_comic_html())
        return parser.alt_text

    @property
    def url(self) -> str:
        """The comic's page URL (identical to comic_url; kept for callers)."""
        # Was a verbatim duplicate of comic_url; delegate instead.
        return self.comic_url

    @property
    def image_url(self) -> str:
        """URL the comic's image is served from."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache path for the comic's image."""
        return IMAGE_DIR / (self.date.iso + ".gif")

    @property
    def comic_url(self) -> str:
        """URL of the comic's HTML page."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def comic_path(self) -> Path:
        """Local cache path for the comic's page HTML."""
        return COMIC_DIR / (self.date.iso + ".html")

    def get_comic_html(self) -> str:
        "Gets the HTML for this comic and caches it"
        if self.comic_path.is_file():
            # Explicit UTF-8: must match download_html's decoding.
            return self.comic_path.read_text(encoding="utf-8")
        print(f"Downloading HTML for comic date {self.date.iso}")
        comic_html = download_html(self.comic_url)
        self.comic_path.write_text(comic_html, encoding="utf-8")
        return comic_html

    def get_comic_image(self) -> bytes:
        "Gets the image bytes for this comic and caches it"
        if self.image_path.is_file():
            return self.image_path.read_bytes()
        print(f"Downloading image for comic date {self.date.iso}")
        image = download(self.image_url)
        self.image_path.write_bytes(image)
        return image
|
||||
|
||||
|
||||
class ArchiveParser(HTMLParser):
    """Collects comic Dates from the links on the Achewood archive page."""

    def __init__(self):
        # Dates discovered so far, in page order.
        self.dates = []
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Record a Date for every <a href="index.php?date=MMDDYYYY"> link."""
        if tag != "a":
            return
        href = dict(attrs).get("href")
        if href is None:
            return
        match = LINK_RE.fullmatch(href)
        if match is not None:
            self.dates.append(
                Date(year=match["year"], month=match["month"], day=match["day"])
            )
|
||||
|
||||
|
||||
class ComicAltParser(HTMLParser):
    """Extracts the title ("alt") text of an <img> tag from a comic page."""

    def __init__(self):
        # Title text of the last titled <img> seen, or None if none yet.
        self.alt_text = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Capture the title attribute of each <img>; warn on duplicates."""
        if tag != "img":
            return
        attr_map = dict(attrs)
        if "title" not in attr_map:
            return
        title = attr_map["title"]
        if self.alt_text is not None:
            # A comic page is expected to have one titled image; flag extras.
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", title)
        self.alt_text = title
|
||||
|
||||
|
||||
################################################################################
|
||||
# Utility functions
|
||||
################################################################################
|
||||
|
||||
|
||||
# global variable keeping track of the time that the last download was completed
last_download = 0.0


def download(url: str) -> bytes:
    """Fetch url and return the response body, rate-limited by RATE_LIMIT.

    Raises Exception if the server responds with a non-200 status.
    """
    global last_download
    # Sleep as needed so consecutive downloads are at least RATE_LIMIT apart.
    wait = (last_download + RATE_LIMIT) - time.time()
    if wait > 0.0:
        time.sleep(wait)

    with request.urlopen(url) as response:
        if response.status != 200:
            raise Exception(f"URL {url} returned non-200 status {response.status}")
        # Record the completion time for the next rate-limit calculation.
        last_download = time.time()
        return response.read()
|
||||
|
||||
|
||||
def download_html(url: str) -> str:
    """Fetch url via download() and decode the body as UTF-8 text."""
    body = download(url)
    return body.decode("utf-8")
|
||||
|
||||
|
||||
def get_archive_html() -> str:
    """Return the archive page HTML, downloading and caching it on first use.

    Fix: the cache file is read and written with an explicit UTF-8 encoding,
    matching download_html's decoding; the original used the platform default
    encoding, which breaks non-ASCII content on non-UTF-8 locales.
    """
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        return ARCHIVE_CACHE.read_text(encoding="utf-8")
    print("Downloading archive")
    archive_html = download_html(ARCHIVE_URL)
    ARCHIVE_CACHE.write_text(archive_html, encoding="utf-8")
    return archive_html
|
||||
|
||||
|
||||
################################################################################
|
||||
# main
|
||||
################################################################################
|
||||
|
||||
# Make data, comic, and image directory (DATA_DIR first so the others'
# parent exists).
try:
    for cache_dir in (DATA_DIR, IMAGE_DIR, COMIC_DIR):
        cache_dir.mkdir(exist_ok=True)
except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)

# Parse the archive page (cached or freshly downloaded) for all comic dates.
parser = ArchiveParser()
parser.feed(get_archive_html())

# Map of ISO date -> alt text, filled in while caching every page and image.
alt_text = {}
for date in parser.dates:
    comic = Comic(date=date)
    alt_text[date.iso] = comic.alt_text
    # Also download the comic image
    comic.get_comic_image()

with open(ALT_TEXT_JSON, 'w') as fp:
    json.dump(alt_text, fp, indent=4)

print("Done")
|
||||
Reference in New Issue
Block a user