From 28faa3e0b16f1293b8b2d244c0023ab9fa8c6bfa Mon Sep 17 00:00:00 2001 From: Alek Ratzloff Date: Wed, 27 Apr 2022 15:00:10 -0700 Subject: [PATCH] Add gitignore, fetch script, and postprocess script Signed-off-by: Alek Ratzloff --- .gitignore | 188 ++++++++++++++++++++++++++++++++++++++ fetch-achewood.py | 225 ++++++++++++++++++++++++++++++++++++++++++++++ postprocess.sh | 47 ++++++++++ 3 files changed, 460 insertions(+) create mode 100644 .gitignore create mode 100755 fetch-achewood.py create mode 100755 postprocess.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3e78ab0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,188 @@ +# Data and out directories +data/ +out/ + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +*~ +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +# End of https://www.toptal.com/developers/gitignore/api/vim,python diff --git a/fetch-achewood.py b/fetch-achewood.py new file mode 100755 index 0000000..aa840bb --- /dev/null +++ b/fetch-achewood.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# Fetches all Achewood comics and metadata. 
from pathlib import Path
import urllib.request as request
import sys
import json
from html.parser import HTMLParser
import re
from dataclasses import dataclass
from typing import Any, Dict, Optional
import time


################################################################################
# Config options - probably don't change these
################################################################################

# The number of seconds to wait between consecutive downloads
RATE_LIMIT = 0.5
# The directory to save cached data in
DATA_DIR = Path("data")

################################################################################
# Constants - don't change these
################################################################################

ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"

IMAGE_DIR = DATA_DIR / "images"
COMIC_DIR = DATA_DIR / "comics"
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"
ARCHIVE_CACHE = DATA_DIR / "archive.html"
# Archive links look like "index.php?date=MMDDYYYY" (US date order, matching
# Date.us below).  NOTE: the group names had been stripped to bare "(?P" in a
# bad extraction, which is a re.error at import time; ArchiveParser indexes
# link["month"], link["day"], and link["year"], so they are restored here.
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)

################################################################################
# Classes
################################################################################


@dataclass
class Date:
    """Simple date holder; all fields are zero-padded digit strings."""

    year: str
    month: str
    day: str

    @property
    def iso(self) -> str:
        """YYYYMMDD - used for cache file names (sorts chronologically)."""
        return f"{self.year}{self.month}{self.day}"

    @property
    def us(self) -> str:
        """MMDDYYYY - the date format achewood.com uses in its URLs."""
        return f"{self.month}{self.day}{self.year}"


@dataclass
class Comic:
    """Metadata, URLs, and cached resources for a single comic."""

    date: Date

    @property
    def alt_text(self) -> Optional[str]:
        """Title ("alt") text of the comic image, parsed from the comic page.

        Downloads the page HTML on first access (via get_comic_html); None if
        the page has no titled <img> tag.
        """
        parser = ComicAltParser()
        parser.feed(self.get_comic_html())
        return parser.alt_text

    @property
    def url(self) -> str:
        """Public web page URL for this comic."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        """URL of the comic strip image itself."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache path for the comic image."""
        return IMAGE_DIR / (self.date.iso + ".gif")

    @property
    def comic_url(self) -> str:
        """Alias of ``url`` (kept for backward compatibility)."""
        return self.url

    @property
    def comic_path(self) -> Path:
        """Local cache path for the comic page HTML."""
        return COMIC_DIR / (self.date.iso + ".html")

    def get_comic_html(self) -> str:
        """Return the HTML for this comic, downloading and caching it if needed."""
        if self.comic_path.is_file():
            with open(self.comic_path) as fp:
                return fp.read()
        print(f"Downloading HTML for comic date {self.date.iso}")
        comic_html = download_html(self.comic_url)
        with open(self.comic_path, 'w') as fp:
            fp.write(comic_html)
        return comic_html

    def get_comic_image(self) -> bytes:
        """Return the image bytes for this comic, downloading and caching if needed."""
        if self.image_path.is_file():
            with open(self.image_path, "rb") as fp:
                return fp.read()
        print(f"Downloading image for comic date {self.date.iso}")
        image = download(self.image_url)
        with open(self.image_path, 'wb') as fp:
            fp.write(image)
        return image


class ArchiveParser(HTMLParser):
    """Collects the Date of every comic linked from the archive page."""

    def __init__(self):
        self.dates = []
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag != "a" or "href" not in attrs:
            return
        # Only anchors whose href is an "index.php?date=MMDDYYYY" comic link
        if link := LINK_RE.fullmatch(attrs["href"]):
            self.dates.append(
                Date(year=link["year"], month=link["month"], day=link["day"])
            )


class ComicAltParser(HTMLParser):
    """Extracts the title ("alt") text from a comic page's titled img tag."""

    def __init__(self):
        self.alt_text = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag != "img" or "title" not in attrs:
            return
        if self.alt_text is not None:
            # More than one titled img on the page; keep the last one but warn
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", attrs["title"])
        self.alt_text = attrs["title"]


################################################################################
# Utility functions
################################################################################


# Time (epoch seconds) when the most recent download completed; used by
# download() to keep consecutive requests at least RATE_LIMIT apart.
last_download = 0.0


def download(url: str) -> bytes:
    """Download *url* and return the raw bytes, enforcing the global rate limit.

    Sleeps if needed so that downloads are spaced at least RATE_LIMIT seconds
    apart, then updates the module-level ``last_download`` timestamp.

    Raises:
        Exception: if the server answers with a non-200 HTTP status.
    """
    global last_download
    # Sleep if needed to rate-limit all downloads
    delta = (last_download + RATE_LIMIT) - time.time()
    if delta > 0.0:
        time.sleep(delta)

    with request.urlopen(url) as f:
        if f.status != 200:
            raise Exception(f"URL {url} returned non-200 status {f.status}")
        # update the download global
        last_download = time.time()
        return f.read()


def download_html(url: str) -> str:
    """Download *url* and decode the response body as UTF-8 text."""
    return download(url).decode("utf-8")


def get_archive_html() -> str:
    """Return the archive-page HTML, downloading and caching it on first use."""
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        with open(ARCHIVE_CACHE, "r") as fp:
            return fp.read()
    print("Downloading archive")
    archive_html = download_html(ARCHIVE_URL)
    with open(ARCHIVE_CACHE, "w") as fp:
        fp.write(archive_html)
    return archive_html


################################################################################
# main
################################################################################


def main() -> None:
    """Fetch every comic page and image, then write the alt-text JSON index."""
    # Make data, comic, and image directories (parents=True is harmless here
    # and makes each call independent of creation order)
    try:
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        IMAGE_DIR.mkdir(parents=True, exist_ok=True)
        COMIC_DIR.mkdir(parents=True, exist_ok=True)
    except PermissionError as err:
        print(f"ERROR: could not create data, image, or comic cache directory: {err}")
        sys.exit(1)

    # Fetch all comic links if needed
    parser = ArchiveParser()
    parser.feed(get_archive_html())

    # Collect alt text (and cache the image) for every comic in the archive
    alt_text = {}
    for date in parser.dates:
        comic = Comic(date=date)
        alt_text[date.iso] = comic.alt_text
        # Also download the comic image
        comic.get_comic_image()

    with open(ALT_TEXT_JSON, 'w') as fp:
        json.dump(alt_text, fp, indent=4)

    print("Done")


# Guard the fetch loop so importing this module (e.g. for testing) does not
# kick off hundreds of downloads; running the script behaves as before.
if __name__ == "__main__":
    main()
#!/bin/bash
# Postprocesses all comics fetched by fetch-achewood.py: flattens each GIF
# onto a white background and, when present, appends the alt text as a
# caption, writing PNGs into out/.  Requires jq and ImageMagick (convert).
set -euo pipefail

HERE="$(dirname "$0")"
DATA_DIR="$HERE/data"
IMAGE_DIR="$DATA_DIR/images"
OUT_DIR="$HERE/out"
alt_json="$DATA_DIR/alt_text.json"

if [[ ! -f "$alt_json" ]]; then
    echo "Could not find $alt_json - have you run the fetch script?"
    exit 1
fi

mkdir -vp "$OUT_DIR"

# Get the list of all comic dates (the keys of the alt-text JSON)
readarray -t dates < <(jq -r 'keys[]' < "$alt_json")

for date in "${dates[@]}"; do
    echo -n "Processing comic $date ... "
    image_in="$IMAGE_DIR/$date.gif"
    image_out="$OUT_DIR/$date.png"
    if [[ -f "$image_out" ]]; then
        echo "already exists, skipping"
        continue
    fi
    # --arg avoids interpolating $date into the jq program; '// empty' maps a
    # JSON null to the empty string (a plain 'jq -r' would print the literal
    # word "null", so the no-caption branch below could never fire).
    alt_text="$(jq -r --arg d "$date" '.[$d] // empty' < "$alt_json")"

    if [[ -z "$alt_text" ]]; then
        convert "$image_in" \
            -background white -alpha background \
            "$image_out"
    else
        convert "$image_in" \
            -background white -alpha background \
            -gravity center \
            -pointsize 12 -size 360x caption:"$alt_text" \
            -append \
            "$image_out"
    fi

    echo "OK"
done