Add gitignore, fetch script, and postprocess script
Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
188
.gitignore
vendored
Normal file
188
.gitignore
vendored
Normal file
@@ -0,0 +1,188 @@
|
||||
# Data and out directories
|
||||
data/
|
||||
out/
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
### Vim ###
|
||||
# Swap
|
||||
[._]*.s[a-v][a-z]
|
||||
# comment out the following line if you don't need vector files
!*.svg
|
||||
[._]*.sw[a-p]
|
||||
[._]s[a-rt-v][a-z]
|
||||
[._]ss[a-gi-z]
|
||||
[._]sw[a-p]
|
||||
|
||||
# Session
|
||||
Session.vim
|
||||
Sessionx.vim
|
||||
|
||||
# Temporary
|
||||
.netrwhist
|
||||
*~
|
||||
# Auto-generated tag files
|
||||
tags
|
||||
# Persistent undo
|
||||
[._]*.un~
|
||||
|
||||
# End of https://www.toptal.com/developers/gitignore/api/vim,python
|
||||
225
fetch-achewood.py
Executable file
225
fetch-achewood.py
Executable file
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python3
|
||||
# Fetches all Achewood comics and metadata.
|
||||
from pathlib import Path
|
||||
import urllib.request as request
|
||||
import sys
|
||||
import json
|
||||
from html.parser import HTMLParser
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Optional
|
||||
import time
|
||||
|
||||
|
||||
################################################################################
# Config options - probably don't change these
################################################################################

# The number of seconds to wait between consecutive downloads
RATE_LIMIT = 0.5
# The directory to save cached data in
DATA_DIR = Path("data")

################################################################################
# Constants - don't change these
################################################################################

# Base URL of the Achewood site and its archive listing page.
ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"

# Cache locations under DATA_DIR.
IMAGE_DIR = DATA_DIR / "images"             # downloaded comic GIFs
COMIC_DIR = DATA_DIR / "comics"             # downloaded per-comic HTML pages
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"  # ISO date -> alt text mapping
ARCHIVE_CACHE = DATA_DIR / "archive.html"   # cached archive listing page

# Matches archive links of the form "index.php?date=MMDDYYYY".
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)
# NOTE: the original re-assigned RATE_LIMIT = float(RATE_LIMIT) here; that was
# a redundant duplicate definition (the constant is already a float literal)
# and has been removed.
|
||||
|
||||
################################################################################
|
||||
# Classes
|
||||
################################################################################
|
||||
@dataclass
class Date:
    """A comic publication date, stored as zero-padded digit strings."""

    year: str   # four digits, e.g. "2001"
    month: str  # two digits, e.g. "10"
    day: str    # two digits, e.g. "05"

    @property
    def iso(self):
        """Date as YYYYMMDD - used for cache filenames and JSON keys."""
        return self.year + self.month + self.day

    @property
    def us(self):
        """Date as MMDDYYYY - the format achewood.com URLs expect."""
        return self.month + self.day + self.year
|
||||
|
||||
|
||||
@dataclass
class Comic:
    """Metadata and cached artifacts for a single comic, identified by date."""

    # Publication date of this comic.
    date: Date

    @property
    def alt_text(self) -> str:
        """Alt text scraped from the comic's page (None if no titled <img> exists)."""
        parser = ComicAltParser()
        parser.feed(self.get_comic_html())
        return parser.alt_text

    @property
    def url(self) -> str:
        """URL of this comic's page (same page as comic_url)."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        """URL of this comic's GIF image."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache path for this comic's GIF image."""
        return IMAGE_DIR / f"{self.date.iso}.gif"

    @property
    def comic_url(self) -> str:
        """URL of this comic's HTML page."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def comic_path(self) -> Path:
        """Local cache path for this comic's HTML page."""
        return COMIC_DIR / f"{self.date.iso}.html"

    def get_comic_html(self) -> str:
        """Return this comic's page HTML, downloading and caching it on first use."""
        if self.comic_path.is_file():
            return self.comic_path.read_text()
        print(f"Downloading HTML for comic date {self.date.iso}")
        comic_html = download_html(self.comic_url)
        self.comic_path.write_text(comic_html)
        return comic_html

    def get_comic_image(self) -> bytes:
        """Return this comic's image bytes, downloading and caching them on first use."""
        if self.image_path.is_file():
            return self.image_path.read_bytes()
        print(f"Downloading image for comic date {self.date.iso}")
        image = download(self.image_url)
        self.image_path.write_bytes(image)
        return image
|
||||
|
||||
|
||||
class ArchiveParser(HTMLParser):
    """Collects the date of every comic linked from the archive page.

    After feed(), self.dates holds one Date per <a> whose href matches
    LINK_RE ("index.php?date=MMDDYYYY"), in document order.
    """

    def __init__(self):
        super().__init__()
        # Date objects for every matching archive link seen so far.
        self.dates = []

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if tag == "a" and "href" in attr_map:
            match = LINK_RE.fullmatch(attr_map["href"])
            if match is not None:
                self.dates.append(
                    Date(year=match["year"], month=match["month"], day=match["day"])
                )
|
||||
|
||||
|
||||
class ComicAltParser(HTMLParser):
    """Extracts a comic's alt text from the "title" attribute of its <img>.

    After feed(), self.alt_text holds the title of the last titled <img>
    seen, or None if the document contained no titled image.
    """

    def __init__(self):
        super().__init__()
        # Title of the most recent <img title=...>; None until one is seen.
        self.alt_text = None

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if tag != "img" or "title" not in attr_map:
            return
        title = attr_map["title"]
        if self.alt_text is not None:
            # A comic page is expected to have one titled image; warn when
            # a second one overwrites the first.
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", title)
        self.alt_text = title
|
||||
|
||||
|
||||
################################################################################
|
||||
# Utility functions
|
||||
################################################################################
|
||||
|
||||
|
||||
# Timestamp (time.time()) of the most recent download; 0.0 until the first
# download happens. Used to enforce RATE_LIMIT between consecutive requests.
last_download = 0.0


def download(url: str) -> bytes:
    """Fetch url and return the raw response body.

    Sleeps as needed so consecutive calls are at least RATE_LIMIT seconds
    apart, and raises if the server answers with a non-200 status.
    """
    global last_download

    # Wait out the remainder of the rate-limit window, if any.
    wait = (last_download + RATE_LIMIT) - time.time()
    if wait > 0.0:
        time.sleep(wait)

    with request.urlopen(url) as f:
        if f.status != 200:
            raise Exception(f"URL {url} returned non-200 status {f.status}")
        # Stamp the download once the request has succeeded, before the body
        # is read.
        last_download = time.time()
        return f.read()
|
||||
|
||||
|
||||
def download_html(url: str) -> str:
    """Fetch url (rate-limited via download()) and decode the body as UTF-8."""
    raw = download(url)
    return raw.decode("utf-8")
|
||||
|
||||
|
||||
def get_archive_html() -> str:
    """Return the archive listing HTML, preferring the on-disk cache.

    Downloads the archive page and writes it to ARCHIVE_CACHE on a miss.
    """
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        return ARCHIVE_CACHE.read_text()
    print("Downloading archive")
    archive_html = download_html(ARCHIVE_URL)
    ARCHIVE_CACHE.write_text(archive_html)
    return archive_html
|
||||
|
||||
|
||||
################################################################################
# main
################################################################################

# Ensure the cache directory tree (data/, data/images/, data/comics/) exists.
try:
    for directory in (DATA_DIR, IMAGE_DIR, COMIC_DIR):
        directory.mkdir(exist_ok=True)
except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)

# Parse the archive listing (downloaded or cached) to get every comic's date.
parser = ArchiveParser()
parser.feed(get_archive_html())
dates = parser.dates

# Fetch every comic's page (for its alt text) and its image, caching both.
# Alt text is keyed by ISO date (YYYYMMDD) for the JSON dump.
alt_text = {}
for date in dates:
    comic = Comic(date=date)
    alt_text[date.iso] = comic.alt_text
    # Also download the comic image
    comic.get_comic_image()

with open(ALT_TEXT_JSON, 'w') as fp:
    json.dump(alt_text, fp, indent=4)

print("Done")
|
||||
47
postprocess.sh
Executable file
47
postprocess.sh
Executable file
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# Postprocesses all fetched comics from the fetch-comics.py file
#
# Converts each cached GIF in data/images/ to a PNG in out/, appending the
# comic's alt text (from data/alt_text.json) underneath when one exists.
# Requires jq and ImageMagick's convert; expects the fetch script to have run.
set -euo pipefail

HERE="$(dirname "$0")"
DATA_DIR="$HERE/data"
IMAGE_DIR="$DATA_DIR/images"
OUT_DIR="$HERE/out"
alt_json="$DATA_DIR/alt_text.json"

if [[ ! -f "$alt_json" ]]; then
    echo "Could not find $alt_json - have you run the fetch script?"
    exit 1
fi

mkdir -vp "$OUT_DIR"

# Get the list of all comic dates (the JSON object's keys)
readarray -t dates < <(jq -r 'keys[]' < "$alt_json")

for date in "${dates[@]}"; do
    echo -n "Processing comic $date ... "
    image_in="$IMAGE_DIR/$date.gif"
    image_out="$OUT_DIR/$date.png"
    if [[ -f "$image_out" ]]; then
        echo "already exists, skipping"
        continue
    fi
    # Pass the date with --arg instead of interpolating it into the jq
    # program text.
    alt_text="$(jq -r --arg d "$date" '.[$d]' < "$alt_json")"

    # The fetcher stores null when a comic page has no titled image, and
    # `jq -r` prints JSON null as the literal string "null" - treat that
    # (and the empty string) as "no alt text" so images are not captioned
    # with the word "null".
    if [[ -z "$alt_text" || "$alt_text" == "null" ]]; then
        convert "$image_in" \
            -background white -alpha background \
            "$image_out"
    else
        convert "$image_in" \
            -background white -alpha background \
            -gravity center \
            -pointsize 12 -size 360x caption:"$alt_text" \
            -append \
            "$image_out"
    fi

    echo "OK"
done
|
||||
Reference in New Issue
Block a user