Add gitignore, fetch script, and postprocess script

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
2022-04-27 15:00:10 -07:00
commit 28faa3e0b1
3 changed files with 460 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,188 @@
 # Data and out directories
 data/
 out/
 ### Python ###
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/#use-with-ide
 .pdm.toml
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 ### Vim ###
 # Swap
 [._]*.s[a-v][a-z]
 # comment out if you don't need vector files
 !*.svg
 [._]*.sw[a-p]
 [._]s[a-rt-v][a-z]
 [._]ss[a-gi-z]
 [._]sw[a-p]
 # Session
 Session.vim
 Sessionx.vim
 # Temporary
 .netrwhist
 *~
 # Auto-generated tag files
 tags
 # Persistent undo
 [._]*.un~
 # End of https://www.toptal.com/developers/gitignore/api/vim,python
--- a/fetch-achewood.py
+++ b/fetch-achewood.py
@@ -0,0 +1,225 @@
 #!/usr/bin/env python3
 # Fetches all Achewood comics and metadata.
 from pathlib import Path
 import urllib.request as request
 import sys
 import json
 from html.parser import HTMLParser
 import re
 from dataclasses import dataclass
 from typing import Any, Dict, Optional
 import time
 ################################################################################
 # Config options - probably don't change these
 ################################################################################
 # Minimum number of seconds between the end of one download and the start of the next
 RATE_LIMIT = 0.5
 # Root directory for everything the script caches on disk
 DATA_DIR = Path("data")
 ################################################################################
 # Constants - don't change these
 ################################################################################
 # Site root and the page that lists every comic
 ACHEWOOD_URL = "http://achewood.com"
 ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"
 # Cache layout underneath DATA_DIR
 ARCHIVE_CACHE = DATA_DIR.joinpath("archive.html")
 ALT_TEXT_JSON = DATA_DIR.joinpath("alt_text.json")
 COMIC_DIR = DATA_DIR.joinpath("comics")
 IMAGE_DIR = DATA_DIR.joinpath("images")
 # Matches archive hrefs of the form index.php?date=MMDDYYYY
 LINK_RE = re.compile(r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$")
 # Coerce to float so an integer written in the config section above still works
 RATE_LIMIT = float(RATE_LIMIT)
 ################################################################################
 # Classes
 ################################################################################
@dataclass
class Date:
    """Date of a comic, with each part stored as a string."""
    year: str
    month: str
    day: str
    @property
    def iso(self) -> str:
        """Year, month and day joined with no separator (year first)."""
        return "{0.year}{0.month}{0.day}".format(self)
    @property
    def us(self) -> str:
        """Month, day and year joined with no separator (month first)."""
        return "{0.month}{0.day}{0.year}".format(self)
@dataclass
class Comic:
    """Metadata and cache access for the comic published on a single date.

    Every URL and cache path is derived from `date`. The page HTML and the
    image are downloaded the first time they are requested and read back from
    the cache directories afterwards.
    """
    date: Date
    @property
    def alt_text(self) -> Optional[str]:
        """Alt text extracted from the comic page by ComicAltParser.

        Returns None when the parser finds no alt text. The page HTML is
        (re)parsed on every access.
        """
        parser = ComicAltParser()
        parser.feed(self.get_comic_html())
        return parser.alt_text
    @property
    def url(self) -> str:
        """URL of the comic's HTML page."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"
    @property
    def image_url(self) -> str:
        """URL the comic image is served from."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"
    @property
    def image_path(self) -> Path:
        """Location of the cached image file."""
        return IMAGE_DIR / (self.date.iso + ".gif")
    @property
    def comic_url(self) -> str:
        """URL of the comic's HTML page (same value as `url`)."""
        return self.url
    @property
    def comic_path(self) -> Path:
        """Location of the cached HTML page."""
        return COMIC_DIR / (self.date.iso + ".html")
    def get_comic_html(self) -> str:
        """Return the comic page HTML, downloading and caching it if needed.

        The cache file is always written and read as UTF-8, the encoding the
        page was decoded with, so the result does not depend on the locale.
        """
        if self.comic_path.is_file():
            with open(self.comic_path, encoding="utf-8") as fp:
                return fp.read()
        print(f"Downloading HTML for comic date {self.date.iso}")
        comic_html = download_html(self.comic_url)
        with open(self.comic_path, "w", encoding="utf-8") as fp:
            fp.write(comic_html)
        return comic_html
    def get_comic_image(self) -> bytes:
        """Return the comic image bytes, downloading and caching them if needed."""
        if self.image_path.is_file():
            with open(self.image_path, "rb") as fp:
                return fp.read()
        print(f"Downloading image for comic date {self.date.iso}")
        image = download(self.image_url)
        with open(self.image_path, "wb") as fp:
            fp.write(image)
        return image
 class ArchiveParser(HTMLParser):
    """Achewood archive HTML parser.

    After feed(), `dates` holds one Date for every <a> tag whose href matches
    LINK_RE, in document order.
    """
    def __init__(self):
        self.dates = []
        super().__init__()
    def handle_starttag(self, tag, attrs):
        """Record the date encoded in an archive link; ignore every other tag."""
        if tag != "a":
            return
        # HTMLParser reports a valueless attribute (`<a href>`) as None, which
        # the regex cannot match against, so treat it like a missing href.
        href = dict(attrs).get("href")
        if not href:
            return
        if link := LINK_RE.fullmatch(href):
            self.dates.append(
                Date(year=link["year"], month=link["month"], day=link["day"])
            )
 class ComicAltParser(HTMLParser):
    "Comic page parser that keeps the title attribute of the last titled <img> tag"
    def __init__(self):
        super().__init__()
        # Stays None until an <img> carrying a title attribute is seen
        self.alt_text = None
    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if tag == "img" and "title" in attributes:
            title = attributes["title"]
            previous = self.alt_text
            # More than one titled image is unexpected; report it, then let the
            # most recent one win.
            if previous is not None:
                print("Warning: replacing already-existing alt text")
                print("Previous:")
                print("\t", previous)
                print("New:")
                print("\t", title)
            self.alt_text = title
 ################################################################################
 # Utility functions
 ################################################################################
 # Monotonic-clock reading taken when the most recent download finished.
 # Starts at -inf so the very first download never waits.
 last_download = float("-inf")
 def download(url: str) -> bytes:
    """Fetch `url` and return the raw response body.

    Downloads are rate-limited: the call sleeps until at least RATE_LIMIT
    seconds have passed since the previous download finished.

    Raises:
        Exception: if the response status is not 200.
    """
    global last_download
    # time.monotonic() is used instead of the wall clock so a system clock
    # adjustment can neither skip nor stretch the wait.
    delay = last_download + RATE_LIMIT - time.monotonic()
    if delay > 0.0:
        time.sleep(delay)
    with request.urlopen(url) as f:
        if f.status != 200:
            raise Exception(f"URL {url} returned non-200 status {f.status}")
        body = f.read()
    # Stamp only after the body has been read, so the interval is measured
    # from the moment the download actually completed.
    last_download = time.monotonic()
    return body
 def download_html(url: str) -> str:
    "Downloads `url` (rate-limited via download) and decodes the body as UTF-8 text"
    raw_body = download(url)
    return raw_body.decode("utf-8")
 def get_archive_html() -> str:
    """Return the archive page HTML, using the on-disk cache when it exists.

    On a cache miss the page is downloaded and saved to ARCHIVE_CACHE. The
    cache is written and read as UTF-8, the encoding the page was decoded
    with, so the result does not depend on the locale's default encoding.
    """
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        with open(ARCHIVE_CACHE, "r", encoding="utf-8") as fp:
            return fp.read()
    print("Downloading archive")
    archive_html = download_html(ARCHIVE_URL)
    with open(ARCHIVE_CACHE, "w", encoding="utf-8") as fp:
        fp.write(archive_html)
    return archive_html
 ################################################################################
 # main
 ################################################################################
 # Create the cache directories: the data root first, then its image and comic subdirectories
 try:
    for cache_dir in (DATA_DIR, IMAGE_DIR, COMIC_DIR):
        cache_dir.mkdir(exist_ok=True)
 except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)
 # Load the archive listing (from cache when available) and collect every comic date
 archive_html = get_archive_html()
 parser = ArchiveParser()
 parser.feed(archive_html)
 dates = parser.dates
 # Maps each comic's date (Date.iso form) to its alt text
 alt_text = {}
 for date in dates:
    comic = Comic(date=date)
    alt_text.update({date.iso: comic.alt_text})
    # Called only for its caching side effect; the returned bytes are not used here
    comic.get_comic_image()
 with open(ALT_TEXT_JSON, 'w') as out_file:
    json.dump(alt_text, out_file, indent=4)
 print("Done")
--- a/postprocess.sh
+++ b/postprocess.sh
@@ -0,0 +1,47 @@
 #!/bin/bash
 # Postprocesses all comics fetched by the fetch-achewood.py script.
 #
 # For every date listed in data/alt_text.json, converts data/images/<date>.gif
 # into out/<date>.png on a white background. When the comic has alt text, it
 # is rendered as a caption and appended below the image. Comics whose output
 # file already exists are skipped.
 #
 # Requires: jq, ImageMagick (convert)
 set -euo pipefail
 HERE="$(dirname "$0")"
 DATA_DIR="$HERE/data"
 IMAGE_DIR="$DATA_DIR/images"
 OUT_DIR="$HERE/out"
 alt_json="$DATA_DIR/alt_text.json"
 if [[ ! -f "$alt_json" ]]; then
    echo "Could not find $alt_json - have you run the fetch script?"
    exit 1
 fi
 mkdir -vp "$OUT_DIR"
 # Get the list of all comic dates (the keys of the alt text JSON object)
 readarray -t dates < <(jq -r 'keys[]' < "$alt_json")
 for date in "${dates[@]}"; do
    echo -n "Processing comic $date ... "
    image_in="$IMAGE_DIR/$date.gif"
    image_out="$OUT_DIR/$date.png"
    if [[ -f "$image_out" ]]; then
        echo "already exists, skipping"
        continue
    fi
    # The fetch script stores JSON null for comics without alt text. Plain
    # `jq -r` would print the literal string "null" for those, which would then
    # be rendered as a caption; `// empty` makes jq print nothing instead.
    # The date is passed with --arg rather than spliced into the jq program.
    alt_text="$(jq -r --arg date "$date" '.[$date] // empty' < "$alt_json")"
    if [[ -z "$alt_text" ]]; then
        convert "$image_in" \
            -background white -alpha background \
            "$image_out"
    else
        # NOTE(review): the alt text comes from downloaded pages, and ImageMagick
        # appears to treat a leading '@' and '%' escapes in caption: text
        # specially - confirm against the ImageMagick docs and sanitize if so.
        convert "$image_in" \
            -background white -alpha background \
            -gravity center \
            -pointsize 12 -size 360x caption:"$alt_text" \
            -append \
            "$image_out"
    fi
    echo "OK"
 done