Add gitignore, fetch script, and postprocess script

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2022-04-27 15:00:10 -07:00
commit 28faa3e0b1
3 changed files with 460 additions and 0 deletions

188
.gitignore vendored Normal file
View File

@@ -0,0 +1,188 @@
# Data and out directories
data/
out/
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Vim ###
# Swap
[._]*.s[a-v][a-z]
# comment out the next line if you don't need vector files
!*.svg
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
Sessionx.vim
# Temporary
.netrwhist
*~
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
# End of https://www.toptal.com/developers/gitignore/api/vim,python

225
fetch-achewood.py Executable file
View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
# Fetches all Achewood comics and metadata.
from pathlib import Path
import urllib.request as request
import sys
import json
from html.parser import HTMLParser
import re
from dataclasses import dataclass
from typing import Any, Dict, Optional
import time
################################################################################
# Config options - probably don't change these
################################################################################
# Minimum delay, in seconds, enforced between two consecutive downloads
RATE_LIMIT = 0.5
# Root directory under which everything fetched by this script is cached
DATA_DIR = Path("data")

################################################################################
# Constants - don't change these
################################################################################
ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = ACHEWOOD_URL + "/list.php"
# Cache locations, all rooted at DATA_DIR
IMAGE_DIR = DATA_DIR.joinpath("images")
COMIC_DIR = DATA_DIR.joinpath("comics")
ALT_TEXT_JSON = DATA_DIR.joinpath("alt_text.json")
ARCHIVE_CACHE = DATA_DIR.joinpath("archive.html")
# Matches a comic link href; the date in the link is laid out as MMDDYYYY
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)
# Coerce to float so an integer value in the config section above still works
RATE_LIMIT = float(RATE_LIMIT)
################################################################################
# Classes
################################################################################
@dataclass
class Date:
    """Date of a comic, held as three separate strings.

    The `iso` and `us` properties simply concatenate the fields, so the
    fields are expected to already be in the form they should appear in
    (the archive parser fills them from the digit groups of a comic link).
    """

    year: str
    month: str
    day: str

    @property
    def iso(self):
        """The date as year, month, day run together (e.g. "20011001")."""
        return "{}{}{}".format(self.year, self.month, self.day)

    @property
    def us(self):
        """The date as month, day, year run together (e.g. "10012001")."""
        return "{}{}{}".format(self.month, self.day, self.year)
@dataclass
class Comic:
    """A single comic, identified by its date.

    Provides the remote URLs and local cache paths derived from the date, and
    fetch-or-load-from-cache accessors for the comic's HTML page and image.
    """

    date: Date

    @property
    def alt_text(self) -> Optional[str]:
        """The `title` text found on the comic page, or None if there is none.

        Loads the comic HTML (from cache or network) on every access.
        """
        parser = ComicAltParser()
        parser.feed(self.get_comic_html())
        return parser.alt_text

    @property
    def url(self) -> str:
        """URL of the comic's HTML page."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        """URL the comic image is downloaded from."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache path of the comic image."""
        return IMAGE_DIR / (self.date.iso + ".gif")

    @property
    def comic_url(self) -> str:
        """URL of the comic's HTML page (same value as `url`)."""
        return self.url

    @property
    def comic_path(self) -> Path:
        """Local cache path of the comic's HTML page."""
        return COMIC_DIR / (self.date.iso + ".html")

    def get_comic_html(self) -> str:
        "Gets the HTML for this comic and caches it"
        # The page was decoded as UTF-8 when downloaded, so the cache file is
        # written and read as UTF-8 explicitly rather than in whatever the
        # platform's locale encoding happens to be.
        if self.comic_path.is_file():
            return self.comic_path.read_text(encoding="utf-8")
        print(f"Downloading HTML for comic date {self.date.iso}")
        comic_html = download_html(self.comic_url)
        self.comic_path.write_text(comic_html, encoding="utf-8")
        return comic_html

    def get_comic_image(self) -> bytes:
        "Gets the image bytes for this comic and caches it"
        if self.image_path.is_file():
            return self.image_path.read_bytes()
        print(f"Downloading image for comic date {self.date.iso}")
        image = download(self.image_url)
        self.image_path.write_bytes(image)
        return image
class ArchiveParser(HTMLParser):
    """HTML parser for the Achewood archive page.

    After feeding it the archive HTML, `dates` holds one `Date` for every
    `<a>` tag whose `href` matches `LINK_RE`, in document order.
    """

    def __init__(self):
        self.dates = []
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if tag == "a" and "href" in attr_map:
            match = LINK_RE.fullmatch(attr_map["href"])
            if match:
                # The regex's named groups are exactly year, month and day
                self.dates.append(
                    Date(
                        year=match.group("year"),
                        month=match.group("month"),
                        day=match.group("day"),
                    )
                )
class ComicAltParser(HTMLParser):
    """HTML parser that records the `title` attribute of `<img>` tags.

    `alt_text` stays None until an `<img>` with a `title` is seen. If more
    than one such tag appears, a warning is printed and the last one wins.
    """

    def __init__(self):
        self.alt_text = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if tag == "img" and "title" in attr_map:
            title = attr_map["title"]
            if self.alt_text is not None:
                print("Warning: replacing already-existing alt text")
                print("Previous:")
                print("\t", self.alt_text)
                print("New:")
                print("\t", title)
            self.alt_text = title
################################################################################
# Utility functions
################################################################################
# Time (as returned by time.time()) at which the most recent download attempt
# finished; 0.0 until the first download has been made.
last_download = 0.0


def download(url: str, timeout: float = 30.0) -> bytes:
    """Downloads `url` and returns the response body, rate-limiting requests.

    Sleeps first if less than RATE_LIMIT seconds have passed since the
    previous download finished.

    Args:
        url: The URL to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the whole run.

    Raises:
        Exception: if the response status is not 200.
        Any error raised by `urllib.request.urlopen` or by reading the body
        is propagated unchanged.
    """
    global last_download
    # Sleep if needed to rate-limit all downloads
    wait = (last_download + RATE_LIMIT) - time.time()
    if wait > 0.0:
        time.sleep(wait)
    try:
        with request.urlopen(url, timeout=timeout) as response:
            if response.status != 200:
                raise Exception(
                    f"URL {url} returned non-200 status {response.status}"
                )
            return response.read()
    finally:
        # Stamp the time only once the body has been read (or the attempt has
        # failed), so the rate limit is measured from the end of the request.
        last_download = time.time()
def download_html(url: str) -> str:
    """Downloads `url` with `download` and decodes the body as UTF-8 text."""
    raw = download(url)
    return raw.decode("utf-8")
def get_archive_html() -> str:
    """Returns the HTML of the archive page, using the on-disk cache if present.

    When ARCHIVE_CACHE does not exist, the page is downloaded from
    ARCHIVE_URL and written to ARCHIVE_CACHE before being returned.
    """
    # The page is decoded as UTF-8 on download, so the cache is read and
    # written as UTF-8 explicitly instead of in the platform's locale encoding.
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        return ARCHIVE_CACHE.read_text(encoding="utf-8")
    print("Downloading archive")
    archive_html = download_html(ARCHIVE_URL)
    ARCHIVE_CACHE.write_text(archive_html, encoding="utf-8")
    return archive_html
################################################################################
# main
################################################################################
# Ensure the data, image, and comic cache directories exist
try:
    for directory in (DATA_DIR, IMAGE_DIR, COMIC_DIR):
        directory.mkdir(exist_ok=True)
except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)

# Load the archive page (cached or downloaded) and collect every comic date
archive_html = get_archive_html()
parser = ArchiveParser()
parser.feed(archive_html)
dates = parser.dates

# Build the date -> alt text mapping; for each comic the page is handled
# first and its image is fetched (and cached) right after
alt_text = {}
for comic in (Comic(date=comic_date) for comic_date in dates):
    alt_text[comic.date.iso] = comic.alt_text
    comic.get_comic_image()

# Save the collected alt text for the postprocessing step
with ALT_TEXT_JSON.open("w") as json_file:
    json.dump(alt_text, json_file, indent=4)
print("Done")

47
postprocess.sh Executable file
View File

@@ -0,0 +1,47 @@
#!/bin/bash
# Postprocesses all comics fetched by the fetch-achewood.py script.
#
# For every date listed in data/alt_text.json, converts data/images/<date>.gif
# into out/<date>.png with a white background. When the comic has alt text, the
# text is rendered as a caption and appended below the image. Comics whose
# output file already exists are skipped.
#
# Requires: jq, ImageMagick (convert)
set -euo pipefail

HERE="$(dirname "$0")"
DATA_DIR="$HERE/data"
IMAGE_DIR="$DATA_DIR/images"
OUT_DIR="$HERE/out"
alt_json="$DATA_DIR/alt_text.json"

# Fail early, with a clear message, if a required tool is missing
for tool in jq convert; do
  if ! command -v "$tool" > /dev/null; then
    echo "Could not find required tool: $tool" >&2
    exit 1
  fi
done

if [[ ! -f "$alt_json" ]]; then
  echo "Could not find $alt_json - have you run the fetch script?" >&2
  exit 1
fi

mkdir -vp "$OUT_DIR"

# Get the list of all comic dates. jq runs in a plain command substitution
# (not a process substitution) so that a jq failure aborts the script.
date_list="$(jq -r 'keys[]' "$alt_json")"
if [[ -z "$date_list" ]]; then
  echo "No comics listed in $alt_json - nothing to do"
  exit 0
fi
readarray -t dates <<< "$date_list"

for date in "${dates[@]}"; do
  printf 'Processing comic %s ... ' "$date"
  image_in="$IMAGE_DIR/$date.gif"
  image_out="$OUT_DIR/$date.png"

  if [[ -f "$image_out" ]]; then
    echo "already exists, skipping"
    continue
  fi

  # Comics without alt text are stored as JSON null, which `jq -r` would print
  # as the literal string "null"; `// empty` turns that into no output at all.
  # The date is passed with --arg rather than spliced into the jq program.
  alt_text="$(jq -r --arg date "$date" '.[$date] // empty' "$alt_json")"

  convert_args=("$image_in" -background white -alpha background)
  if [[ -n "$alt_text" ]]; then
    # NOTE(review): ImageMagick gives special meaning to some characters in
    # caption: text (e.g. a leading '@' or '%' escapes) - confirm whether alt
    # text scraped from the site needs escaping here.
    convert_args+=(
      -gravity center
      -pointsize 12 -size 360x "caption:$alt_text"
      -append
    )
  fi
  convert "${convert_args[@]}" "$image_out"

  echo "OK"
done