Add gitignore, fetch script, and postprocess script
Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
188
.gitignore
vendored
Normal file
188
.gitignore
vendored
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
# Data and out directories
|
||||||
|
data/
|
||||||
|
out/
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/#use-with-ide
|
||||||
|
.pdm.toml
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
### Vim ###
|
||||||
|
# Swap
|
||||||
|
[._]*.s[a-v][a-z]
|
||||||
|
# comment out the following line if you don't need vector files
# (note: a trailing "#" is NOT a comment in gitignore syntax)
!*.svg
|
||||||
|
[._]*.sw[a-p]
|
||||||
|
[._]s[a-rt-v][a-z]
|
||||||
|
[._]ss[a-gi-z]
|
||||||
|
[._]sw[a-p]
|
||||||
|
|
||||||
|
# Session
|
||||||
|
Session.vim
|
||||||
|
Sessionx.vim
|
||||||
|
|
||||||
|
# Temporary
|
||||||
|
.netrwhist
|
||||||
|
*~
|
||||||
|
# Auto-generated tag files
|
||||||
|
tags
|
||||||
|
# Persistent undo
|
||||||
|
[._]*.un~
|
||||||
|
|
||||||
|
# End of https://www.toptal.com/developers/gitignore/api/vim,python
|
||||||
225
fetch-achewood.py
Executable file
225
fetch-achewood.py
Executable file
@@ -0,0 +1,225 @@
|
|||||||
|
#!/usr/bin/env python3
# Fetches all Achewood comics and metadata.

from pathlib import Path
import urllib.request as request
import sys
import json
from html.parser import HTMLParser
import re
from dataclasses import dataclass
from typing import Any, Dict, Optional
import time


################################################################################
# Config options - probably don't change these
################################################################################

# The number of seconds to wait between consecutive downloads.
RATE_LIMIT = 0.5
# The directory to save cached data in.
DATA_DIR = Path("data")

################################################################################
# Constants - don't change these
################################################################################

ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"

IMAGE_DIR = DATA_DIR / "images"
COMIC_DIR = DATA_DIR / "comics"
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"
ARCHIVE_CACHE = DATA_DIR / "archive.html"
# Archive links look like "index.php?date=MMDDYYYY".
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)
# Coerce in case the config value above is edited to an int or a string.
RATE_LIMIT = float(RATE_LIMIT)
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# Classes
|
||||||
|
################################################################################
|
||||||
|
@dataclass
class Date:
    """A comic publication date stored as zero-padded strings."""

    year: str
    month: str
    day: str

    @property
    def iso(self) -> str:
        """The date as YYYYMMDD, used for local cache file names."""
        return f"{self.year}{self.month}{self.day}"

    @property
    def us(self) -> str:
        """The date as MMDDYYYY, used in achewood.com URLs."""
        return f"{self.month}{self.day}{self.year}"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Comic:
    """Metadata and cached resources for a single comic."""

    # The date the comic was published.
    date: Date

    @property
    def alt_text(self) -> str:
        """The title-text of the comic image, parsed from the comic page HTML."""
        comic_html = self.get_comic_html()
        parser = ComicAltParser()
        parser.feed(comic_html)
        return parser.alt_text

    @property
    def comic_url(self) -> str:
        """URL of the comic's HTML page."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    # Backward-compatible alias: the original code defined `url` and
    # `comic_url` as two identical properties; keep both names but share one
    # implementation.
    url = comic_url

    @property
    def image_url(self) -> str:
        """URL of the comic's GIF image."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache path for the comic image."""
        return IMAGE_DIR / (self.date.iso + ".gif")

    @property
    def comic_path(self) -> Path:
        """Local cache path for the comic HTML."""
        return COMIC_DIR / (self.date.iso + ".html")

    def get_comic_html(self) -> str:
        """Return this comic's page HTML, downloading and caching it on first use."""
        if self.comic_path.is_file():
            return self.comic_path.read_text()
        print(f"Downloading HTML for comic date {self.date.iso}")
        comic_html = download_html(self.comic_url)
        self.comic_path.write_text(comic_html)
        return comic_html

    def get_comic_image(self) -> bytes:
        """Return this comic's image bytes, downloading and caching them on first use."""
        if self.image_path.is_file():
            return self.image_path.read_bytes()
        print(f"Downloading image for comic date {self.date.iso}")
        image = download(self.image_url)
        self.image_path.write_bytes(image)
        return image
|
||||||
|
|
||||||
|
|
||||||
|
class ArchiveParser(HTMLParser):
    """Collects comic dates from the anchor tags of the archive page."""

    def __init__(self):
        # Every Date found in the archive, in document order.
        self.dates = []
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == "a" and "href" in attrs:
            match = LINK_RE.fullmatch(attrs["href"])
            if match is not None:
                self.dates.append(
                    Date(year=match["year"], month=match["month"], day=match["day"])
                )
|
||||||
|
|
||||||
|
|
||||||
|
class ComicAltParser(HTMLParser):
    """Extracts the alt (title) text of the comic <img> from a comic page."""

    def __init__(self):
        # The most recent img title seen, or None if no titled img yet.
        self.alt_text = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag != "img" or "title" not in attrs:
            return
        # Warn when a page unexpectedly contains more than one titled image;
        # the last one wins.
        if self.alt_text is not None:
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", attrs["title"])
        self.alt_text = attrs["title"]
|
||||||
|
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# Utility functions
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
|
||||||
|
# Global keeping track of when the last download completed (epoch seconds);
# used to enforce RATE_LIMIT between consecutive requests.
last_download = 0.0


def download(url: str) -> bytes:
    """Download *url* and return the raw response body.

    Sleeps as needed so consecutive calls are at least RATE_LIMIT seconds
    apart. Raises RuntimeError on a non-200 response.
    """
    global last_download
    # Sleep if needed to rate-limit all downloads
    next_download = last_download + RATE_LIMIT
    delta = next_download - time.time()
    if delta > 0.0:
        time.sleep(delta)

    with request.urlopen(url) as f:
        if f.status != 200:
            # RuntimeError instead of bare Exception so callers can catch it
            # precisely; still compatible with any `except Exception` handler.
            raise RuntimeError(f"URL {url} returned non-200 status {f.status}")
        # update the download global
        last_download = time.time()
        return f.read()
|
||||||
|
|
||||||
|
|
||||||
|
def download_html(url: str) -> str:
    """Download *url* and decode the response body as UTF-8 text."""
    raw = download(url)
    return raw.decode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def get_archive_html() -> str:
    """Return the archive page HTML, preferring the on-disk cache."""
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        with open(ARCHIVE_CACHE, "r") as fp:
            return fp.read()
    # Cache miss: fetch the page and store it for next time.
    print("Downloading archive")
    archive_html = download_html(ARCHIVE_URL)
    with open(ARCHIVE_CACHE, "w") as fp:
        fp.write(archive_html)
    return archive_html
|
||||||
|
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# main
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
# Make data, comic, and image directories.
try:
    # parents=True lets IMAGE_DIR/COMIC_DIR creation succeed even if their
    # parent changes or is missing; exist_ok tolerates re-runs.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    IMAGE_DIR.mkdir(parents=True, exist_ok=True)
    COMIC_DIR.mkdir(parents=True, exist_ok=True)
except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)

# Fetch all comic links if needed
archive_html = get_archive_html()
parser = ArchiveParser()
parser.feed(archive_html)

# All dates of comics
dates = parser.dates

# Maps ISO date -> alt text for every comic.
alt_text = {}

for date in dates:
    comic = Comic(date=date)
    alt_text[date.iso] = comic.alt_text
    # Also download the comic image
    comic.get_comic_image()

with open(ALT_TEXT_JSON, 'w') as fp:
    json.dump(alt_text, fp, indent=4)

print("Done")
|
||||||
47
postprocess.sh
Executable file
47
postprocess.sh
Executable file
@@ -0,0 +1,47 @@
|
|||||||
|
#!/bin/bash
# Postprocesses all fetched comics from the fetch-achewood.py file:
# converts each cached GIF to a PNG with its alt text appended as a caption.
set -euo pipefail

HERE="$(dirname "$0")"
DATA_DIR="$HERE/data"
IMAGE_DIR="$DATA_DIR/images"
OUT_DIR="$HERE/out"
alt_json="$DATA_DIR/alt_text.json"

if [[ ! -f "$alt_json" ]]; then
    echo "Could not find $alt_json - have you run the fetch script?"
    exit 1
fi

mkdir -vp "$OUT_DIR"

# Get the list of all comic dates
readarray -t dates < <(jq -r 'keys[]' < "$alt_json")

for date in "${dates[@]}"; do
    echo -n "Processing comic $date ... "
    image_in="$IMAGE_DIR/$date.gif"
    image_out="$OUT_DIR/$date.png"
    if [[ -f "$image_out" ]]; then
        echo "already exists, skipping"
        continue
    fi
    # Pass the key via --arg so quotes/backslashes in it cannot break the jq
    # program (the original interpolated $date into the filter string).
    alt_text="$(jq -r --arg d "$date" '.[$d]' < "$alt_json")"

    if [[ -z "$alt_text" ]]; then
        # No alt text: just flatten the GIF onto a white background.
        convert "$image_in" \
            -background white -alpha background \
            "$image_out"
    else
        # Append the alt text as a centered caption beneath the comic.
        convert "$image_in" \
            -background white -alpha background \
            -gravity center \
            -pointsize 12 -size 360x caption:"$alt_text" \
            -append \
            "$image_out"
    fi

    echo "OK"
done
|
||||||
Reference in New Issue
Block a user