Add gitignore, fetch script, and postprocess script
Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
188
.gitignore
vendored
Normal file
188
.gitignore
vendored
Normal file
@@ -0,0 +1,188 @@
|
||||
# Data and out directories
|
||||
data/
|
||||
out/
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
### Vim ###
|
||||
# Swap
|
||||
[._]*.s[a-v][a-z]
|
||||
# comment out the following line if you don't need vector files
!*.svg
|
||||
[._]*.sw[a-p]
|
||||
[._]s[a-rt-v][a-z]
|
||||
[._]ss[a-gi-z]
|
||||
[._]sw[a-p]
|
||||
|
||||
# Session
|
||||
Session.vim
|
||||
Sessionx.vim
|
||||
|
||||
# Temporary
|
||||
.netrwhist
|
||||
*~
|
||||
# Auto-generated tag files
|
||||
tags
|
||||
# Persistent undo
|
||||
[._]*.un~
|
||||
|
||||
# End of https://www.toptal.com/developers/gitignore/api/vim,python
|
||||
225
fetch-achewood.py
Executable file
225
fetch-achewood.py
Executable file
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python3
|
||||
# Fetches all Achewood comics and metadata.
|
||||
from pathlib import Path
|
||||
import urllib.request as request
|
||||
import sys
|
||||
import json
|
||||
from html.parser import HTMLParser
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Optional
|
||||
import time
|
||||
|
||||
|
||||
################################################################################
# Config options - probably don't change these
################################################################################

# The number of seconds to wait between consecutive downloads
RATE_LIMIT = 0.5
# The directory to save cached data in
DATA_DIR = Path("data")

################################################################################
# Constants - don't change these
################################################################################

# Base URL of the Achewood site and its archive listing page.
ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"

# Cache locations under DATA_DIR.
IMAGE_DIR = DATA_DIR / "images"             # downloaded comic GIFs
COMIC_DIR = DATA_DIR / "comics"             # downloaded per-comic HTML pages
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"  # ISO date -> alt text mapping
ARCHIVE_CACHE = DATA_DIR / "archive.html"   # cached archive listing page

# Matches archive links of the form "index.php?date=MMDDYYYY".
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)
# NOTE: the original re-assigned RATE_LIMIT = float(RATE_LIMIT) here; that was
# a redundant duplicate definition (the constant is already a float literal)
# and has been removed.
|
||||
|
||||
################################################################################
|
||||
# Classes
|
||||
################################################################################
|
||||
@dataclass
class Date:
    """A comic publication date, stored as zero-padded digit strings."""

    year: str   # four digits, e.g. "2001"
    month: str  # two digits, e.g. "10"
    day: str    # two digits, e.g. "05"

    @property
    def iso(self):
        """Date as YYYYMMDD - used for cache filenames and JSON keys."""
        return self.year + self.month + self.day

    @property
    def us(self):
        """Date as MMDDYYYY - the format achewood.com URLs expect."""
        return self.month + self.day + self.year
|
||||
|
||||
|
||||
@dataclass
class Comic:
    """Metadata and cached artifacts for a single comic, identified by date."""

    # Publication date of this comic.
    date: Date

    @property
    def alt_text(self) -> str:
        """Alt text scraped from the comic's page (None if no titled <img> exists)."""
        parser = ComicAltParser()
        parser.feed(self.get_comic_html())
        return parser.alt_text

    @property
    def url(self) -> str:
        """URL of this comic's page (same page as comic_url)."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        """URL of this comic's GIF image."""
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        """Local cache path for this comic's GIF image."""
        return IMAGE_DIR / f"{self.date.iso}.gif"

    @property
    def comic_url(self) -> str:
        """URL of this comic's HTML page."""
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def comic_path(self) -> Path:
        """Local cache path for this comic's HTML page."""
        return COMIC_DIR / f"{self.date.iso}.html"

    def get_comic_html(self) -> str:
        """Return this comic's page HTML, downloading and caching it on first use."""
        if self.comic_path.is_file():
            return self.comic_path.read_text()
        print(f"Downloading HTML for comic date {self.date.iso}")
        comic_html = download_html(self.comic_url)
        self.comic_path.write_text(comic_html)
        return comic_html

    def get_comic_image(self) -> bytes:
        """Return this comic's image bytes, downloading and caching them on first use."""
        if self.image_path.is_file():
            return self.image_path.read_bytes()
        print(f"Downloading image for comic date {self.date.iso}")
        image = download(self.image_url)
        self.image_path.write_bytes(image)
        return image
|
||||
|
||||
|
||||
class ArchiveParser(HTMLParser):
    """Collects the date of every comic linked from the archive page.

    After feed(), self.dates holds one Date per <a> whose href matches
    LINK_RE ("index.php?date=MMDDYYYY"), in document order.
    """

    def __init__(self):
        super().__init__()
        # Date objects for every matching archive link seen so far.
        self.dates = []

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if tag == "a" and "href" in attr_map:
            match = LINK_RE.fullmatch(attr_map["href"])
            if match is not None:
                self.dates.append(
                    Date(year=match["year"], month=match["month"], day=match["day"])
                )
|
||||
|
||||
|
||||
class ComicAltParser(HTMLParser):
    """Extracts a comic's alt text from the "title" attribute of its <img>.

    After feed(), self.alt_text holds the title of the last titled <img>
    seen, or None if the document contained no titled image.
    """

    def __init__(self):
        super().__init__()
        # Title of the most recent <img title=...>; None until one is seen.
        self.alt_text = None

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if tag != "img" or "title" not in attr_map:
            return
        title = attr_map["title"]
        if self.alt_text is not None:
            # A comic page is expected to have one titled image; warn when
            # a second one overwrites the first.
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", title)
        self.alt_text = title
|
||||
|
||||
|
||||
################################################################################
|
||||
# Utility functions
|
||||
################################################################################
|
||||
|
||||
|
||||
# Timestamp (time.time()) of the most recent download; 0.0 until the first
# download happens. Used to enforce RATE_LIMIT between consecutive requests.
last_download = 0.0


def download(url: str) -> bytes:
    """Fetch url and return the raw response body.

    Sleeps as needed so consecutive calls are at least RATE_LIMIT seconds
    apart, and raises if the server answers with a non-200 status.
    """
    global last_download

    # Wait out the remainder of the rate-limit window, if any.
    wait = (last_download + RATE_LIMIT) - time.time()
    if wait > 0.0:
        time.sleep(wait)

    with request.urlopen(url) as f:
        if f.status != 200:
            raise Exception(f"URL {url} returned non-200 status {f.status}")
        # Stamp the download once the request has succeeded, before the body
        # is read.
        last_download = time.time()
        return f.read()
|
||||
|
||||
|
||||
def download_html(url: str) -> str:
    """Fetch url (rate-limited via download()) and decode the body as UTF-8."""
    raw = download(url)
    return raw.decode("utf-8")
|
||||
|
||||
|
||||
def get_archive_html() -> str:
    """Return the archive listing HTML, preferring the on-disk cache.

    Downloads the archive page and writes it to ARCHIVE_CACHE on a miss.
    """
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        return ARCHIVE_CACHE.read_text()
    print("Downloading archive")
    archive_html = download_html(ARCHIVE_URL)
    ARCHIVE_CACHE.write_text(archive_html)
    return archive_html
|
||||
|
||||
|
||||
################################################################################
# main
################################################################################

# Ensure the cache directory tree (data/, data/images/, data/comics/) exists.
try:
    for directory in (DATA_DIR, IMAGE_DIR, COMIC_DIR):
        directory.mkdir(exist_ok=True)
except PermissionError:
    print("ERROR: could not create data, image, or comic cache directory")
    sys.exit(1)

# Parse the archive listing (downloaded or cached) to get every comic's date.
parser = ArchiveParser()
parser.feed(get_archive_html())
dates = parser.dates

# Fetch every comic's page (for its alt text) and its image, caching both.
# Alt text is keyed by ISO date (YYYYMMDD) for the JSON dump.
alt_text = {}
for date in dates:
    comic = Comic(date=date)
    alt_text[date.iso] = comic.alt_text
    # Also download the comic image
    comic.get_comic_image()

with open(ALT_TEXT_JSON, 'w') as fp:
    json.dump(alt_text, fp, indent=4)

print("Done")
|
||||
47
postprocess.sh
Executable file
47
postprocess.sh
Executable file
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# Postprocesses all fetched comics from the fetch-comics.py file
#
# Converts each cached GIF in data/images/ to a PNG in out/, appending the
# comic's alt text (from data/alt_text.json) underneath when one exists.
# Requires jq and ImageMagick's convert; expects the fetch script to have run.
set -euo pipefail

HERE="$(dirname "$0")"
DATA_DIR="$HERE/data"
IMAGE_DIR="$DATA_DIR/images"
OUT_DIR="$HERE/out"
alt_json="$DATA_DIR/alt_text.json"

if [[ ! -f "$alt_json" ]]; then
    echo "Could not find $alt_json - have you run the fetch script?"
    exit 1
fi

mkdir -vp "$OUT_DIR"

# Get the list of all comic dates (the JSON object's keys)
readarray -t dates < <(jq -r 'keys[]' < "$alt_json")

for date in "${dates[@]}"; do
    echo -n "Processing comic $date ... "
    image_in="$IMAGE_DIR/$date.gif"
    image_out="$OUT_DIR/$date.png"
    if [[ -f "$image_out" ]]; then
        echo "already exists, skipping"
        continue
    fi
    # Pass the date with --arg instead of interpolating it into the jq
    # program text.
    alt_text="$(jq -r --arg d "$date" '.[$d]' < "$alt_json")"

    # The fetcher stores null when a comic page has no titled image, and
    # `jq -r` prints JSON null as the literal string "null" - treat that
    # (and the empty string) as "no alt text" so images are not captioned
    # with the word "null".
    if [[ -z "$alt_text" || "$alt_text" == "null" ]]; then
        convert "$image_in" \
            -background white -alpha background \
            "$image_out"
    else
        convert "$image_in" \
            -background white -alpha background \
            -gravity center \
            -pointsize 12 -size 360x caption:"$alt_text" \
            -append \
            "$image_out"
    fi

    echo "OK"
done
|
||||
Reference in New Issue
Block a user