#!/usr/bin/env python3
"""Fetches all Achewood comics and metadata.

Downloads the archive listing, every comic's HTML page, and every comic
image, caching each piece under DATA_DIR so repeated runs only fetch what
is missing.  The alt text scraped from each comic page is written out to
a single JSON file at the end.
"""

from pathlib import Path
import urllib.request as request
import sys
import json
from html.parser import HTMLParser
import re
from dataclasses import dataclass
from typing import Any, Dict, Optional
import time

################################################################################
# Config options - probably don't change these
################################################################################

# The number of seconds to wait between consecutive downloads
RATE_LIMIT = 0.5

# The directory to save cached data in
DATA_DIR = Path("data")

################################################################################
# Constants - don't change these
################################################################################

ACHEWOOD_URL = "http://achewood.com"
ARCHIVE_URL = f"{ACHEWOOD_URL}/list.php"
IMAGE_DIR = DATA_DIR / "images"
COMIC_DIR = DATA_DIR / "comics"
ALT_TEXT_JSON = DATA_DIR / "alt_text.json"
ARCHIVE_CACHE = DATA_DIR / "archive.html"

# Archive links look like "index.php?date=mmddyyyy" (US date order).
# BUG FIX: the three named groups had lost their names ("(?P\d\d)" is a
# regex syntax error, so re.compile raised at import).  ArchiveParser
# reads link["month"], link["day"], link["year"], so the groups must be
# named, in month-day-year order to match the URL format.
LINK_RE = re.compile(
    r"^index\.php\?date=(?P<month>\d\d)(?P<day>\d\d)(?P<year>\d\d\d\d)$"
)

# Coerce in case the config value above is ever edited to an int
RATE_LIMIT = float(RATE_LIMIT)

################################################################################
# Classes
################################################################################


@dataclass
class Date:
    "Simple date class"

    # All three components are zero-padded strings, e.g. "2003", "05", "04"
    year: str
    month: str
    day: str

    @property
    def iso(self):
        "Date as yyyymmdd — sorts chronologically, used for cache filenames"
        return f"{self.year}{self.month}{self.day}"

    @property
    def us(self):
        "Date as mmddyyyy — the format achewood.com uses in its URLs"
        return f"{self.month}{self.day}{self.year}"


@dataclass
class Comic:
    "Comic metadata"

    date: Date

    @property
    def alt_text(self) -> str:
        "Title-text of the comic image, scraped from the comic's HTML page"
        comic_html = self.get_comic_html()
        parser = ComicAltParser()
        parser.feed(comic_html)
        return parser.alt_text

    @property
    def url(self) -> str:
        "Public page URL for this comic"
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def image_url(self) -> str:
        "Direct URL of the comic strip image"
        return f"{ACHEWOOD_URL}/comic.php?date={self.date.us}"

    @property
    def image_path(self) -> Path:
        "Local cache path for the comic image"
        return IMAGE_DIR / (self.date.iso + ".gif")

    @property
    def comic_url(self) -> str:
        "Same as .url; kept as a separate property for symmetry with comic_path"
        return f"{ACHEWOOD_URL}/index.php?date={self.date.us}"

    @property
    def comic_path(self) -> Path:
        "Local cache path for the comic's HTML page"
        return COMIC_DIR / (self.date.iso + ".html")

    def get_comic_html(self) -> str:
        "Gets the HTML for this comic and caches it"
        if self.comic_path.is_file():
            with open(self.comic_path, encoding="utf-8") as fp:
                return fp.read()
        else:
            print(f"Downloading HTML for comic date {self.date.iso}")
            comic_html = download_html(self.comic_url)
            with open(self.comic_path, "w", encoding="utf-8") as fp:
                fp.write(comic_html)
            return comic_html

    def get_comic_image(self) -> bytes:
        "Gets the image bytes for this comic and caches it"
        if self.image_path.is_file():
            with open(self.image_path, "rb") as fp:
                return fp.read()
        else:
            print(f"Downloading image for comic date {self.date.iso}")
            image = download(self.image_url)
            with open(self.image_path, "wb") as fp:
                fp.write(image)
            return image


class ArchiveParser(HTMLParser):
    "Achewood archive HTML parser"

    def __init__(self):
        # Every comic Date found in the archive page, in document order
        self.dates = []
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag != "a" or "href" not in attrs:
            return
        # Only <a> tags whose href matches the comic-link pattern count
        if link := LINK_RE.fullmatch(attrs["href"]):
            self.dates += [
                Date(year=link["year"], month=link["month"], day=link["day"])
            ]


class ComicAltParser(HTMLParser):
    "Extracts the title (alt-text) of the comic <img> from a comic page"

    def __init__(self):
        # The last <img title="..."> value seen, or None if none yet
        self.alt_text = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag != "img" or "title" not in attrs:
            return
        # A comic page should only have one titled image; warn if not
        if self.alt_text is not None:
            print("Warning: replacing already-existing alt text")
            print("Previous:")
            print("\t", self.alt_text)
            print("New:")
            print("\t", attrs["title"])
        self.alt_text = attrs["title"]


################################################################################
# Utility functions
################################################################################

# global variable keeping track of the time that the last download was completed
last_download = 0.0


def download(url: str) -> bytes:
    """Downloads url and returns the raw bytes, rate-limited by RATE_LIMIT.

    Raises if the server returns a non-200 status.
    """
    global last_download
    # Sleep if needed to rate-limit all downloads
    next_download = last_download + RATE_LIMIT
    now = time.time()
    delta = next_download - now
    if delta > 0.0:
        time.sleep(delta)
    with request.urlopen(url) as f:
        if f.status != 200:
            raise Exception(f"URL {url} returned non-200 status {f.status}")
        # update the download global
        last_download = time.time()
        return f.read()


def download_html(url: str) -> str:
    "Downloads url and decodes the body as UTF-8 text"
    return download(url).decode("utf-8")


def get_archive_html() -> str:
    "Gets the archive page HTML, using/refreshing the on-disk cache"
    if ARCHIVE_CACHE.is_file():
        print("Archive is cached, loading that")
        with open(ARCHIVE_CACHE, "r", encoding="utf-8") as fp:
            archive_html = fp.read()
    else:
        print("Downloading archive")
        archive_html = download_html(ARCHIVE_URL)
        with open(ARCHIVE_CACHE, "w", encoding="utf-8") as fp:
            fp.write(archive_html)
    return archive_html


################################################################################
# main
################################################################################


def main():
    "Fetches everything: archive listing, all comic pages, all images"
    # Make data, comic, and image directory
    try:
        DATA_DIR.mkdir(exist_ok=True)
        IMAGE_DIR.mkdir(exist_ok=True)
        COMIC_DIR.mkdir(exist_ok=True)
    except PermissionError:
        print("ERROR: could not create data, image, or comic cache directory")
        sys.exit(1)

    # Fetch all comic links if needed
    archive_html = get_archive_html()
    parser = ArchiveParser()
    parser.feed(archive_html)

    # All dates of comics
    dates = parser.dates

    alt_text = {}
    for date in dates:
        comic = Comic(date=date)
        alt_text[date.iso] = comic.alt_text
        # Also download the comic image
        comic.get_comic_image()

    with open(ALT_TEXT_JSON, "w") as fp:
        json.dump(alt_text, fp, indent=4)

    print("Done")


# Guarding the entry point makes the module importable (e.g. for testing)
# without kicking off directory creation and a full site crawl.
if __name__ == "__main__":
    main()