From 57e1d211a30c31ee8b53ffe43d4d63a90bd8e46e Mon Sep 17 00:00:00 2001 From: Alek Ratzloff Date: Wed, 1 Jun 2022 21:43:45 -0700 Subject: [PATCH] Update linkbot to use a fancier parser, and add HTML decoding * Linkbot parser also looks for <meta> tags and uses an actual HTML parser. * Inner title HTML is decoded before being displayed. Signed-off-by: Alek Ratzloff --- plugins/linkbot.py | 48 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/plugins/linkbot.py b/plugins/linkbot.py index ba2f814..4325d02 100644 --- a/plugins/linkbot.py +++ b/plugins/linkbot.py @@ -1,4 +1,6 @@ import asyncio +import html +from html.parser import HTMLParser import ipaddress import logging import re @@ -14,7 +16,41 @@ from omnibot.plugin import Plugin log = logging.getLogger(__name__) LINK_RE = re.compile(r"https?://[^ ]+") -TITLE_RE = re.compile(r"<title>(?P<title>.+?)</title>") + + +class TitleParser(HTMLParser): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.parsing_title = False + self.parsed_title: str | None = None + + def reset(self) -> None: + super().reset() + self.parsed_title = None + + def handle_starttag( + self, tag: str, attrs_list: list[tuple[str, str | None]] + ) -> None: + if self.parsed_title: + return + tag = tag.lower() + if tag == "title": + self.parsing_title = True + elif tag == "meta": + attrs = dict(attrs_list) + if attrs.get("property", None) == "og:title": + self.parsed_title = attrs.get("content", None) + elif attrs.get("name", None) == "title": + self.parsed_title = attrs.get("content", None) + + def handle_endtag(self, tag: str) -> None: + match tag.lower(): + case "title": + self.parsing_title = False + + def handle_data(self, data: str) -> None: + if self.parsing_title and not self.parsed_title: + self.parsed_title = html.unescape(data) async def dns_lookup(host: str) -> str | None: @@ -97,12 +133,14 @@ class Linkbot(Plugin): log.debug("skipping URL %s because it couldn't be fetched", 
url) continue (status, content_type, text) = result - if title := TITLE_RE.search(text): - message = f"{title['title']}" - elif not (200 <= status <= 299): + + title_parser = TitleParser() + title_parser.feed(text) + message: str | None + if not (200 <= status <= 299): message = f"{who.nick}: (status {status})" else: - message = None + message = title_parser.parsed_title if message: self.send_to(conn, channel, message)