diff --git a/plugins/linkbot.py b/plugins/linkbot.py
index ba2f814..4325d02 100644
--- a/plugins/linkbot.py
+++ b/plugins/linkbot.py
@@ -1,4 +1,6 @@
import asyncio
+import html
+from html.parser import HTMLParser
import ipaddress
import logging
import re
@@ -14,7 +16,41 @@ from omnibot.plugin import Plugin
log = logging.getLogger(__name__)
LINK_RE = re.compile(r"https?://[^ ]+")
-TITLE_RE = re.compile(r"<title>(?P<title>.+?)</title>")
+
+
+class TitleParser(HTMLParser):
+    """Extract a human-readable title from an HTML document.
+
+    Feed markup with ``feed()`` and read ``parsed_title`` afterwards. The
+    first non-empty candidate in document order wins: the text of a
+    ``<title>`` element, or the ``content`` attribute of a
+    ``<meta property="og:title">`` / ``<meta name="title">`` tag.
+    ``parsed_title`` stays ``None`` when nothing usable is found.
+
+    Runs of whitespace (including newlines) are collapsed to single spaces,
+    so the result is always a single line of text.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Create the parser; arguments are forwarded to ``HTMLParser``."""
+        # HTMLParser.__init__ calls reset(), which already initialises the
+        # title state; the assignments below just make the attributes explicit.
+        super().__init__(*args, **kwargs)
+        self.parsing_title = False
+        self.parsed_title: str | None = None
+        self._title_chunks: list[str] = []
+
+    def reset(self) -> None:
+        """Discard parser state and any title found so far."""
+        super().reset()
+        self.parsing_title = False
+        self.parsed_title = None
+        self._title_chunks = []
+
+    @staticmethod
+    def _normalize(text: str | None) -> str | None:
+        """Collapse whitespace in *text*; return ``None`` if nothing is left."""
+        collapsed = " ".join(text.split()) if text else ""
+        return collapsed or None
+
+    def _append_title_text(self, text: str) -> None:
+        """Add already-decoded *text* to the title while inside ``<title>``."""
+        if not self.parsing_title:
+            return
+        self._title_chunks.append(text)
+        # Updated on every chunk so a title is available even if the
+        # document is cut off before ``</title>``.
+        self.parsed_title = self._normalize("".join(self._title_chunks))
+
+    def handle_starttag(
+        self, tag: str, attrs_list: list[tuple[str, str | None]]
+    ) -> None:
+        """Start collecting on ``<title>``; read title-bearing ``<meta>`` tags."""
+        if self.parsed_title:
+            # First title found wins; ignore later candidates.
+            return
+        tag = tag.lower()
+        if tag == "title":
+            self.parsing_title = True
+            self._title_chunks = []
+        elif tag == "meta":
+            attrs = dict(attrs_list)
+            if (
+                attrs.get("property") == "og:title"
+                or attrs.get("name") == "title"
+            ):
+                # Attribute values are already unescaped by HTMLParser.
+                self.parsed_title = self._normalize(attrs.get("content"))
+
+    def handle_endtag(self, tag: str) -> None:
+        """Stop collecting title text at ``</title>``."""
+        if tag.lower() == "title":
+            self.parsing_title = False
+
+    def handle_data(self, data: str) -> None:
+        """Collect text that appears inside ``<title>``."""
+        # Do not html.unescape() here: with convert_charrefs=True (the
+        # default) the parser has already decoded character references, and
+        # decoding twice would turn "&amp;lt;" into "<". With
+        # convert_charrefs=False references never reach handle_data at all.
+        self._append_title_text(data)
+
+    def handle_entityref(self, name: str) -> None:
+        """Decode ``&name;`` inside the title (``convert_charrefs=False``)."""
+        self._append_title_text(html.unescape(f"&{name};"))
+
+    def handle_charref(self, name: str) -> None:
+        """Decode ``&#name;`` inside the title (``convert_charrefs=False``)."""
+        self._append_title_text(html.unescape(f"&#{name};"))
async def dns_lookup(host: str) -> str | None:
@@ -97,12 +133,14 @@ class Linkbot(Plugin):
log.debug("skipping URL %s because it couldn't be fetched", url)
continue
(status, content_type, text) = result
- if title := TITLE_RE.search(text):
- message = f"{title['title']}"
- elif not (200 <= status <= 299):
+
+ title_parser = TitleParser()
+ title_parser.feed(text)
+ message: str | None
+ if not (200 <= status <= 299):
message = f"{who.nick}: (status {status})"
else:
- message = None
+ message = title_parser.parsed_title
if message:
self.send_to(conn, channel, message)