Update linkbot to use a fancier parser, and add HTML decoding

* Linkbot parser also looks for <meta> tags and uses an actual HTML
  parser.
* Inner title HTML is decoded before being displayed.

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2022-06-01 21:43:45 -07:00
parent 741bd85ff7
commit 57e1d211a3

View File

@@ -1,4 +1,6 @@
import asyncio
import html
from html.parser import HTMLParser
import ipaddress
import logging
import re
@@ -14,7 +16,41 @@ from omnibot.plugin import Plugin
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Matches an http:// or https:// URL, running up to (not including) the next
# space character or the end of the string.
LINK_RE = re.compile(r"https?://[^ ]+")
# Captures, as group "title", the shortest text between a literal <title> and
# </title>. No flags are set, so "." does not match newlines and the tags are
# matched case-sensitively and without attributes.
TITLE_RE = re.compile(r"<title>(?P<title>.+?)</title>")
class TitleParser(HTMLParser):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.parsing_title = False
self.parsed_title: str | None = None
def reset(self) -> None:
super().reset()
self.parsed_title = None
def handle_starttag(
self, tag: str, attrs_list: list[tuple[str, str | None]]
) -> None:
if self.parsed_title:
return
tag = tag.lower()
if tag == "title":
self.parsing_title = True
elif tag == "meta":
attrs = dict(attrs_list)
if attrs.get("property", None) == "og:title":
self.parsed_title = attrs.get("content", None)
elif attrs.get("name", None) == "title":
self.parsed_title = attrs.get("content", None)
def handle_endtag(self, tag: str) -> None:
match tag.lower():
case "title":
self.parsing_title = False
def handle_data(self, data: str) -> None:
if self.parsing_title and not self.parsed_title:
self.parsed_title = html.unescape(data)
async def dns_lookup(host: str) -> str | None:
@@ -97,12 +133,14 @@ class Linkbot(Plugin):
log.debug("skipping URL %s because it couldn't be fetched", url)
continue
(status, content_type, text) = result
if title := TITLE_RE.search(text):
message = f"{title['title']}"
elif not (200 <= status <= 299):
title_parser = TitleParser()
title_parser.feed(text)
message: str | None
if not (200 <= status <= 299):
message = f"{who.nick}: (status {status})"
else:
message = None
message = title_parser.parsed_title
if message:
self.send_to(conn, channel, message)