Update linkbot to use a fancier parser, and add HTML decoding
* Linkbot parser also looks for <meta> tags and uses an actual HTML parser. * Inner title HTML is decoded before being displayed. Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
@@ -1,4 +1,6 @@
|
||||
import asyncio
|
||||
import html
|
||||
from html.parser import HTMLParser
|
||||
import ipaddress
|
||||
import logging
|
||||
import re
|
||||
@@ -14,7 +16,41 @@ from omnibot.plugin import Plugin
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
# Matches an http:// or https:// URL: the scheme followed by every character
# up to (but not including) the next space character.
LINK_RE = re.compile(r"https?://[^ ]+")
|
||||
# Captures (as group "title") the shortest non-empty text between a literal
# "<title>" and "</title>". No flags are set, so the match is case-sensitive
# and "." does not cross newlines.
TITLE_RE = re.compile(r"<title>(?P<title>.+?)</title>")
|
||||
|
||||
|
||||
class TitleParser(HTMLParser):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.parsing_title = False
|
||||
self.parsed_title: str | None = None
|
||||
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self.parsed_title = None
|
||||
|
||||
def handle_starttag(
|
||||
self, tag: str, attrs_list: list[tuple[str, str | None]]
|
||||
) -> None:
|
||||
if self.parsed_title:
|
||||
return
|
||||
tag = tag.lower()
|
||||
if tag == "title":
|
||||
self.parsing_title = True
|
||||
elif tag == "meta":
|
||||
attrs = dict(attrs_list)
|
||||
if attrs.get("property", None) == "og:title":
|
||||
self.parsed_title = attrs.get("content", None)
|
||||
elif attrs.get("name", None) == "title":
|
||||
self.parsed_title = attrs.get("content", None)
|
||||
|
||||
def handle_endtag(self, tag: str) -> None:
|
||||
match tag.lower():
|
||||
case "title":
|
||||
self.parsing_title = False
|
||||
|
||||
def handle_data(self, data: str) -> None:
|
||||
if self.parsing_title and not self.parsed_title:
|
||||
self.parsed_title = html.unescape(data)
|
||||
|
||||
|
||||
async def dns_lookup(host: str) -> str | None:
|
||||
@@ -97,12 +133,14 @@ class Linkbot(Plugin):
|
||||
log.debug("skipping URL %s because it couldn't be fetched", url)
|
||||
continue
|
||||
(status, content_type, text) = result
|
||||
if title := TITLE_RE.search(text):
|
||||
message = f"{title['title']}"
|
||||
elif not (200 <= status <= 299):
|
||||
|
||||
title_parser = TitleParser()
|
||||
title_parser.feed(text)
|
||||
message: str | None
|
||||
if not (200 <= status <= 299):
|
||||
message = f"{who.nick}: (status {status})"
|
||||
else:
|
||||
message = None
|
||||
message = title_parser.parsed_title
|
||||
|
||||
if message:
|
||||
self.send_to(conn, channel, message)
|
||||
|
||||
Reference in New Issue
Block a user