Update linkbot to use a fancier parser, and add HTML decoding
* Linkbot parser also looks for <meta> tags and uses an actual HTML parser.
* Inner title HTML is decoded before being displayed.

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
@@ -1,4 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
import html
|
||||||
|
from html.parser import HTMLParser
|
||||||
import ipaddress
|
import ipaddress
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
@@ -14,7 +16,41 @@ from omnibot.plugin import Plugin
|
|||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
LINK_RE = re.compile(r"https?://[^ ]+")
|
LINK_RE = re.compile(r"https?://[^ ]+")
|
||||||
TITLE_RE = re.compile(r"<title>(?P<title>.+?)</title>")
|
|
||||||
|
|
||||||
|
class TitleParser(HTMLParser):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.parsing_title = False
|
||||||
|
self.parsed_title: str | None = None
|
||||||
|
|
||||||
|
def reset(self) -> None:
|
||||||
|
super().reset()
|
||||||
|
self.parsed_title = None
|
||||||
|
|
||||||
|
def handle_starttag(
|
||||||
|
self, tag: str, attrs_list: list[tuple[str, str | None]]
|
||||||
|
) -> None:
|
||||||
|
if self.parsed_title:
|
||||||
|
return
|
||||||
|
tag = tag.lower()
|
||||||
|
if tag == "title":
|
||||||
|
self.parsing_title = True
|
||||||
|
elif tag == "meta":
|
||||||
|
attrs = dict(attrs_list)
|
||||||
|
if attrs.get("property", None) == "og:title":
|
||||||
|
self.parsed_title = attrs.get("content", None)
|
||||||
|
elif attrs.get("name", None) == "title":
|
||||||
|
self.parsed_title = attrs.get("content", None)
|
||||||
|
|
||||||
|
def handle_endtag(self, tag: str) -> None:
|
||||||
|
match tag.lower():
|
||||||
|
case "title":
|
||||||
|
self.parsing_title = False
|
||||||
|
|
||||||
|
def handle_data(self, data: str) -> None:
|
||||||
|
if self.parsing_title and not self.parsed_title:
|
||||||
|
self.parsed_title = html.unescape(data)
|
||||||
|
|
||||||
|
|
||||||
async def dns_lookup(host: str) -> str | None:
|
async def dns_lookup(host: str) -> str | None:
|
||||||
@@ -97,12 +133,14 @@ class Linkbot(Plugin):
|
|||||||
log.debug("skipping URL %s because it couldn't be fetched", url)
|
log.debug("skipping URL %s because it couldn't be fetched", url)
|
||||||
continue
|
continue
|
||||||
(status, content_type, text) = result
|
(status, content_type, text) = result
|
||||||
if title := TITLE_RE.search(text):
|
|
||||||
message = f"{title['title']}"
|
title_parser = TitleParser()
|
||||||
elif not (200 <= status <= 299):
|
title_parser.feed(text)
|
||||||
|
message: str | None
|
||||||
|
if not (200 <= status <= 299):
|
||||||
message = f"{who.nick}: (status {status})"
|
message = f"{who.nick}: (status {status})"
|
||||||
else:
|
else:
|
||||||
message = None
|
message = title_parser.parsed_title
|
||||||
|
|
||||||
if message:
|
if message:
|
||||||
self.send_to(conn, channel, message)
|
self.send_to(conn, channel, message)
|
||||||
|
|||||||
Reference in New Issue
Block a user