Make linkbot title parsing more robust

Linkbot now looks for a title in the page's "meta" tags first, matching
either property "og:title" or name "title". These are usually more accurate
than the "title" tag, which is now checked only as a last resort.

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2020-07-02 18:09:09 -07:00
parent 67192f2c5e
commit e3f30d30c1
3 changed files with 24 additions and 5 deletions

View File

@@ -13,6 +13,7 @@ defmodule Omnibot.Contrib.Linkbot do
defmodule Client do
use Tesla
alias Omnibot.Contrib.Linkbot
import Meeseeks.CSS
plug Tesla.Middleware.Headers, [{"user-agent", "Tesla/Omnibot"}]
plug Tesla.Middleware.FollowRedirects, max_redirects: 10
@@ -26,11 +27,23 @@ defmodule Omnibot.Contrib.Linkbot do
@title_regex ~r"<title>(?<title>.+)</title>"i
# Returns the best available title for the page at `url`, or `nil` when the
# page is not fetched, cannot be parsed, or carries no usable title.
#
# Candidates are examined in order: every <meta> tag (property "og:title" or
# name "title", using its "content" attribute), then the <title> element as a
# last resort.
def get_title(url) do
  case get_url(url) do
    # get_url/1 yields nil when the URL should not be fetched.
    nil -> nil
    html -> extract_title(html)
  end
end

# Parses `html` and returns the first truthy title candidate, or `nil`.
defp extract_title(html) do
  case Meeseeks.parse(html) do
    {:error, _reason} ->
      nil

    document ->
      # Appending the <title> element after the <meta> tags makes it the fallback.
      candidates = Meeseeks.all(document, css("meta")) ++ [Meeseeks.one(document, css("title"))]

      # find_value/2 stops at the first truthy title and returns nil when
      # nothing matches, instead of raising on an empty candidate list.
      Enum.find_value(candidates, &title_from/1)
  end
end

# Extracts a title from a single element; `nil` when the element is absent
# (no <title> tag in the document) or is not a title carrier.
defp title_from(nil), do: nil

defp title_from(element) do
  cond do
    Meeseeks.attr(element, "property") == "og:title" -> Meeseeks.attr(element, "content")
    Meeseeks.attr(element, "name") == "title" -> Meeseeks.attr(element, "content")
    Meeseeks.tag(element) == "title" -> Meeseeks.text(element)
    true -> nil
  end
end
# Fetches `url` and returns the raw response body, or `nil` when
# `should_get?/1` rejects the URL (an `if` without `else` evaluates to `nil`).
defp get_url(url) do
  if should_get?(url) do
    Logger.info("Fetching #{url}")
    # One request only. The previous body fetched the page twice and ran an
    # assertive <title> regex match whose result was discarded; that match
    # raised MatchError on any page without a <title> tag.
    get!(url).body
  end
end
@@ -53,6 +66,7 @@ defmodule Omnibot.Contrib.Linkbot do
Regex.scan(@url_regex, line)
|> Enum.flat_map(& &1)
|> Enum.map(fn url -> Client.get_title(url) end)
|> Enum.filter(& &1)
|> Enum.each(fn title -> Irc.send_to(irc, channel, title) end)
end