2020-06-13 20:59:58 -04:00
|
|
|
defmodule Omnibot.Contrib.Linkbot do
|
|
|
|
|
use Omnibot.Module
|
|
|
|
|
require Logger
|
|
|
|
|
|
2020-06-13 21:47:46 -04:00
|
|
|
# Default configuration for this module: a single `timeout` option set to 30_000.
# NOTE(review): nothing in the visible code reads @default_config directly —
# presumably `use Omnibot.Module` consumes it, and the value is presumably in
# milliseconds; confirm both against Omnibot.Module.
@default_config timeout: 30_000
|
2020-06-14 16:41:40 -04:00
|
|
|
# Hosts that must never be fetched (matched case-insensitively): `localhost`,
# anything ending in `.local`, `.localdomain` or `.home`, any host without a dot,
# and dotted-quad numeric addresses.
@hostname_blacklist ~r/(^localhost$|\.local$|\.localdomain$|\.home$|^[^.]+$|^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$)/i

@doc """
Returns `true` when the host of `url` must not be fetched.

A URL is blacklisted when its host matches `@hostname_blacklist`, or when it has
no usable host at all (for example a string that is not an absolute URL).

## Examples

    iex> Omnibot.Contrib.Linkbot.blacklisted?("http://localhost/")
    true

    iex> Omnibot.Contrib.Linkbot.blacklisted?("https://example.com/page")
    false

"""
@spec blacklisted?(String.t()) :: boolean()
def blacklisted?(url) do
  case URI.parse(url).host do
    host when is_binary(host) and host != "" ->
      # A fully-qualified name may carry a trailing dot ("localhost."); strip it so
      # the anchored patterns in the blacklist cannot be bypassed that way.
      normalized = String.trim_trailing(host, ".")
      normalized == "" or Regex.match?(@hostname_blacklist, normalized)

    _missing_host ->
      # URI.parse/1 yields no host for relative or malformed input. There is
      # nothing safe to fetch, and Regex.match?/2 would raise on nil.
      true
  end
end
|
|
|
|
|
|
2020-06-13 20:59:58 -04:00
|
|
|
defmodule Client do
  @moduledoc """
  Tesla-based HTTP client used to look up the title of a linked page.
  """

  use Tesla

  alias Omnibot.Contrib.Linkbot

  plug Tesla.Middleware.Headers, [{"user-agent", "Tesla/Omnibot"}]

  # NOTE(review): redirects are followed without re-checking the target host
  # against Linkbot.blacklisted?/1, so a public URL may still redirect to a
  # blacklisted one. Closing that gap needs a custom middleware.
  plug Tesla.Middleware.FollowRedirects, max_redirects: 10
  plug Tesla.Middleware.Compression, format: "gzip"

  # TODO instead of checking for <title> exclusively, do this:
  # 1. check for "meta" tag (in the header) with a "property" attribute of "og:title", and fetch the "content" attribute of that tag
  # 2. check for meta tag with attribute "name" == "title", and fetch "content" attribute
  # 3. Fall back to the <title>

  # `s` lets the title span several lines, `[^>]*` tolerates attributes on the
  # opening tag, and the lazy `.*?` stops at the first closing tag.
  @title_regex ~r"<title[^>]*>(?<title>.*?)</title>"is

  @doc """
  Fetches `url` and returns the text of its `<title>` element.

  Returns `nil` when the URL is skipped (blacklisted host or a content type that
  is neither HTML nor text), when a request fails, or when the response has no
  non-empty title. Runs of whitespace in the title, including line breaks, are
  collapsed to single spaces so the result is always a single line.
  """
  @spec get_title(String.t()) :: String.t() | nil
  def get_title(url) do
    if should_get?(url) do
      Logger.info("Fetching #{url}")

      case get(url) do
        {:ok, %Tesla.Env{body: body}} when is_binary(body) ->
          extract_title(body)

        {:ok, _env} ->
          nil

        {:error, reason} ->
          Logger.info("Could not fetch #{url}: #{inspect(reason)}")
          nil
      end
    else
      nil
    end
  end

  # Pulls the first <title> out of an HTML body; nil when absent or blank.
  defp extract_title(body) do
    case Regex.named_captures(@title_regex, body) do
      %{"title" => raw} ->
        # String.split/1 drops leading/trailing whitespace and splits on any run
        # of whitespace, so joining with a space yields a one-line title.
        case raw |> String.split() |> Enum.join(" ") do
          "" -> nil
          title -> title
        end

      nil ->
        nil
    end
  end

  # Decides whether `url` is worth a GET: the host must not be blacklisted and a
  # HEAD request must report a content type containing "html" or "text".
  defp should_get?(url) do
    if Linkbot.blacklisted?(url) do
      false
    else
      case head(url) do
        {:ok, resp} ->
          # Tesla.get_header/2 returns nil when the header is absent.
          content_type = Tesla.get_header(resp, "content-type") || ""

          content_type
          |> String.downcase()
          |> String.contains?(["html", "text"])

        {:error, reason} ->
          Logger.info("HEAD request for #{url} failed: #{inspect(reason)}")
          false
      end
    end
  end
end
|
|
|
|
|
|
|
|
|
|
# Matches every http:// or https:// URL in a line, up to the next whitespace.
@url_regex ~r"\bhttps?://[^\s]+"

# Channel-message callback: looks up the page title of every URL found in `line`
# and sends each title back to `channel`. URLs for which Client.get_title/1
# returns nil are skipped rather than sent.
@impl true
def on_channel_msg(irc, channel, _nick, line) do
  Regex.scan(@url_regex, line)
  # Regex.scan/2 returns one list per match; flatten to a plain list of URLs.
  |> Enum.flat_map(& &1)
  |> Enum.map(&Client.get_title/1)
  # get_title/1 returns nil for URLs it declines to fetch; never send that to IRC.
  |> Enum.reject(&is_nil/1)
  |> Enum.each(fn title -> Irc.send_to(irc, channel, title) end)
end
|
2020-07-02 16:23:38 -07:00
|
|
|
|
2020-06-13 20:59:58 -04:00
|
|
|
end
|