Make linkbot title parsing more robust
Linkbot titles are now taken from the page's "meta" tags, using the "content" of a tag whose "property" is "og:title" or whose "name" is "title". This is usually a more accurate/correct title than the text of the "title" tag, which is now used only as a last resort. Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
@@ -13,6 +13,7 @@ defmodule Omnibot.Contrib.Linkbot do
|
|||||||
defmodule Client do
|
defmodule Client do
|
||||||
use Tesla
|
use Tesla
|
||||||
alias Omnibot.Contrib.Linkbot
|
alias Omnibot.Contrib.Linkbot
|
||||||
|
import Meeseeks.CSS
|
||||||
|
|
||||||
plug Tesla.Middleware.Headers, [{"user-agent", "Tesla/Omnibot"}]
|
plug Tesla.Middleware.Headers, [{"user-agent", "Tesla/Omnibot"}]
|
||||||
plug Tesla.Middleware.FollowRedirects, max_redirects: 10
|
plug Tesla.Middleware.FollowRedirects, max_redirects: 10
|
||||||
@@ -26,11 +27,23 @@ defmodule Omnibot.Contrib.Linkbot do
|
|||||||
@title_regex ~r"<title>(?<title>.+)</title>"i
|
@title_regex ~r"<title>(?<title>.+)</title>"i
|
||||||
|
|
||||||
def get_title(url) do
|
def get_title(url) do
|
||||||
|
html = get_url(url)
|
||||||
|
document = Meeseeks.parse(html)
|
||||||
|
[title | _] = (Meeseeks.all(document, css("meta")) ++ [Meeseeks.one(document, css("title"))])
|
||||||
|
|> Enum.map(&(
|
||||||
|
Meeseeks.attr(&1, "property") == "og:title" && Meeseeks.attr(&1, "content")
|
||||||
|
|| Meeseeks.attr(&1, "name") == "title" && Meeseeks.attr(&1, "content")
|
||||||
|
|| Meeseeks.tag(&1) == "title" && Meeseeks.text(&1)
|
||||||
|
))
|
||||||
|
|> Enum.filter(& &1)
|
||||||
|
|
||||||
|
title
|
||||||
|
end
|
||||||
|
|
||||||
|
defp get_url(url) do
|
||||||
if should_get?(url) do
|
if should_get?(url) do
|
||||||
Logger.info("Fetching #{url}")
|
Logger.info("Fetching #{url}")
|
||||||
resp = get!(url)
|
get!(url).body
|
||||||
%{"title" => title} = Regex.named_captures(@title_regex, resp.body)
|
|
||||||
title
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -53,6 +66,7 @@ defmodule Omnibot.Contrib.Linkbot do
|
|||||||
Regex.scan(@url_regex, line)
|
Regex.scan(@url_regex, line)
|
||||||
|> Enum.flat_map(& &1)
|
|> Enum.flat_map(& &1)
|
||||||
|> Enum.map(fn url -> Client.get_title(url) end)
|
|> Enum.map(fn url -> Client.get_title(url) end)
|
||||||
|
|> Enum.filter(& &1)
|
||||||
|> Enum.each(fn title -> Irc.send_to(irc, channel, title) end)
|
|> Enum.each(fn title -> Irc.send_to(irc, channel, title) end)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
1
mix.exs
1
mix.exs
@@ -32,6 +32,7 @@ defmodule Omnibot.MixProject do
|
|||||||
# TODO : figure out how to make contrib modules optional (umbrella project?) and enable specific requirements
|
# TODO : figure out how to make contrib modules optional (umbrella project?) and enable specific requirements
|
||||||
[
|
[
|
||||||
{:tesla, "~> 1.3.0"}, # Used by Omnibot.Contrib.Linkbot
|
{:tesla, "~> 1.3.0"}, # Used by Omnibot.Contrib.Linkbot
|
||||||
|
{:meeseeks, "~> 0.15.1"}, # Used by Omnibot.Contrib.Linkbot
|
||||||
{:sqlitex, "~> 1.7"}, # Used by Omnibot.Contrib.Wordbot
|
{:sqlitex, "~> 1.7"}, # Used by Omnibot.Contrib.Wordbot
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|||||||
4
mix.lock
4
mix.lock
@@ -4,13 +4,17 @@
|
|||||||
"esqlite": {:hex, :esqlite, "0.4.1", "ba5d0bab6b9c8432ffe1bf12fee8e154a50f1c3c40eadc3a9c870c23ca94d961", [:rebar3], [], "hexpm"},
|
"esqlite": {:hex, :esqlite, "0.4.1", "ba5d0bab6b9c8432ffe1bf12fee8e154a50f1c3c40eadc3a9c870c23ca94d961", [:rebar3], [], "hexpm"},
|
||||||
"hackney": {:hex, :hackney, "1.16.0", "5096ac8e823e3a441477b2d187e30dd3fff1a82991a806b2003845ce72ce2d84", [:rebar3], [{:certifi, "2.5.2", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.1", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.0", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.6", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
|
"hackney": {:hex, :hackney, "1.16.0", "5096ac8e823e3a441477b2d187e30dd3fff1a82991a806b2003845ce72ce2d84", [:rebar3], [{:certifi, "2.5.2", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.1", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.0", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.6", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
"idna": {:hex, :idna, "6.0.1", "1d038fb2e7668ce41fbf681d2c45902e52b3cb9e9c77b55334353b222c2ee50c", [:rebar3], [{:unicode_util_compat, "0.5.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
|
"idna": {:hex, :idna, "6.0.1", "1d038fb2e7668ce41fbf681d2c45902e52b3cb9e9c77b55334353b222c2ee50c", [:rebar3], [{:unicode_util_compat, "0.5.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
|
"meeseeks": {:hex, :meeseeks, "0.15.1", "148d5d9ea879cdb415b8bc4162ac5528f9a2fe42fbfe1802c681a2842cb1c0a4", [:mix], [{:meeseeks_html5ever, "~> 0.12.1", [hex: :meeseeks_html5ever, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
|
"meeseeks_html5ever": {:hex, :meeseeks_html5ever, "0.12.1", "718fab10d05b83204524a518b2b88caa37ba6a6e02f82e80d6a7bc47552fb54a", [:mix], [{:rustler, "~> 0.21.0", [hex: :rustler, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
|
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"},
|
||||||
"mime": {:hex, :mime, "1.3.1", "30ce04ab3175b6ad0bdce0035cba77bba68b813d523d1aac73d9781b4d193cf8", [:mix], [], "hexpm"},
|
"mime": {:hex, :mime, "1.3.1", "30ce04ab3175b6ad0bdce0035cba77bba68b813d523d1aac73d9781b4d193cf8", [:mix], [], "hexpm"},
|
||||||
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm"},
|
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm"},
|
||||||
"parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
|
"parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
|
||||||
|
"rustler": {:hex, :rustler, "0.21.1", "5299980be32da997c54382e945bacaa015ed97a60745e1e639beaf6a7b278c65", [:mix], [{:toml, "~> 0.5.2", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
"sqlitex": {:hex, :sqlitex, "1.7.1", "022d477aab2ae999c43ae6fbd1782ff1457e0e95c251c7b5fa6f7b7b102040ff", [:mix], [{:decimal, "~> 1.7", [hex: :decimal, repo: "hexpm", optional: false]}, {:esqlite, "~> 0.4", [hex: :esqlite, repo: "hexpm", optional: false]}], "hexpm"},
|
"sqlitex": {:hex, :sqlitex, "1.7.1", "022d477aab2ae999c43ae6fbd1782ff1457e0e95c251c7b5fa6f7b7b102040ff", [:mix], [{:decimal, "~> 1.7", [hex: :decimal, repo: "hexpm", optional: false]}, {:esqlite, "~> 0.4", [hex: :esqlite, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm"},
|
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm"},
|
||||||
"tesla": {:hex, :tesla, "1.3.3", "26ae98627af5c406584aa6755ab5fc96315d70d69a24dd7f8369cfcb75094a45", [:mix], [{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: true]}, {:exjsx, ">= 3.0.0", [hex: :exjsx, repo: "hexpm", optional: true]}, {:fuse, "~> 2.4", [hex: :fuse, repo: "hexpm", optional: true]}, {:gun, "~> 1.3", [hex: :gun, repo: "hexpm", optional: true]}, {:hackney, "~> 1.6", [hex: :hackney, repo: "hexpm", optional: true]}, {:ibrowse, "~> 4.4.0", [hex: :ibrowse, repo: "hexpm", optional: true]}, {:jason, ">= 1.0.0", [hex: :jason, repo: "hexpm", optional: true]}, {:mime, "~> 1.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.0", [hex: :mint, repo: "hexpm", optional: true]}, {:poison, ">= 1.0.0", [hex: :poison, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: true]}], "hexpm"},
|
"tesla": {:hex, :tesla, "1.3.3", "26ae98627af5c406584aa6755ab5fc96315d70d69a24dd7f8369cfcb75094a45", [:mix], [{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: true]}, {:exjsx, ">= 3.0.0", [hex: :exjsx, repo: "hexpm", optional: true]}, {:fuse, "~> 2.4", [hex: :fuse, repo: "hexpm", optional: true]}, {:gun, "~> 1.3", [hex: :gun, repo: "hexpm", optional: true]}, {:hackney, "~> 1.6", [hex: :hackney, repo: "hexpm", optional: true]}, {:ibrowse, "~> 4.4.0", [hex: :ibrowse, repo: "hexpm", optional: true]}, {:jason, ">= 1.0.0", [hex: :jason, repo: "hexpm", optional: true]}, {:mime, "~> 1.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.0", [hex: :mint, repo: "hexpm", optional: true]}, {:poison, ">= 1.0.0", [hex: :poison, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: true]}], "hexpm"},
|
||||||
|
"toml": {:hex, :toml, "0.5.2", "e471388a8726d1ce51a6b32f864b8228a1eb8edc907a0edf2bb50eab9321b526", [:mix], [], "hexpm"},
|
||||||
"tzdata": {:hex, :tzdata, "1.0.3", "73470ad29dde46e350c60a66e6b360d3b99d2d18b74c4c349dbebbc27a09a3eb", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
|
"tzdata": {:hex, :tzdata, "1.0.3", "73470ad29dde46e350c60a66e6b360d3b99d2d18b74c4c349dbebbc27a09a3eb", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},
|
||||||
"unicode_util_compat": {:hex, :unicode_util_compat, "0.5.0", "8516502659002cec19e244ebd90d312183064be95025a319a6c7e89f4bccd65b", [:rebar3], [], "hexpm"},
|
"unicode_util_compat": {:hex, :unicode_util_compat, "0.5.0", "8516502659002cec19e244ebd90d312183064be95025a319a6c7e89f4bccd65b", [:rebar3], [], "hexpm"},
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user