From 2134e7b1525aa024a66332f2b3343c4634658a95 Mon Sep 17 00:00:00 2001 From: Thomas Citharel Date: Tue, 18 Jan 2022 12:52:45 +0100 Subject: [PATCH] Improve rich media parsers Signed-off-by: Thomas Citharel --- lib/service/rich_media/parser.ex | 6 ++++++ lib/service/rich_media/parsers/fallback.ex | 2 +- lib/service/rich_media/parsers/meta_tags_parser.ex | 9 ++++++--- lib/service/rich_media/parsers/oembed_parser.ex | 2 +- lib/service/rich_media/parsers/ogp.ex | 1 + lib/service/rich_media/parsers/twitter_card.ex | 1 + 6 files changed, 16 insertions(+), 5 deletions(-) diff --git a/lib/service/rich_media/parser.ex b/lib/service/rich_media/parser.ex index 19196527..1873ca83 100644 --- a/lib/service/rich_media/parser.ex +++ b/lib/service/rich_media/parser.ex @@ -87,6 +87,10 @@ defmodule Mobilizon.Service.RichMedia.Parser do {:ok, data} + {:ok, err} -> + Logger.debug("HTTP error: #{inspect(err)}") + {:error, "HTTP error: #{inspect(err)}"} + {:error, err} -> Logger.debug("HTTP error: #{inspect(err)}") {:error, "HTTP error: #{inspect(err)}"} @@ -196,6 +200,8 @@ defmodule Mobilizon.Service.RichMedia.Parser do @spec maybe_parse(String.t()) :: map() defp maybe_parse(html) do Enum.reduce_while(parsers(), %{}, fn parser, acc -> + Logger.debug("Using #{inspect(parser)} to parse link") + case parser.parse(html, acc) do {:ok, data} -> {:halt, data} diff --git a/lib/service/rich_media/parsers/fallback.ex b/lib/service/rich_media/parsers/fallback.ex index 252d1620..2463bc08 100644 --- a/lib/service/rich_media/parsers/fallback.ex +++ b/lib/service/rich_media/parsers/fallback.ex @@ -35,7 +35,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.Fallback do defp get_page(html, :title) do html |> Floki.parse_document!() - |> Floki.find("html title") + |> Floki.find("title") |> List.first() |> Floki.text() |> String.trim() diff --git a/lib/service/rich_media/parsers/meta_tags_parser.ex b/lib/service/rich_media/parsers/meta_tags_parser.ex index e552ddd9..6bff050d 100644 --- a/lib/service/rich_media/parsers/meta_tags_parser.ex +++ b/lib/service/rich_media/parsers/meta_tags_parser.ex @@ -53,7 +53,10 @@ defmodule Mobilizon.Service.RichMedia.Parsers.MetaTagsParser do end) if data[to_string(key_name)] in Enum.map(allowed_attributes, &to_string/1) do - %{String.to_existing_atom(data[to_string(key_name)]) => data[to_string(value_name)]} + %{ + String.to_existing_atom(data[to_string(key_name)]) => + String.trim(data[to_string(value_name)]) + } else %{} end @@ -65,7 +68,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.MetaTagsParser do defp maybe_put_title(meta, html) when meta != %{} do case get_page_title(html) do "" -> meta - title -> Map.put_new(meta, :title, title) + title -> Map.put_new(meta, :title, String.trim(title)) end end @@ -80,7 +83,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.MetaTagsParser do meta description -> - Map.put_new(meta, :description, description) + Map.put_new(meta, :description, String.trim(description)) end end diff --git a/lib/service/rich_media/parsers/oembed_parser.ex b/lib/service/rich_media/parsers/oembed_parser.ex index bebfdec5..bcddd220 100644 --- a/lib/service/rich_media/parsers/oembed_parser.ex +++ b/lib/service/rich_media/parsers/oembed_parser.ex @@ -67,7 +67,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.OEmbed do {:ok, data} <- Jason.decode(json), data <- data - |> Map.new(fn {k, v} -> {String.to_existing_atom(k), v} end) + |> Map.new(fn {k, v} -> {String.to_existing_atom(k), String.trim(v)} end) |> Map.take(@oembed_allowed_attributes) do {:ok, data} end diff --git a/lib/service/rich_media/parsers/ogp.ex b/lib/service/rich_media/parsers/ogp.ex index 07c89619..06923f4c 100644 --- a/lib/service/rich_media/parsers/ogp.ex +++ b/lib/service/rich_media/parsers/ogp.ex @@ -54,6 +54,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.OGP do defp transform_tags(data) do data |> Enum.reject(fn {_, v} -> is_nil(v) end) + |> Enum.map(fn {k, v} -> {k, String.trim(v)} end) |> Map.new() |> Map.update(:image_remote_url, Map.get(data, :image), & &1) |> Map.update(:width, get_integer_value(data, :"image:width"), & &1) diff --git a/lib/service/rich_media/parsers/twitter_card.ex b/lib/service/rich_media/parsers/twitter_card.ex index 70740d16..8277db5a 100644 --- a/lib/service/rich_media/parsers/twitter_card.ex +++ b/lib/service/rich_media/parsers/twitter_card.ex @@ -63,6 +63,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.TwitterCard do defp transform_tags(data) do data |> Enum.reject(fn {_, v} -> is_nil(v) end) + |> Enum.map(fn {k, v} -> {k, String.trim(v)} end) |> Map.new() |> Map.update(:image_remote_url, Map.get(data, :image), & &1) end