From 7fca37741bfdd050e7570caace764d7c99e87afe Mon Sep 17 00:00:00 2001 From: "byte[]" Date: Wed, 20 May 2020 14:18:13 -0400 Subject: [PATCH] replace use of hackney in scraper with mint --- config/dev.exs | 2 +- lib/philomena/http.ex | 45 +++++++++++-------- lib/philomena/scrapers/deviantart.ex | 42 ++++++++--------- lib/philomena/scrapers/raw.ex | 4 +- lib/philomena/scrapers/tumblr.ex | 4 +- lib/philomena/scrapers/twitter.ex | 2 +- .../servers/picarto_channel_updater.ex | 2 +- .../servers/piczel_channel_updater.ex | 2 +- lib/philomena/servers/user_link_updater.ex | 2 +- .../plugs/compromised_password_check_plug.ex | 4 +- lib/philomena_web/plugs/scraper_plug.ex | 4 +- 11 files changed, 58 insertions(+), 55 deletions(-) diff --git a/config/dev.exs b/config/dev.exs index 3c21fbda..01eb67de 100644 --- a/config/dev.exs +++ b/config/dev.exs @@ -65,7 +65,7 @@ config :philomena, PhilomenaWeb.Endpoint, # Do not include metadata nor timestamps in development logs config :logger, :console, format: "[$level] $message\n" -config :logger, compile_time_purge_matching: [[application: :remote_ip]] +config :logger, compile_time_purge_matching: [[application: :remote_ip], [application: :mint]] # Set up mailer config :philomena, PhilomenaWeb.Mailer, adapter: Bamboo.LocalAdapter diff --git a/lib/philomena/http.ex b/lib/philomena/http.ex index 5dc9c5f1..81930b70 100644 --- a/lib/philomena/http.ex +++ b/lib/philomena/http.ex @@ -1,31 +1,38 @@ defmodule Philomena.Http do - @user_agent [ - "User-Agent": - "Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0" - ] - def get!(url, headers \\ [], options \\ []) do - headers = Keyword.merge(@user_agent, headers) |> add_host(url) - options = Keyword.merge(options, proxy: proxy_host(), ssl: [insecure: true]) - - HTTPoison.get!(url, headers, options) + Tesla.get!(client(headers), url, opts: [adapter: adapter_opts(options)]) end def head!(url, headers \\ [], options \\ []) do - headers = Keyword.merge(@user_agent, headers) |> add_host(url) - options = Keyword.merge(options, proxy: proxy_host(), ssl: [insecure: true]) - - HTTPoison.head!(url, headers, options) + Tesla.head!(client(headers), url, opts: [adapter: adapter_opts(options)]) end - # Add host for caching proxies, since hackney doesn't do it for us - defp add_host(headers, url) do - %{host: host} = URI.parse(url) + defp adapter_opts(opts) do + opts = Keyword.merge(opts, max_body: 30_000_000) - Keyword.merge([Host: host, Connection: "close"], headers) + case Application.get_env(:philomena, :proxy_host) do + nil -> + opts + + url -> + Keyword.merge(opts, proxy: proxy_opts(URI.parse(url))) + end end - defp proxy_host do - Application.get_env(:philomena, :proxy_host) + defp proxy_opts(%{host: host, port: port, scheme: "https"}), do: {:https, host, port, []} + defp proxy_opts(%{host: host, port: port, scheme: "http"}), do: {:http, host, port, []} + + defp client(headers) do + Tesla.client( + [ + {Tesla.Middleware.Headers, + [ + {"User-Agent", + "Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/76.0"} + | headers + ]} + ], + Tesla.Adapter.Mint + ) end end diff --git a/lib/philomena/scrapers/deviantart.ex b/lib/philomena/scrapers/deviantart.ex index 70b08128..f83d132f 100644 --- a/lib/philomena/scrapers/deviantart.ex +++ b/lib/philomena/scrapers/deviantart.ex @@ -30,7 +30,7 @@ defmodule Philomena.Scrapers.Deviantart do |> try_old_hires!() end - defp extract_data!(%HTTPoison.Response{body: body, status_code: 200}) do + defp extract_data!(%Tesla.Env{body: body, status: 200}) do [image] = Regex.run(@image_regex, body, capture: :all_but_first) [source] = Regex.run(@source_regex, body, capture: :all_but_first) [artist] = Regex.run(@artist_regex, source, capture: :all_but_first) @@ -48,25 +48,21 @@ defmodule Philomena.Scrapers.Deviantart do end defp try_intermediary_hires!(%{images: [image]} = data) do - [domain, object_uuid, object_name] = - Regex.run(@cdnint_regex, image.url, capture: :all_but_first) - - built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}" - - case Philomena.Http.head!(built_url) do - %HTTPoison.Response{status_code: 200} -> - # This is the high resolution URL. - - %{ - data - | images: [ - %{ - url: built_url, - camo_url: image.camo_url - } - ] - } - + with [domain, object_uuid, object_name] <- + Regex.run(@cdnint_regex, image.url, capture: :all_but_first), + built_url <- "#{domain}/intermediary/f/#{object_uuid}/#{object_name}", + %Tesla.Env{status: 200} <- Philomena.Http.head!(built_url) do + # This is the high resolution URL. + %{ + data + | images: [ + %{ + url: built_url, + camo_url: image.camo_url + } + ] + } + else _ -> # Nothing to be found here, move along... data @@ -115,9 +111,9 @@ defmodule Philomena.Scrapers.Deviantart do built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png" case Philomena.Http.get!(built_url) do - %HTTPoison.Response{status_code: 301, headers: headers} -> + %Tesla.Env{status: 301, headers: headers} -> # Location header provides URL of high res image. - {_location, link} = Enum.find(headers, fn {header, _val} -> header == "Location" end) + {_location, link} = Enum.find(headers, fn {header, _val} -> header == "location" end) %{ data @@ -140,7 +136,7 @@ defmodule Philomena.Scrapers.Deviantart do defp follow_redirect(url, max_times) do case Philomena.Http.get!(url) do - %HTTPoison.Response{headers: headers, status_code: code} when code in [301, 302] -> + %Tesla.Env{headers: headers, status: code} when code in [301, 302] -> location = Enum.find_value(headers, &location_header/1) follow_redirect(location, max_times - 1) diff --git a/lib/philomena/scrapers/raw.ex b/lib/philomena/scrapers/raw.ex index 6a478d08..01e4506e 100644 --- a/lib/philomena/scrapers/raw.ex +++ b/lib/philomena/scrapers/raw.ex @@ -3,9 +3,9 @@ defmodule Philomena.Scrapers.Raw do @spec can_handle?(URI.t(), String.t()) :: true | false def can_handle?(_uri, url) do - Philomena.Http.head!(url, [], max_body_length: 30_000_000) + Philomena.Http.head!(url) |> case do - %HTTPoison.Response{status_code: 200, headers: headers} -> + %Tesla.Env{status: 200, headers: headers} -> headers |> Enum.any?(fn {k, v} -> String.downcase(k) == "content-type" and String.downcase(v) in @mime_types diff --git a/lib/philomena/scrapers/tumblr.ex b/lib/philomena/scrapers/tumblr.ex index bcc88cc1..4e7a4eae 100644 --- a/lib/philomena/scrapers/tumblr.ex +++ b/lib/philomena/scrapers/tumblr.ex @@ -31,7 +31,7 @@ defmodule Philomena.Scrapers.Tumblr do |> process_response!() end - defp json!(%HTTPoison.Response{body: body, status_code: 200}), + defp json!(%Tesla.Env{body: body, status: 200}), do: Jason.decode!(body) defp process_response!(%{"response" => %{"posts" => [post | _rest]}}), @@ -70,7 +70,7 @@ defmodule Philomena.Scrapers.Tumblr do end defp url_ok?(url) do - match?(%HTTPoison.Response{status_code: 200}, Philomena.Http.head!(url)) + match?(%Tesla.Env{status: 200}, Philomena.Http.head!(url)) end defp add_meta(post, images) do diff --git a/lib/philomena/scrapers/twitter.ex b/lib/philomena/scrapers/twitter.ex index 967b3682..ff048c52 100644 --- a/lib/philomena/scrapers/twitter.ex +++ b/lib/philomena/scrapers/twitter.ex @@ -50,7 +50,7 @@ defmodule Philomena.Scrapers.Twitter do |> Map.get(:body) |> extract_guest_token_and_bearer() - Philomena.Http.get!(api_url, Authorization: "Bearer #{bearer}", "x-guest-token": gt) + Philomena.Http.get!(api_url, [{"Authorization", "Bearer #{bearer}"}, {"x-guest-token", gt}]) |> Map.get(:body) |> Jason.decode!() |> Map.get("globalObjects") diff --git a/lib/philomena/servers/picarto_channel_updater.ex b/lib/philomena/servers/picarto_channel_updater.ex index 392797e8..a6edf84a 100644 --- a/lib/philomena/servers/picarto_channel_updater.ex +++ b/lib/philomena/servers/picarto_channel_updater.ex @@ -28,7 +28,7 @@ defmodule Philomena.Servers.PicartoChannelUpdater do run() end - defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, now) do + defp handle_response(%Tesla.Env{body: body, status: 200}, now) do resp = body |> Jason.decode!() diff --git a/lib/philomena/servers/piczel_channel_updater.ex b/lib/philomena/servers/piczel_channel_updater.ex index 0a1717b7..78a57624 100644 --- a/lib/philomena/servers/piczel_channel_updater.ex +++ b/lib/philomena/servers/piczel_channel_updater.ex @@ -28,7 +28,7 @@ defmodule Philomena.Servers.PiczelChannelUpdater do run() end - defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, now) do + defp handle_response(%Tesla.Env{body: body, status: 200}, now) do resp = body |> Jason.decode!() diff --git a/lib/philomena/servers/user_link_updater.ex b/lib/philomena/servers/user_link_updater.ex index 8c76451f..d2ab12f1 100644 --- a/lib/philomena/servers/user_link_updater.ex +++ b/lib/philomena/servers/user_link_updater.ex @@ -63,7 +63,7 @@ defmodule Philomena.Servers.UserLinkUpdater do |> handle_response(user_link) end - defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, user_link) do + defp handle_response(%Tesla.Env{body: body, status: 200}, user_link) do case :binary.match(body, user_link.verification_code) do :nomatch -> nil diff --git a/lib/philomena_web/plugs/compromised_password_check_plug.ex b/lib/philomena_web/plugs/compromised_password_check_plug.ex index e462dfee..5cfb4964 100644 --- a/lib/philomena_web/plugs/compromised_password_check_plug.ex +++ b/lib/philomena_web/plugs/compromised_password_check_plug.ex @@ -32,8 +32,8 @@ defmodule PhilomenaWeb.CompromisedPasswordCheckPlug do :crypto.hash(:sha, password) |> Base.encode16() - case HTTPoison.get(make_api_url(prefix)) do - {:ok, %HTTPoison.Response{body: body, status_code: 200}} -> String.contains?(body, rest) + case Philomena.Http.get!(make_api_url(prefix)) do + %Tesla.Env{body: body, status: 200} -> String.contains?(body, rest) _ -> false end end diff --git a/lib/philomena_web/plugs/scraper_plug.ex b/lib/philomena_web/plugs/scraper_plug.ex index f9454e39..987d5ccc 100644 --- a/lib/philomena_web/plugs/scraper_plug.ex +++ b/lib/philomena_web/plugs/scraper_plug.ex @@ -10,7 +10,7 @@ defmodule PhilomenaWeb.ScraperPlug do conn %{"scraper_cache" => url} when not is_nil(url) -> - Philomena.Http.get!(url, [], max_body_length: 30_000_000) + Philomena.Http.get!(url) |> maybe_fixup_params(opts, conn) _ -> @@ -18,7 +18,7 @@ defmodule PhilomenaWeb.ScraperPlug do end end - defp maybe_fixup_params(%HTTPoison.Response{body: body, status_code: 200}, opts, conn) do + defp maybe_fixup_params(%Tesla.Env{body: body, status: 200}, opts, conn) do params_name = Keyword.get(opts, :params_name, "image") params_key = Keyword.get(opts, :params_key, "image") file = Briefly.create!()