replace use of hackney in scraper with mint

This commit is contained in:
byte[] 2020-05-20 14:18:13 -04:00
parent 68102bdddd
commit 7fca37741b
11 changed files with 58 additions and 55 deletions

View file

@ -65,7 +65,7 @@ config :philomena, PhilomenaWeb.Endpoint,
# Do not include metadata nor timestamps in development logs # Do not include metadata nor timestamps in development logs
config :logger, :console, format: "[$level] $message\n" config :logger, :console, format: "[$level] $message\n"
config :logger, compile_time_purge_matching: [[application: :remote_ip]] config :logger, compile_time_purge_matching: [[application: :remote_ip], [application: :mint]]
# Set up mailer # Set up mailer
config :philomena, PhilomenaWeb.Mailer, adapter: Bamboo.LocalAdapter config :philomena, PhilomenaWeb.Mailer, adapter: Bamboo.LocalAdapter

View file

@ -1,31 +1,38 @@
defmodule Philomena.Http do defmodule Philomena.Http do
@user_agent [
"User-Agent":
"Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
]
def get!(url, headers \\ [], options \\ []) do def get!(url, headers \\ [], options \\ []) do
headers = Keyword.merge(@user_agent, headers) |> add_host(url) Tesla.get!(client(headers), url, opts: [adapter: adapter_opts(options)])
options = Keyword.merge(options, proxy: proxy_host(), ssl: [insecure: true])
HTTPoison.get!(url, headers, options)
end end
def head!(url, headers \\ [], options \\ []) do def head!(url, headers \\ [], options \\ []) do
headers = Keyword.merge(@user_agent, headers) |> add_host(url) Tesla.head!(client(headers), url, opts: [adapter: adapter_opts(options)])
options = Keyword.merge(options, proxy: proxy_host(), ssl: [insecure: true])
HTTPoison.head!(url, headers, options)
end end
# Add host for caching proxies, since hackney doesn't do it for us defp adapter_opts(opts) do
defp add_host(headers, url) do opts = Keyword.merge(opts, max_body: 30_000_000)
%{host: host} = URI.parse(url)
Keyword.merge([Host: host, Connection: "close"], headers) case Application.get_env(:philomena, :proxy_host) do
nil ->
opts
url ->
Keyword.merge(opts, proxy: proxy_opts(URI.parse(url)))
end
end end
defp proxy_host do defp proxy_opts(%{host: host, port: port, scheme: "https"}), do: {:https, host, port, []}
Application.get_env(:philomena, :proxy_host) defp proxy_opts(%{host: host, port: port, scheme: "http"}), do: {:http, host, port, []}
defp client(headers) do
Tesla.client(
[
{Tesla.Middleware.Headers,
[
{"User-Agent",
"Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/76.0"}
| headers
]}
],
Tesla.Adapter.Mint
)
end end
end end

View file

@ -30,7 +30,7 @@ defmodule Philomena.Scrapers.Deviantart do
|> try_old_hires!() |> try_old_hires!()
end end
defp extract_data!(%HTTPoison.Response{body: body, status_code: 200}) do defp extract_data!(%Tesla.Env{body: body, status: 200}) do
[image] = Regex.run(@image_regex, body, capture: :all_but_first) [image] = Regex.run(@image_regex, body, capture: :all_but_first)
[source] = Regex.run(@source_regex, body, capture: :all_but_first) [source] = Regex.run(@source_regex, body, capture: :all_but_first)
[artist] = Regex.run(@artist_regex, source, capture: :all_but_first) [artist] = Regex.run(@artist_regex, source, capture: :all_but_first)
@ -48,15 +48,11 @@ defmodule Philomena.Scrapers.Deviantart do
end end
defp try_intermediary_hires!(%{images: [image]} = data) do defp try_intermediary_hires!(%{images: [image]} = data) do
[domain, object_uuid, object_name] = with [domain, object_uuid, object_name] <-
Regex.run(@cdnint_regex, image.url, capture: :all_but_first) Regex.run(@cdnint_regex, image.url, capture: :all_but_first),
built_url <- "#{domain}/intermediary/f/#{object_uuid}/#{object_name}",
built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}" %Tesla.Env{status: 200} <- Philomena.Http.head!(built_url) do
case Philomena.Http.head!(built_url) do
%HTTPoison.Response{status_code: 200} ->
# This is the high resolution URL. # This is the high resolution URL.
%{ %{
data data
| images: [ | images: [
@ -66,7 +62,7 @@ defmodule Philomena.Scrapers.Deviantart do
} }
] ]
} }
else
_ -> _ ->
# Nothing to be found here, move along... # Nothing to be found here, move along...
data data
@ -115,9 +111,9 @@ defmodule Philomena.Scrapers.Deviantart do
built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png" built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"
case Philomena.Http.get!(built_url) do case Philomena.Http.get!(built_url) do
%HTTPoison.Response{status_code: 301, headers: headers} -> %Tesla.Env{status: 301, headers: headers} ->
# Location header provides URL of high res image. # Location header provides URL of high res image.
{_location, link} = Enum.find(headers, fn {header, _val} -> header == "Location" end) {_location, link} = Enum.find(headers, fn {header, _val} -> header == "location" end)
%{ %{
data data
@ -140,7 +136,7 @@ defmodule Philomena.Scrapers.Deviantart do
defp follow_redirect(url, max_times) do defp follow_redirect(url, max_times) do
case Philomena.Http.get!(url) do case Philomena.Http.get!(url) do
%HTTPoison.Response{headers: headers, status_code: code} when code in [301, 302] -> %Tesla.Env{headers: headers, status: code} when code in [301, 302] ->
location = Enum.find_value(headers, &location_header/1) location = Enum.find_value(headers, &location_header/1)
follow_redirect(location, max_times - 1) follow_redirect(location, max_times - 1)

View file

@ -3,9 +3,9 @@ defmodule Philomena.Scrapers.Raw do
@spec can_handle?(URI.t(), String.t()) :: true | false @spec can_handle?(URI.t(), String.t()) :: true | false
def can_handle?(_uri, url) do def can_handle?(_uri, url) do
Philomena.Http.head!(url, [], max_body_length: 30_000_000) Philomena.Http.head!(url)
|> case do |> case do
%HTTPoison.Response{status_code: 200, headers: headers} -> %Tesla.Env{status: 200, headers: headers} ->
headers headers
|> Enum.any?(fn {k, v} -> |> Enum.any?(fn {k, v} ->
String.downcase(k) == "content-type" and String.downcase(v) in @mime_types String.downcase(k) == "content-type" and String.downcase(v) in @mime_types

View file

@ -31,7 +31,7 @@ defmodule Philomena.Scrapers.Tumblr do
|> process_response!() |> process_response!()
end end
defp json!(%HTTPoison.Response{body: body, status_code: 200}), defp json!(%Tesla.Env{body: body, status: 200}),
do: Jason.decode!(body) do: Jason.decode!(body)
defp process_response!(%{"response" => %{"posts" => [post | _rest]}}), defp process_response!(%{"response" => %{"posts" => [post | _rest]}}),
@ -70,7 +70,7 @@ defmodule Philomena.Scrapers.Tumblr do
end end
defp url_ok?(url) do defp url_ok?(url) do
match?(%HTTPoison.Response{status_code: 200}, Philomena.Http.head!(url)) match?(%Tesla.Env{status: 200}, Philomena.Http.head!(url))
end end
defp add_meta(post, images) do defp add_meta(post, images) do

View file

@ -50,7 +50,7 @@ defmodule Philomena.Scrapers.Twitter do
|> Map.get(:body) |> Map.get(:body)
|> extract_guest_token_and_bearer() |> extract_guest_token_and_bearer()
Philomena.Http.get!(api_url, Authorization: "Bearer #{bearer}", "x-guest-token": gt) Philomena.Http.get!(api_url, [{"Authorization", "Bearer #{bearer}"}, {"x-guest-token", gt}])
|> Map.get(:body) |> Map.get(:body)
|> Jason.decode!() |> Jason.decode!()
|> Map.get("globalObjects") |> Map.get("globalObjects")

View file

@ -28,7 +28,7 @@ defmodule Philomena.Servers.PicartoChannelUpdater do
run() run()
end end
defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, now) do defp handle_response(%Tesla.Env{body: body, status: 200}, now) do
resp = resp =
body body
|> Jason.decode!() |> Jason.decode!()

View file

@ -28,7 +28,7 @@ defmodule Philomena.Servers.PiczelChannelUpdater do
run() run()
end end
defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, now) do defp handle_response(%Tesla.Env{body: body, status: 200}, now) do
resp = resp =
body body
|> Jason.decode!() |> Jason.decode!()

View file

@ -63,7 +63,7 @@ defmodule Philomena.Servers.UserLinkUpdater do
|> handle_response(user_link) |> handle_response(user_link)
end end
defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, user_link) do defp handle_response(%Tesla.Env{body: body, status: 200}, user_link) do
case :binary.match(body, user_link.verification_code) do case :binary.match(body, user_link.verification_code) do
:nomatch -> :nomatch ->
nil nil

View file

@ -32,8 +32,8 @@ defmodule PhilomenaWeb.CompromisedPasswordCheckPlug do
:crypto.hash(:sha, password) :crypto.hash(:sha, password)
|> Base.encode16() |> Base.encode16()
case HTTPoison.get(make_api_url(prefix)) do case Philomena.Http.get!(make_api_url(prefix)) do
{:ok, %HTTPoison.Response{body: body, status_code: 200}} -> String.contains?(body, rest) %Tesla.Env{body: body, status: 200} -> String.contains?(body, rest)
_ -> false _ -> false
end end
end end

View file

@ -10,7 +10,7 @@ defmodule PhilomenaWeb.ScraperPlug do
conn conn
%{"scraper_cache" => url} when not is_nil(url) -> %{"scraper_cache" => url} when not is_nil(url) ->
Philomena.Http.get!(url, [], max_body_length: 30_000_000) Philomena.Http.get!(url)
|> maybe_fixup_params(opts, conn) |> maybe_fixup_params(opts, conn)
_ -> _ ->
@ -18,7 +18,7 @@ defmodule PhilomenaWeb.ScraperPlug do
end end
end end
defp maybe_fixup_params(%HTTPoison.Response{body: body, status_code: 200}, opts, conn) do defp maybe_fixup_params(%Tesla.Env{body: body, status: 200}, opts, conn) do
params_name = Keyword.get(opts, :params_name, "image") params_name = Keyword.get(opts, :params_name, "image")
params_key = Keyword.get(opts, :params_key, "image") params_key = Keyword.get(opts, :params_key, "image")
file = Briefly.create!() file = Briefly.create!()