replace use of hackney in scraper with mint

This commit is contained in:
byte[] 2020-05-20 14:18:13 -04:00
parent 68102bdddd
commit 7fca37741b
11 changed files with 58 additions and 55 deletions

View file

@ -65,7 +65,7 @@ config :philomena, PhilomenaWeb.Endpoint,
# Do not include metadata nor timestamps in development logs
config :logger, :console, format: "[$level] $message\n"
config :logger, compile_time_purge_matching: [[application: :remote_ip]]
config :logger, compile_time_purge_matching: [[application: :remote_ip], [application: :mint]]
# Set up mailer
config :philomena, PhilomenaWeb.Mailer, adapter: Bamboo.LocalAdapter

View file

@ -1,31 +1,38 @@
defmodule Philomena.Http do
@user_agent [
"User-Agent":
"Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
]
def get!(url, headers \\ [], options \\ []) do
headers = Keyword.merge(@user_agent, headers) |> add_host(url)
options = Keyword.merge(options, proxy: proxy_host(), ssl: [insecure: true])
HTTPoison.get!(url, headers, options)
Tesla.get!(client(headers), url, opts: [adapter: adapter_opts(options)])
end
def head!(url, headers \\ [], options \\ []) do
headers = Keyword.merge(@user_agent, headers) |> add_host(url)
options = Keyword.merge(options, proxy: proxy_host(), ssl: [insecure: true])
HTTPoison.head!(url, headers, options)
Tesla.head!(client(headers), url, opts: [adapter: adapter_opts(options)])
end
# Add host for caching proxies, since hackney doesn't do it for us
defp add_host(headers, url) do
%{host: host} = URI.parse(url)
defp adapter_opts(opts) do
opts = Keyword.merge(opts, max_body: 30_000_000)
Keyword.merge([Host: host, Connection: "close"], headers)
case Application.get_env(:philomena, :proxy_host) do
nil ->
opts
url ->
Keyword.merge(opts, proxy: proxy_opts(URI.parse(url)))
end
end
defp proxy_host do
Application.get_env(:philomena, :proxy_host)
# Converts a parsed proxy URL into the `{scheme, host, port, opts}` tuple
# passed as the adapter's `:proxy` option. Only "http" and "https" proxy
# URLs with a host are accepted.
defp proxy_opts(%{host: host, port: port, scheme: "https"}) when is_binary(host),
  do: {:https, host, port, []}

defp proxy_opts(%{host: host, port: port, scheme: "http"}) when is_binary(host),
  do: {:http, host, port, []}

# A misconfigured :proxy_host (missing scheme, unsupported scheme, or no host)
# would otherwise surface as an opaque FunctionClauseError on the first request;
# raise an error that points at the configuration instead.
defp proxy_opts(%{scheme: scheme, host: host}) do
  raise ArgumentError,
        "invalid :proxy_host (scheme: #{inspect(scheme)}, host: #{inspect(host)}); " <>
          "expected an http:// or https:// URL"
end
# Builds a Tesla client backed by the Mint adapter. The client's header
# middleware sends a browser-like User-Agent first, followed by the
# caller-supplied `headers` (a list of `{name, value}` tuples).
defp client(headers) do
  user_agent =
    {"User-Agent",
     "Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/76.0"}

  middleware = [{Tesla.Middleware.Headers, [user_agent | headers]}]

  Tesla.client(middleware, Tesla.Adapter.Mint)
end
end

View file

@ -30,7 +30,7 @@ defmodule Philomena.Scrapers.Deviantart do
|> try_old_hires!()
end
defp extract_data!(%HTTPoison.Response{body: body, status_code: 200}) do
defp extract_data!(%Tesla.Env{body: body, status: 200}) do
[image] = Regex.run(@image_regex, body, capture: :all_but_first)
[source] = Regex.run(@source_regex, body, capture: :all_but_first)
[artist] = Regex.run(@artist_regex, source, capture: :all_but_first)
@ -48,15 +48,11 @@ defmodule Philomena.Scrapers.Deviantart do
end
defp try_intermediary_hires!(%{images: [image]} = data) do
[domain, object_uuid, object_name] =
Regex.run(@cdnint_regex, image.url, capture: :all_but_first)
built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}"
case Philomena.Http.head!(built_url) do
%HTTPoison.Response{status_code: 200} ->
with [domain, object_uuid, object_name] <-
Regex.run(@cdnint_regex, image.url, capture: :all_but_first),
built_url <- "#{domain}/intermediary/f/#{object_uuid}/#{object_name}",
%Tesla.Env{status: 200} <- Philomena.Http.head!(built_url) do
# This is the high resolution URL.
%{
data
| images: [
@ -66,7 +62,7 @@ defmodule Philomena.Scrapers.Deviantart do
}
]
}
else
_ ->
# Nothing to be found here, move along...
data
@ -115,9 +111,9 @@ defmodule Philomena.Scrapers.Deviantart do
built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"
case Philomena.Http.get!(built_url) do
%HTTPoison.Response{status_code: 301, headers: headers} ->
%Tesla.Env{status: 301, headers: headers} ->
# Location header provides URL of high res image.
{_location, link} = Enum.find(headers, fn {header, _val} -> header == "Location" end)
{_location, link} = Enum.find(headers, fn {header, _val} -> header == "location" end)
%{
data
@ -140,7 +136,7 @@ defmodule Philomena.Scrapers.Deviantart do
defp follow_redirect(url, max_times) do
case Philomena.Http.get!(url) do
%HTTPoison.Response{headers: headers, status_code: code} when code in [301, 302] ->
%Tesla.Env{headers: headers, status: code} when code in [301, 302] ->
location = Enum.find_value(headers, &location_header/1)
follow_redirect(location, max_times - 1)

View file

@ -3,9 +3,9 @@ defmodule Philomena.Scrapers.Raw do
@spec can_handle?(URI.t(), String.t()) :: true | false
def can_handle?(_uri, url) do
Philomena.Http.head!(url, [], max_body_length: 30_000_000)
Philomena.Http.head!(url)
|> case do
%HTTPoison.Response{status_code: 200, headers: headers} ->
%Tesla.Env{status: 200, headers: headers} ->
headers
|> Enum.any?(fn {k, v} ->
String.downcase(k) == "content-type" and String.downcase(v) in @mime_types

View file

@ -31,7 +31,7 @@ defmodule Philomena.Scrapers.Tumblr do
|> process_response!()
end
defp json!(%HTTPoison.Response{body: body, status_code: 200}),
defp json!(%Tesla.Env{body: body, status: 200}),
do: Jason.decode!(body)
defp process_response!(%{"response" => %{"posts" => [post | _rest]}}),
@ -70,7 +70,7 @@ defmodule Philomena.Scrapers.Tumblr do
end
defp url_ok?(url) do
match?(%HTTPoison.Response{status_code: 200}, Philomena.Http.head!(url))
match?(%Tesla.Env{status: 200}, Philomena.Http.head!(url))
end
defp add_meta(post, images) do

View file

@ -50,7 +50,7 @@ defmodule Philomena.Scrapers.Twitter do
|> Map.get(:body)
|> extract_guest_token_and_bearer()
Philomena.Http.get!(api_url, Authorization: "Bearer #{bearer}", "x-guest-token": gt)
Philomena.Http.get!(api_url, [{"Authorization", "Bearer #{bearer}"}, {"x-guest-token", gt}])
|> Map.get(:body)
|> Jason.decode!()
|> Map.get("globalObjects")

View file

@ -28,7 +28,7 @@ defmodule Philomena.Servers.PicartoChannelUpdater do
run()
end
defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, now) do
defp handle_response(%Tesla.Env{body: body, status: 200}, now) do
resp =
body
|> Jason.decode!()

View file

@ -28,7 +28,7 @@ defmodule Philomena.Servers.PiczelChannelUpdater do
run()
end
defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, now) do
defp handle_response(%Tesla.Env{body: body, status: 200}, now) do
resp =
body
|> Jason.decode!()

View file

@ -63,7 +63,7 @@ defmodule Philomena.Servers.UserLinkUpdater do
|> handle_response(user_link)
end
defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, user_link) do
defp handle_response(%Tesla.Env{body: body, status: 200}, user_link) do
case :binary.match(body, user_link.verification_code) do
:nomatch ->
nil

View file

@ -32,8 +32,8 @@ defmodule PhilomenaWeb.CompromisedPasswordCheckPlug do
:crypto.hash(:sha, password)
|> Base.encode16()
case HTTPoison.get(make_api_url(prefix)) do
{:ok, %HTTPoison.Response{body: body, status_code: 200}} -> String.contains?(body, rest)
case Philomena.Http.get!(make_api_url(prefix)) do
%Tesla.Env{body: body, status: 200} -> String.contains?(body, rest)
_ -> false
end
end

View file

@ -10,7 +10,7 @@ defmodule PhilomenaWeb.ScraperPlug do
conn
%{"scraper_cache" => url} when not is_nil(url) ->
Philomena.Http.get!(url, [], max_body_length: 30_000_000)
Philomena.Http.get!(url)
|> maybe_fixup_params(opts, conn)
_ ->
@ -18,7 +18,7 @@ defmodule PhilomenaWeb.ScraperPlug do
end
end
defp maybe_fixup_params(%HTTPoison.Response{body: body, status_code: 200}, opts, conn) do
defp maybe_fixup_params(%Tesla.Env{body: body, status: 200}, opts, conn) do
params_name = Keyword.get(opts, :params_name, "image")
params_key = Keyword.get(opts, :params_key, "image")
file = Briefly.create!()