Mirror of https://github.com/philomena-dev/philomena.git, synced 2024-11-23 12:08:00 +01:00
replace use of hackney in scraper with mint
This commit is contained in:
parent
68102bdddd
commit
7fca37741b
11 changed files with 58 additions and 55 deletions
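At a glance: every HTTPoison call (backed by hackney) becomes a Tesla call running on the Mint adapter, and every %HTTPoison.Response{status_code: ...} pattern match becomes a %Tesla.Env{status: ...} match. A minimal sketch of the new call shape (the URL is illustrative, not from the commit):

    # Build a Tesla client on the Mint adapter; get! raises on transport
    # errors and returns a %Tesla.Env{} struct otherwise.
    client =
      Tesla.client(
        [{Tesla.Middleware.Headers, [{"User-Agent", "philomena"}]}],
        Tesla.Adapter.Mint
      )

    case Tesla.get!(client, "https://example.com/image.png") do
      %Tesla.Env{status: 200, body: body} -> body
      _ -> nil
    end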
@@ -65,7 +65,7 @@ config :philomena, PhilomenaWeb.Endpoint,
 # Do not include metadata nor timestamps in development logs
 config :logger, :console, format: "[$level] $message\n"
-config :logger, compile_time_purge_matching: [[application: :remote_ip]]
+config :logger, compile_time_purge_matching: [[application: :remote_ip], [application: :mint]]
 
 # Set up mailer
 config :philomena, PhilomenaWeb.Mailer, adapter: Bamboo.LocalAdapter
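The new purge entry silences Mint's connection-level logging in development the same way :remote_ip is already silenced: compile_time_purge_matching strips matching Logger calls at compile time rather than filtering them at runtime. The matcher also supports finer criteria such as a level floor; a sketch, not part of this commit:

    config :logger,
      compile_time_purge_matching: [
        [application: :mint, level_lower_than: :info]
      ]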
@@ -1,31 +1,38 @@
 defmodule Philomena.Http do
-  @user_agent [
-    "User-Agent":
-      "Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
-  ]
-
   def get!(url, headers \\ [], options \\ []) do
-    headers = Keyword.merge(@user_agent, headers) |> add_host(url)
-    options = Keyword.merge(options, proxy: proxy_host(), ssl: [insecure: true])
-
-    HTTPoison.get!(url, headers, options)
+    Tesla.get!(client(headers), url, opts: [adapter: adapter_opts(options)])
   end
 
   def head!(url, headers \\ [], options \\ []) do
-    headers = Keyword.merge(@user_agent, headers) |> add_host(url)
-    options = Keyword.merge(options, proxy: proxy_host(), ssl: [insecure: true])
-
-    HTTPoison.head!(url, headers, options)
+    Tesla.head!(client(headers), url, opts: [adapter: adapter_opts(options)])
   end
 
-  # Add host for caching proxies, since hackney doesn't do it for us
-  defp add_host(headers, url) do
-    %{host: host} = URI.parse(url)
-
-    Keyword.merge([Host: host, Connection: "close"], headers)
-  end
-
-  defp proxy_host do
-    Application.get_env(:philomena, :proxy_host)
-  end
+  defp adapter_opts(opts) do
+    opts = Keyword.merge(opts, max_body: 30_000_000)
+
+    case Application.get_env(:philomena, :proxy_host) do
+      nil ->
+        opts
+
+      url ->
+        Keyword.merge(opts, proxy: proxy_opts(URI.parse(url)))
+    end
+  end
+
+  defp proxy_opts(%{host: host, port: port, scheme: "https"}), do: {:https, host, port, []}
+  defp proxy_opts(%{host: host, port: port, scheme: "http"}), do: {:http, host, port, []}
+
+  defp client(headers) do
+    Tesla.client(
+      [
+        {Tesla.Middleware.Headers,
+         [
+           {"User-Agent",
+            "Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/76.0"}
+           | headers
+         ]}
+      ],
+      Tesla.Adapter.Mint
+    )
+  end
 end
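adapter_opts/1 applies a 30 MB response cap to every request and translates the configured :proxy_host URL into the {scheme, host, port, opts} tuple the Mint adapter expects; this is also why call sites further down drop their per-call max_body_length option. For example, assuming a hypothetical config :philomena, proxy_host: "http://proxy.local:8080":

    # adapter_opts([]) would then return:
    #   [max_body: 30_000_000, proxy: {:http, "proxy.local", 8080, []}]
    Philomena.Http.get!("https://example.com/image.png")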
@@ -30,7 +30,7 @@ defmodule Philomena.Scrapers.Deviantart do
     |> try_old_hires!()
   end
 
-  defp extract_data!(%HTTPoison.Response{body: body, status_code: 200}) do
+  defp extract_data!(%Tesla.Env{body: body, status: 200}) do
     [image] = Regex.run(@image_regex, body, capture: :all_but_first)
     [source] = Regex.run(@source_regex, body, capture: :all_but_first)
     [artist] = Regex.run(@artist_regex, source, capture: :all_but_first)
@@ -48,25 +48,21 @@ defmodule Philomena.Scrapers.Deviantart do
   end
 
   defp try_intermediary_hires!(%{images: [image]} = data) do
-    [domain, object_uuid, object_name] =
-      Regex.run(@cdnint_regex, image.url, capture: :all_but_first)
-
-    built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}"
-
-    case Philomena.Http.head!(built_url) do
-      %HTTPoison.Response{status_code: 200} ->
-        # This is the high resolution URL.
-
-        %{
-          data
-          | images: [
-              %{
-                url: built_url,
-                camo_url: image.camo_url
-              }
-            ]
-        }
-
+    with [domain, object_uuid, object_name] <-
+           Regex.run(@cdnint_regex, image.url, capture: :all_but_first),
+         built_url <- "#{domain}/intermediary/f/#{object_uuid}/#{object_name}",
+         %Tesla.Env{status: 200} <- Philomena.Http.head!(built_url) do
+      # This is the high resolution URL.
+      %{
+        data
+        | images: [
+            %{
+              url: built_url,
+              camo_url: image.camo_url
+            }
+          ]
+      }
+    else
       _ ->
         # Nothing to be found here, move along...
         data
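The case is folded into a with chain: if Regex.run/3 returns nil or head!/1 yields anything other than a 200 %Tesla.Env{}, control falls through to the else branch and data is returned unchanged. A self-contained sketch of the same shape (function and names are hypothetical):

    def hires_or_default(input, default) do
      with [a, b] <- Regex.run(~r/(\w+)-(\w+)/, input, capture: :all_but_first),
           url <- "https://example.com/#{a}/#{b}",
           %Tesla.Env{status: 200} <- Philomena.Http.head!(url) do
        url
      else
        # nil from Regex.run or a non-200 response lands here
        _ -> default
      end
    end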
@@ -115,9 +111,9 @@ defmodule Philomena.Scrapers.Deviantart do
     built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"
 
     case Philomena.Http.get!(built_url) do
-      %HTTPoison.Response{status_code: 301, headers: headers} ->
+      %Tesla.Env{status: 301, headers: headers} ->
         # Location header provides URL of high res image.
-        {_location, link} = Enum.find(headers, fn {header, _val} -> header == "Location" end)
+        {_location, link} = Enum.find(headers, fn {header, _val} -> header == "location" end)
 
         %{
           data
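The rename from "Location" to "location" is not cosmetic: response header names arrive lowercased through the Tesla/Mint stack, whereas hackney preserved whatever casing the server sent. A case-insensitive lookup would be the defensive variant (a sketch, not what the commit does):

    Enum.find_value(headers, fn {name, value} ->
      String.downcase(name) == "location" && value
    end)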
@@ -140,7 +136,7 @@ defmodule Philomena.Scrapers.Deviantart do
 
   defp follow_redirect(url, max_times) do
     case Philomena.Http.get!(url) do
-      %HTTPoison.Response{headers: headers, status_code: code} when code in [301, 302] ->
+      %Tesla.Env{headers: headers, status: code} when code in [301, 302] ->
         location = Enum.find_value(headers, &location_header/1)
         follow_redirect(location, max_times - 1)
 
@@ -3,9 +3,9 @@ defmodule Philomena.Scrapers.Raw do
 
   @spec can_handle?(URI.t(), String.t()) :: true | false
   def can_handle?(_uri, url) do
-    Philomena.Http.head!(url, [], max_body_length: 30_000_000)
+    Philomena.Http.head!(url)
     |> case do
-      %HTTPoison.Response{status_code: 200, headers: headers} ->
+      %Tesla.Env{status: 200, headers: headers} ->
         headers
         |> Enum.any?(fn {k, v} ->
           String.downcase(k) == "content-type" and String.downcase(v) in @mime_types
@@ -31,7 +31,7 @@ defmodule Philomena.Scrapers.Tumblr do
     |> process_response!()
   end
 
-  defp json!(%HTTPoison.Response{body: body, status_code: 200}),
+  defp json!(%Tesla.Env{body: body, status: 200}),
     do: Jason.decode!(body)
 
   defp process_response!(%{"response" => %{"posts" => [post | _rest]}}),
@@ -70,7 +70,7 @@ defmodule Philomena.Scrapers.Tumblr do
   end
 
   defp url_ok?(url) do
-    match?(%HTTPoison.Response{status_code: 200}, Philomena.Http.head!(url))
+    match?(%Tesla.Env{status: 200}, Philomena.Http.head!(url))
   end
 
   defp add_meta(post, images) do
@@ -50,7 +50,7 @@ defmodule Philomena.Scrapers.Twitter do
       |> Map.get(:body)
       |> extract_guest_token_and_bearer()
 
-    Philomena.Http.get!(api_url, Authorization: "Bearer #{bearer}", "x-guest-token": gt)
+    Philomena.Http.get!(api_url, [{"Authorization", "Bearer #{bearer}"}, {"x-guest-token", gt}])
     |> Map.get(:body)
     |> Jason.decode!()
     |> Map.get("globalObjects")
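HTTPoison accepted keyword-style headers with atom keys (Authorization: "Bearer ..."); Tesla.Middleware.Headers prepends onto a plain list of string tuples (see the | headers cons in client/1 above), so the call site switches to explicit {String, String} pairs.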
@@ -28,7 +28,7 @@ defmodule Philomena.Servers.PicartoChannelUpdater do
     run()
   end
 
-  defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, now) do
+  defp handle_response(%Tesla.Env{body: body, status: 200}, now) do
     resp =
       body
       |> Jason.decode!()
@@ -28,7 +28,7 @@ defmodule Philomena.Servers.PiczelChannelUpdater do
     run()
   end
 
-  defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, now) do
+  defp handle_response(%Tesla.Env{body: body, status: 200}, now) do
     resp =
       body
       |> Jason.decode!()
@@ -63,7 +63,7 @@ defmodule Philomena.Servers.UserLinkUpdater do
     |> handle_response(user_link)
   end
 
-  defp handle_response(%HTTPoison.Response{body: body, status_code: 200}, user_link) do
+  defp handle_response(%Tesla.Env{body: body, status: 200}, user_link) do
     case :binary.match(body, user_link.verification_code) do
       :nomatch ->
         nil
@@ -32,8 +32,8 @@ defmodule PhilomenaWeb.CompromisedPasswordCheckPlug do
       :crypto.hash(:sha, password)
       |> Base.encode16()
 
-    case HTTPoison.get(make_api_url(prefix)) do
-      {:ok, %HTTPoison.Response{body: body, status_code: 200}} -> String.contains?(body, rest)
+    case Philomena.Http.get!(make_api_url(prefix)) do
+      %Tesla.Env{body: body, status: 200} -> String.contains?(body, rest)
       _ -> false
     end
   end
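One semantic shift worth noting here: HTTPoison.get/1 returned {:ok, resp} or {:error, reason} tuples, while Philomena.Http.get!/1 returns a bare %Tesla.Env{} and raises on transport errors, so the catch-all clause now only absorbs non-200 statuses. If the old fail-open behavior on network errors were desired, a sketch (not part of the commit):

    try do
      case Philomena.Http.get!(make_api_url(prefix)) do
        %Tesla.Env{body: body, status: 200} -> String.contains?(body, rest)
        _ -> false
      end
    rescue
      _ -> false
    end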
@@ -10,7 +10,7 @@ defmodule PhilomenaWeb.ScraperPlug do
         conn
 
       %{"scraper_cache" => url} when not is_nil(url) ->
-        Philomena.Http.get!(url, [], max_body_length: 30_000_000)
+        Philomena.Http.get!(url)
        |> maybe_fixup_params(opts, conn)
 
       _ ->
@@ -18,7 +18,7 @@ defmodule PhilomenaWeb.ScraperPlug do
     end
   end
 
-  defp maybe_fixup_params(%HTTPoison.Response{body: body, status_code: 200}, opts, conn) do
+  defp maybe_fixup_params(%Tesla.Env{body: body, status: 200}, opts, conn) do
     params_name = Keyword.get(opts, :params_name, "image")
     params_key = Keyword.get(opts, :params_key, "image")
     file = Briefly.create!()