Update response header usages for list format

This commit is contained in:
Liam 2024-06-20 19:22:22 -04:00
parent 44c160b905
commit a344062d53
3 changed files with 19 additions and 62 deletions

View file

@ -9,7 +9,6 @@ defmodule PhilomenaProxy.Scrapers.Deviantart do
@image_regex ~r|data-rh="true" rel="preload" href="([^"]*)" as="image"| @image_regex ~r|data-rh="true" rel="preload" href="([^"]*)" as="image"|
@source_regex ~r|rel="canonical" href="([^"]*)"| @source_regex ~r|rel="canonical" href="([^"]*)"|
@artist_regex ~r|https://www.deviantart.com/([^/]*)/art| @artist_regex ~r|https://www.deviantart.com/([^/]*)/art|
@serial_regex ~r|https://www.deviantart.com/(?:.*?)-(\d+)\z|
@cdnint_regex ~r|(https://images-wixmp-[0-9a-f]+.wixmp.com)(?:/intermediary)?/f/([^/]*)/([^/?]*)| @cdnint_regex ~r|(https://images-wixmp-[0-9a-f]+.wixmp.com)(?:/intermediary)?/f/([^/]*)/([^/?]*)|
@png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)| @png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)|
@jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)| @jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)|
@ -35,7 +34,6 @@ defmodule PhilomenaProxy.Scrapers.Deviantart do
|> extract_data!() |> extract_data!()
|> try_intermediary_hires!() |> try_intermediary_hires!()
|> try_new_hires!() |> try_new_hires!()
|> try_old_hires!()
end end
defp extract_data!({:ok, %{body: body, status: 200}}) do defp extract_data!({:ok, %{body: body, status: 200}}) do
@ -107,36 +105,4 @@ defmodule PhilomenaProxy.Scrapers.Deviantart do
data data
end end
end end
defp try_old_hires!(%{source_url: source, images: [image]} = data) do
[serial] = Regex.run(@serial_regex, source, capture: :all_but_first)
base36 =
serial
|> String.to_integer()
|> Integer.to_string(36)
|> String.downcase()
built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"
case PhilomenaProxy.Http.get(built_url) do
{:ok, %{status: 301, headers: headers}} ->
# Location header provides URL of high res image.
{_location, link} = Enum.find(headers, fn {header, _val} -> header == "location" end)
%{
data
| images: [
%{
url: link,
camo_url: image.camo_url
}
]
}
_ ->
# Nothing to be found here, move along...
data
end
end
end end

View file

@ -10,14 +10,10 @@ defmodule PhilomenaProxy.Scrapers.Raw do
@spec can_handle?(URI.t(), String.t()) :: boolean() @spec can_handle?(URI.t(), String.t()) :: boolean()
def can_handle?(_uri, url) do def can_handle?(_uri, url) do
PhilomenaProxy.Http.head(url) with {:ok, %{status: 200, headers: headers}} <- PhilomenaProxy.Http.head(url),
|> case do [type | _] <- headers["content-type"] do
{:ok, %{status: 200, headers: headers}} -> String.downcase(type) in @mime_types
headers else
|> Enum.any?(fn {k, v} ->
String.downcase(k) == "content-type" and String.downcase(v) in @mime_types
end)
_ -> _ ->
false false
end end

View file

@ -1,10 +1,12 @@
defmodule PhilomenaWeb.ScraperPlug do defmodule PhilomenaWeb.ScraperPlug do
@filename_regex ~r/filename="([^"]+)"/ @filename_regex ~r/filename="([^"]+)"/
@spec init(keyword()) :: keyword()
def init(opts) do def init(opts) do
opts opts
end end
@spec call(Plug.Conn.t(), keyword()) :: Plug.Conn.t()
def call(conn, opts) do def call(conn, opts) do
params_name = Keyword.get(opts, :params_name, "image") params_name = Keyword.get(opts, :params_name, "image")
params_key = Keyword.get(opts, :params_key, "image") params_key = Keyword.get(opts, :params_key, "image")
@ -25,18 +27,13 @@ defmodule PhilomenaWeb.ScraperPlug do
# Writing the tempfile doesn't allow traversal # Writing the tempfile doesn't allow traversal
# sobelow_skip ["Traversal.FileModule"] # sobelow_skip ["Traversal.FileModule"]
defp maybe_fixup_params( defp maybe_fixup_params({:ok, %{status: 200} = resp}, url, opts, conn) do
{:ok, %{body: body, status: 200, headers: headers}},
url,
opts,
conn
) do
params_name = Keyword.get(opts, :params_name, "image") params_name = Keyword.get(opts, :params_name, "image")
params_key = Keyword.get(opts, :params_key, "image") params_key = Keyword.get(opts, :params_key, "image")
name = extract_filename(url, headers) name = extract_filename(url, resp.headers)
file = Plug.Upload.random_file!(UUID.uuid1()) file = Plug.Upload.random_file!(UUID.uuid1())
File.write!(file, body) File.write!(file, resp.body)
fake_upload = %Plug.Upload{ fake_upload = %Plug.Upload{
path: file, path: file,
@ -44,22 +41,20 @@ defmodule PhilomenaWeb.ScraperPlug do
filename: name filename: name
} }
updated_form = Map.put(conn.params[params_name], params_key, fake_upload) put_in(conn.params[params_name][params_key], fake_upload)
updated_params = Map.put(conn.params, params_name, updated_form)
%Plug.Conn{conn | params: updated_params}
end end
defp maybe_fixup_params(_response, _url, _opts, conn), do: conn defp maybe_fixup_params(_response, _url, _opts, conn), do: conn
defp extract_filename(url, resp_headers) do defp extract_filename(url, headers) do
{_, header} = name =
Enum.find(resp_headers, {nil, "filename=\"#{Path.basename(url)}\""}, fn {key, value} -> with [value | _] <- headers["content-disposition"],
key == "content-disposition" and Regex.match?(@filename_regex, value) [name] <- Regex.run(@filename_regex, value, capture: :all_but_first) do
end) name
else
[name] = Regex.run(@filename_regex, header, capture: :all_but_first) _ ->
Path.basename(url)
end
String.slice(name, 0, 127) String.slice(name, 0, 127)
end end