defmodule PhilomenaProxy.Scrapers.Deviantart do
  @moduledoc false

  alias PhilomenaProxy.Scrapers.Scraper
  alias PhilomenaProxy.Scrapers

  @behaviour Scraper

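  # Extracts the full-size image URL from the <link rel="preload" as="image">
  # tag that DeviantArt's page markup appears to emit for the displayed
  # deviation.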
  @image_regex ~r|data-rh="true" rel="preload" href="([^"]*)" as="image"|

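  # Extracts the canonical page URL, which is reported as the source_url.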
  @source_regex ~r|rel="canonical" href="([^"]*)"|

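  # Pulls the artist's username out of the canonical URL
  # (https://www.deviantart.com/<artist>/art/...).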
  @artist_regex ~r|https://www.deviantart.com/([^/]*)/art|

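  # Captures the wixmp CDN domain, object UUID, and object name from the image
  # URL so that an "/intermediary/" variant can be built and probed.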
  @cdnint_regex ~r|(https://images-wixmp-[0-9a-f]+.wixmp.com)(?:/intermediary)?/f/([^/]*)/([^/?]*)|

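  # Match the downscaled "/v1/fill/..." render URLs the CDN serves for PNG and
  # JPEG uploads; try_new_hires!/1 rewrites these (forcing q_100 for JPEGs).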
  @png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)|
  @jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)|

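  # Handles any URI whose host ends in "deviantart.com".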
  @spec can_handle?(URI.t(), String.t()) :: boolean()
  def can_handle?(uri, _url) do
    String.ends_with?(uri.host, "deviantart.com")
  end

  # https://github.com/DeviantArt/DeviantArt-API/issues/153
  #
  # Note that Erlang (and by extension Elixir) does not have any sort of
  # reliable HTML/XML parser that can accept untrusted input. As an example,
  # xmerl is vulnerable to almost every XML attack which has ever been
  # created, and also exposes the runtime to symbol DoS as an added bonus.
  #
  # So, regex it is. Eat dirt, deviantart. You don't deserve the respect
  # artists give you.
  @spec scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
  def scrape(_uri, url) do
    url
    |> PhilomenaProxy.Http.get()
    |> extract_data!()
    |> try_intermediary_hires!()
    |> try_new_hires!()
  end

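  # Matches only a successful (HTTP 200) response; any other result crashes
  # the scrape, per the ! naming convention. Pulls the preload image URL and
  # canonical source URL out of the page body, and the artist name out of the
  # source URL, using the regexes defined above.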
  defp extract_data!({:ok, %{body: body, status: 200}}) do
    [image] = Regex.run(@image_regex, body, capture: :all_but_first)
    [source] = Regex.run(@source_regex, body, capture: :all_but_first)
    [artist] = Regex.run(@artist_regex, source, capture: :all_but_first)

    %{
      source_url: source,
      author_name: artist,
      description: "",
      images: [
        %{
          url: image,
          camo_url: PhilomenaProxy.Camo.image_url(image)
        }
      ]
    }
  end

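  # Attempts to upgrade the scraped image to the "/intermediary/" CDN URL,
  # which appears to serve a higher-resolution render. The candidate URL is
  # probed with a HEAD request and only substituted if it returns 200.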
  defp try_intermediary_hires!(%{images: [image]} = data) do
    with [domain, object_uuid, object_name] <-
           Regex.run(@cdnint_regex, image.url, capture: :all_but_first),
         built_url <- "#{domain}/intermediary/f/#{object_uuid}/#{object_name}",
         {:ok, %{status: 200}} <- PhilomenaProxy.Http.head(built_url) do
      # This is the high resolution URL.
      %{
        data
        | images: [
            %{
              url: built_url,
              camo_url: image.camo_url
            }
          ]
      }
    else
      _ ->
        # Nothing to be found here, move along...
        data
    end
  end

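  # Attempts a further upgrade by rewriting URLs that match the "/v1/fill/"
  # render patterns: JPEG URLs get their q_ quality parameter forced to 100,
  # while PNG URLs are reassembled from the pieces captured by @png_regex.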
  defp try_new_hires!(%{images: [image]} = data) do
    cond do
      String.match?(image.url, @png_regex) ->
        %{
          data
          | images: [
              %{
                url: String.replace(image.url, @png_regex, "\\1.png\\3"),
                camo_url: image.camo_url
              }
            ]
        }

      String.match?(image.url, @jpg_regex) ->
        %{
          data
          | images: [
              %{
                url: String.replace(image.url, @jpg_regex, "\\g{1}100\\3"),
                camo_url: image.camo_url
              }
            ]
        }

      true ->
        # Nothing to be found here, move along...
        data
    end
  end
end