diff --git a/config/config.exs b/config/config.exs
index f72ebe32..ed213b64 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -12,10 +12,13 @@ config :philomena,
elasticsearch_url: "http://localhost:9200",
password_pepper: "dn2e0EpZrvBLoxUM3gfQveBhjf0bG/6/bYhrOyq3L3hV9hdo/bimJ+irbDWsuXLP",
otp_secret_key: "Wn7O/8DD+qxL0X4X7bvT90wOkVGcA90bIHww4twR03Ci//zq7PnMw8ypqyyT/b/C",
+ tumblr_api_key: "fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4",
image_url_root: "/img",
avatar_url_root: "/avatars",
badge_url_root: "/media",
- image_file_root: "priv/static/system/images"
+ image_file_root: "priv/static/system/images",
+ cdn_host: "",
+ proxy_host: nil
config :philomena, :pow,
user: Philomena.Users.User,
diff --git a/config/prod.secret.exs b/config/prod.secret.exs
index 5e0c2a00..a612dfab 100644
--- a/config/prod.secret.exs
+++ b/config/prod.secret.exs
@@ -19,10 +19,12 @@ config :philomena,
password_pepper: System.get_env("PASSWORD_PEPPER"),
avatar_url_root: System.get_env("AVATAR_URL_ROOT"),
image_file_root: System.get_env("IMAGE_FILE_ROOT"),
+ tumblr_api_key: System.get_env("TUMBLR_API_KEY"),
otp_secret_key: System.get_env("OTP_SECRET_KEY"),
image_url_root: System.get_env("IMAGE_URL_ROOT"),
badge_url_root: System.get_env("BADGE_URL_ROOT"),
mailer_address: System.get_env("MAILER_ADDRESS"),
+ proxy_host: System.get_env("PROXY_HOST"),
camo_host: System.get_env("CAMO_HOST"),
camo_key: System.get_env("CAMO_KEY"),
cdn_host: System.get_env("CDN_HOST")
diff --git a/lib/philomena/http.ex b/lib/philomena/http.ex
new file mode 100644
index 00000000..12705e89
--- /dev/null
+++ b/lib/philomena/http.ex
@@ -0,0 +1,17 @@
+defmodule Philomena.Http do
+ @doc """
+ Performs a GET request by delegating to `HTTPoison.get!/3`.
+
+ The `:proxy` option is always taken from the application's `:proxy_host`
+ setting and replaces any `:proxy` entry passed in `options`.
+ """
+ def get!(url, headers \\ [], options \\ []) do
+   HTTPoison.get!(url, headers, Keyword.merge(options, proxy: proxy_host()))
+ end
+ @doc """
+ Performs a HEAD request by delegating to `HTTPoison.head!/3`.
+
+ The `:proxy` option is always taken from the application's `:proxy_host`
+ setting and replaces any `:proxy` entry passed in `options`.
+ """
+ def head!(url, headers \\ [], options \\ []) do
+   HTTPoison.head!(url, headers, Keyword.merge(options, proxy: proxy_host()))
+ end
+ # Reads the outbound proxy setting from the :philomena application
+ # environment; yields nil when the key is not configured.
+ defp proxy_host, do: Application.get_env(:philomena, :proxy_host)
\ No newline at end of file
diff --git a/lib/philomena/processors/webm.ex b/lib/philomena/processors/webm.ex
index 6aa5b1ae..aad0923d 100644
--- a/lib/philomena/processors/webm.ex
+++ b/lib/philomena/processors/webm.ex
@@ -30,10 +30,10 @@ defmodule Philomena.Processors.Webm do
defp scale_if_smaller(file, palette, dimensions, {:full, _target_dim}) do
- {webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
+ {_webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
- {:copy, webm, "full.webm"},
+ {:symlink_original, "full.webm"},
{:copy, mp4, "full.mp4"}
diff --git a/lib/philomena/scrapers.ex b/lib/philomena/scrapers.ex
new file mode 100644
index 00000000..39668e5b
--- /dev/null
+++ b/lib/philomena/scrapers.ex
@@ -0,0 +1,24 @@
+defmodule Philomena.Scrapers do
+ # Scraper modules in priority order: scrape!/1 walks this list and uses the
+ # first module whose can_handle?/2 returns a truthy value, so more specific
+ # scrapers must come before more general ones.
+ @scrapers [
+ Philomena.Scrapers.Deviantart,
+ Philomena.Scrapers.Twitter,
+ Philomena.Scrapers.Tumblr,
+ Philomena.Scrapers.Raw
+ ]
+ @doc """
+ Scrapes `url` with the first module in `@scrapers` whose `can_handle?/2`
+ accepts it.
+
+ Returns that module's `scrape/2` result, or `nil` when no scraper claims
+ the URL.
+ """
+ def scrape!(url) do
+   uri = URI.parse(url)
+
+   case Enum.find(@scrapers, fn scraper -> scraper.can_handle?(uri, url) end) do
+     nil -> nil
+     scraper -> scraper.scrape(uri, url)
+   end
+ end
\ No newline at end of file
diff --git a/lib/philomena/scrapers/deviantart.ex b/lib/philomena/scrapers/deviantart.ex
new file mode 100644
index 00000000..b6c1841e
--- /dev/null
+++ b/lib/philomena/scrapers/deviantart.ex
@@ -0,0 +1,135 @@
+defmodule Philomena.Scrapers.Deviantart do
+ # NOTE(review): the next two patterns are empty in this view, so the
+ # single-capture matches in extract_data!/1 cannot succeed as written.
+ # Presumably their contents were lost in transit; confirm against the
+ # original file.
+ @image_regex ~r||
+ @source_regex ~r||
+ # Captures the artist name from a deviation page URL.
+ @artist_regex ~r|https://www.deviantart.com/([^/]*)/art|
+ # Captures the numeric serial that ends a deviation page URL.
+ @serial_regex ~r|https://www.deviantart.com/(?:.*?)-(\d+)\z|
+ # Captures the CDN origin, the object UUID and the object name of a wixmp
+ # image URL, with or without the "/intermediary" path segment.
+ @cdnint_regex ~r|(https://images-wixmp-[0-9a-f]+.wixmp.com)(?:/intermediary)?/f/([^/]*)/([^/?]*)|
+ # Splits a PNG "v1/fill" URL into prefix, ".png" and remainder.
+ @png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)|
+ # Splits a JPEG "v1/fill" URL around the number following "q_"
+ # (presumably the quality setting; confirm).
+ @jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)|
+ @doc """
+ Returns `true` when the URI's host is `deviantart.com` or one of its
+ subdomains, and `false` otherwise.
+
+ URIs without a host (relative or malformed input) yield `false` rather
+ than raising, so that the remaining scrapers still get a chance to run.
+ """
+ @spec can_handle?(URI.t(), String.t()) :: true | false
+ def can_handle?(%URI{host: host}, _url) when is_binary(host) do
+   # Match on a label boundary so that look-alike hosts such as
+   # "notdeviantart.com" are not accepted.
+   host == "deviantart.com" or String.ends_with?(host, ".deviantart.com")
+ end
+ def can_handle?(_uri, _url), do: false
+ # https://github.com/DeviantArt/DeviantArt-API/issues/153
+ #
+ # Note that Erlang (and by extension Elixir) does not have any sort of
+ # reliable HTML/XML parsers that can accept untrusted input. As an example,
+ # xmerl is vulnerable to almost every XML attack which has ever been
+ # created, and also exposes the runtime to symbol DoS as an added bonus.
+ #
+ # So, regex it is. Eat dirt, deviantart. You don't deserve the respect
+ # artists give you.
+ @doc """
+ Fetches the deviation page at `url` (following at most two redirects) and
+ builds the scrape result from it.
+
+ After the page data is extracted, three successive steps each get a chance
+ to swap the image URL for a higher resolution one.
+ """
+ def scrape(_uri, url) do
+   page = Philomena.Http.get!(url, [], follow_redirect: true, max_redirect: 2)
+
+   page
+   |> extract_data!()
+   |> try_intermediary_hires!()
+   |> try_new_hires!()
+   |> try_old_hires!()
+ end
+ # Pulls the image URL, the source URL and the artist name out of the body of
+ # a page that was fetched with status 200. Any other response, or a body
+ # that one of the patterns does not match exactly once, raises.
+ defp extract_data!(%HTTPoison.Response{status_code: 200, body: html}) do
+   [image_url] = Regex.run(@image_regex, html, capture: :all_but_first)
+   [source_url] = Regex.run(@source_regex, html, capture: :all_but_first)
+   [artist_name] = Regex.run(@artist_regex, source_url, capture: :all_but_first)
+
+   scraped_image = %{url: image_url, camo_url: Camo.Image.image_url(image_url)}
+
+   %{source_url: source_url, author_name: artist_name, images: [scraped_image]}
+ end
+ # Probes the CDN's "/intermediary" path for the scraped image and switches to
+ # that URL when a HEAD request to it answers 200.
+ #
+ # This is a best-effort upgrade: an image URL that does not have the expected
+ # CDN shape, or a probe that answers anything other than 200, leaves `data`
+ # untouched instead of aborting the whole scrape.
+ defp try_intermediary_hires!(%{images: [image]} = data) do
+   with [domain, object_uuid, object_name] <-
+          Regex.run(@cdnint_regex, image.url, capture: :all_but_first),
+        built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}",
+        %HTTPoison.Response{status_code: 200} <- Philomena.Http.head!(built_url) do
+     # This is the high resolution URL.
+     %{data | images: [%{url: built_url, camo_url: image.camo_url}]}
+   else
+     # Nothing to be found here, move along...
+     _ -> data
+   end
+ end
+ # Rewrites "v1/fill" CDN URLs. PNG URLs are rebuilt from capture groups 1 and
+ # 3 around a literal ".png"; JPEG URLs get the number after "q_" replaced
+ # with 100. URLs that match neither pattern are returned unchanged.
+ defp try_new_hires!(%{images: [image]} = data) do
+   rewrite =
+     cond do
+       Regex.match?(@png_regex, image.url) -> {@png_regex, "\\1.png\\3"}
+       Regex.match?(@jpg_regex, image.url) -> {@jpg_regex, "\\g{1}100\\3"}
+       true -> nil
+     end
+
+   case rewrite do
+     {regex, template} ->
+       rewritten_url = String.replace(image.url, regex, template)
+       %{data | images: [%{url: rewritten_url, camo_url: image.camo_url}]}
+
+     nil ->
+       # Nothing to be found here, move along...
+       data
+   end
+ end
+ # Tries the legacy orig01.deviantart.net naming scheme: the numeric serial at
+ # the end of the source URL is converted to lowercase base 36 and requested;
+ # a 301 answer carries the high resolution URL in its Location header.
+ #
+ # This is a best-effort upgrade: a source URL without a trailing serial, a
+ # non-301 answer, or a 301 without a Location header leaves `data` untouched.
+ defp try_old_hires!(%{source_url: source, images: [image]} = data) do
+   with [serial] <- Regex.run(@serial_regex, source, capture: :all_but_first),
+        built_url = old_hires_url(serial),
+        %HTTPoison.Response{status_code: 301, headers: headers} <-
+          Philomena.Http.get!(built_url),
+        {_name, link} <- Enum.find(headers, &location_header?/1) do
+     # Location header provides URL of high res image.
+     %{data | images: [%{url: link, camo_url: image.camo_url}]}
+   else
+     # Nothing to be found here, move along...
+     _ -> data
+   end
+ end
+
+ # Builds the legacy download URL for a decimal serial string.
+ defp old_hires_url(serial) do
+   base36 =
+     serial
+     |> String.to_integer()
+     |> Integer.to_string(36)
+     |> String.downcase()
+
+   "http://orig01.deviantart.net/x_by_x-d#{base36}.png"
+ end
+
+ # HTTP field names are case-insensitive, so compare the lowercased name.
+ defp location_header?({name, _value}), do: String.downcase(name) == "location"
\ No newline at end of file
diff --git a/lib/philomena/scrapers/raw.ex b/lib/philomena/scrapers/raw.ex
new file mode 100644
index 00000000..548f34c5
--- /dev/null
+++ b/lib/philomena/scrapers/raw.ex
@@ -0,0 +1,30 @@
+defmodule Philomena.Scrapers.Raw do
+ # Media types this scraper accepts; compared against the Content-Type value
+ # with any parameters removed.
+ @mime_types ["image/gif", "image/jpeg", "image/png", "image/svg", "image/svg+xml", "video/webm"]
+
+ @doc """
+ Returns `true` when a HEAD request to `url` answers 200 with a Content-Type
+ whose media type is one of the accepted image/video types.
+
+ Parameters after the media type (for example `; charset=utf-8`) are ignored.
+ Any other response yields `false`.
+ """
+ @spec can_handle?(URI.t(), String.t()) :: true | false
+ def can_handle?(_uri, url) do
+   case Philomena.Http.head!(url, [], max_body_length: 30_000_000) do
+     %HTTPoison.Response{status_code: 200, headers: headers} ->
+       Enum.any?(headers, &accepted_content_type?/1)
+
+     _ ->
+       false
+   end
+ end
+
+ # True for a Content-Type header (any letter case) naming an accepted type.
+ defp accepted_content_type?({name, value}) do
+   String.downcase(name) == "content-type" and media_type(value) in @mime_types
+ end
+
+ # Reduces "Image/PNG; charset=binary" to "image/png".
+ defp media_type(value) do
+   value
+   |> String.split(";", parts: 2)
+   |> hd()
+   |> String.trim()
+   |> String.downcase()
+ end
+ @doc """
+ Builds a scrape result in which `url` is both the source and the only image.
+ """
+ def scrape(_uri, url) do
+   only_image = %{url: url, camo_url: Camo.Image.image_url(url)}
+
+   %{source_url: url, images: [only_image]}
+ end
\ No newline at end of file
diff --git a/lib/philomena/scrapers/tumblr.ex b/lib/philomena/scrapers/tumblr.ex
new file mode 100644
index 00000000..a720e932
--- /dev/null
+++ b/lib/philomena/scrapers/tumblr.ex
@@ -0,0 +1,97 @@
+defmodule Philomena.Scrapers.Tumblr do
+ # Matches http(s) URLs containing "/image/<digits>" or "/post/<digits>" and
+ # captures the digits.
+ @url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
+ # Matches media.tumblr.com image URLs ending in png, jpg, jpeg or gif,
+ # case-insensitively. It has no capturing groups.
+ @inline_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com\/[a-f\d]+\/tumblr(?:_inline)?_[a-z\d]+_\d+\.(?:png\|jpe?g\|gif)|i
+ # Captures the trailing "_<digits>" size and everything from the dot after it.
+ @size_regex ~r|_(\d+)(\..+)\z|
+ # Size values substituted into image URLs by upsize/1, largest first.
+ @sizes [1280, 540, 500, 400, 250, 100, 75]
+ # Address ranges a host must resolve into to count as a Tumblr domain.
+ # NOTE(review): both CIDR strings are empty in this view; presumably the
+ # actual ranges were lost in transit. Confirm against the original file.
+ @tumblr_ranges [
+ InetCidr.parse(""),
+ InetCidr.parse("")
+ ]
+ @doc """
+ Returns `true` when `url` has the post/image URL shape and the URI's host
+ passes the `tumblr_domain?/1` check. The host is only looked up when the
+ URL shape matches.
+ """
+ @spec can_handle?(URI.t(), String.t()) :: true | false
+ def can_handle?(uri, url) do
+   if Regex.match?(@url_regex, url), do: tumblr_domain?(uri.host), else: false
+ end
+ @doc """
+ Looks up the post identified by `url` through the Tumblr API, addressed by
+ the URI's host, and converts the first returned post into a scrape result.
+ """
+ def scrape(uri, url) do
+   [post_id] = Regex.run(@url_regex, url, capture: :all_but_first)
+   api_url = "https://api.tumblr.com/v2/blog/#{uri.host}/posts/photo?id=#{post_id}&api_key=#{tumblr_api_key()}"
+
+   api_url
+   |> Philomena.Http.get!()
+   |> json!()
+   |> process_response!()
+ end
+ # Decodes the JSON body of a status-200 response; any other response has no
+ # matching clause and raises.
+ defp json!(%HTTPoison.Response{status_code: 200, body: payload}) do
+   Jason.decode!(payload)
+ end
+ # Hands the first post of the decoded API response to process_post!/1;
+ # a response without at least one post has no matching clause and raises.
+ defp process_response!(%{"response" => %{"posts" => [first_post | _others]}}) do
+   process_post!(first_post)
+ end
+ # Photo posts: every photo contributes its upsized original as the image URL.
+ # The preview handed to Camo is the 400-wide alternative size when the post
+ # lists one, and the upsized image itself otherwise.
+ defp process_post!(%{"type" => "photo"} = post) do
+   images =
+     for photo <- post["photos"] do
+       full_url = upsize(photo["original_size"]["url"])
+       fallback = %{"url" => full_url}
+
+       %{"url" => preview_url} =
+         Enum.find(photo["alt_sizes"], fallback, fn alt -> alt["width"] == 400 end)
+
+       %{url: full_url, camo_url: Camo.Image.image_url(preview_url)}
+     end
+
+   add_meta(post, images)
+ end
+ # Text posts: every inline media.tumblr.com image URL found in the post text
+ # becomes an image, upsized for the full URL and proxied as-is for the preview.
+ #
+ # NOTE(review): this reads the "text" key of the post; confirm that this is
+ # the field in which the Tumblr API returns a text post's markup.
+ defp process_post!(%{"type" => "text"} = post) do
+   images =
+     @inline_regex
+     |> Regex.scan(post["text"])
+     # Regex.scan/2 yields one list per match; its head is the matched URL.
+     |> Enum.map(fn [url | _captures] ->
+       %{url: upsize(url), camo_url: Camo.Image.image_url(url)}
+     end)
+
+   add_meta(post, images)
+ end
+ # Returns the largest reachable variant of `image_url`: each entry of @sizes
+ # is substituted for the URL's "_<digits>" suffix, largest first, and the
+ # first candidate that answers 200 to a HEAD request wins.
+ #
+ # Candidates are de-duplicated so a URL without a size suffix is probed only
+ # once, and the original URL is returned when no candidate is reachable so
+ # callers never receive nil.
+ defp upsize(image_url) do
+   @sizes
+   |> Enum.map(&String.replace(image_url, @size_regex, "_#{&1}\\2"))
+   |> Enum.uniq()
+   |> Enum.find(image_url, &url_ok?/1)
+ end
+ # True when a HEAD request to `url` comes back with status 200.
+ defp url_ok?(url) do
+   case Philomena.Http.head!(url) do
+     %HTTPoison.Response{status_code: 200} -> true
+     _other -> false
+   end
+ end
+ # Assembles the scrape result: the post's URL, blog name and summary become
+ # the source, author and description, alongside the collected images.
+ defp add_meta(post, images) do
+   %{
+     source_url: post["post_url"],
+     author_name: post["blog_name"],
+     description: post["summary"],
+     images: images
+   }
+ end
+ # Resolves the A records of `host` and reports whether the first returned
+ # address lies inside one of @tumblr_ranges. A lookup with no results
+ # counts as not being a Tumblr domain.
+ defp tumblr_domain?(host) do
+   case :inet_res.lookup(String.to_charlist(host), :in, :a) do
+     [first_address | _others] ->
+       Enum.any?(@tumblr_ranges, fn range -> InetCidr.contains?(range, first_address) end)
+
+     _ ->
+       false
+   end
+ end
+ # Reads the Tumblr API key from the :philomena application environment;
+ # yields nil when the key is not configured.
+ defp tumblr_api_key, do: Application.get_env(:philomena, :tumblr_api_key)
\ No newline at end of file
diff --git a/lib/philomena/scrapers/twitter.ex b/lib/philomena/scrapers/twitter.ex
new file mode 100644
index 00000000..f036fbc1
--- /dev/null
+++ b/lib/philomena/scrapers/twitter.ex
@@ -0,0 +1,66 @@
+defmodule Philomena.Scrapers.Twitter do
+ @gt_regex ~r|document.cookie = decodeURIComponent\("gt=(\d+);|
+ @url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?|
+ @script_regex ~r|