From fbfa572a1e9ff0407d5dbd59ccc5e0ff2b257e33 Mon Sep 17 00:00:00 2001
From: "byte[]"
Date: Thu, 28 Nov 2019 12:12:10 -0500
Subject: [PATCH] add scrapers

---
 config/config.exs                    |   5 +-
 config/prod.secret.exs               |   2 +
 lib/philomena/http.ex                |  17 +++
 lib/philomena/processors/webm.ex     |   4 +-
 lib/philomena/scrapers.ex            |  24 ++++
 lib/philomena/scrapers/deviantart.ex | 135 ++++++++++++++++++
 lib/philomena/scrapers/raw.ex        |  30 ++++
 lib/philomena/scrapers/tumblr.ex     |  97 +++++++++++++
 lib/philomena/scrapers/twitter.ex    |  66 +++++++++
 lib/philomena_web/router.ex          |   2 +-
 .../views/user_attribution_view.ex   |   8 +-
 11 files changed, 384 insertions(+), 6 deletions(-)
 create mode 100644 lib/philomena/http.ex
 create mode 100644 lib/philomena/scrapers.ex
 create mode 100644 lib/philomena/scrapers/deviantart.ex
 create mode 100644 lib/philomena/scrapers/raw.ex
 create mode 100644 lib/philomena/scrapers/tumblr.ex
 create mode 100644 lib/philomena/scrapers/twitter.ex

diff --git a/config/config.exs b/config/config.exs
index f72ebe32..ed213b64 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -12,10 +12,13 @@ config :philomena,
   elasticsearch_url: "http://localhost:9200",
   password_pepper: "dn2e0EpZrvBLoxUM3gfQveBhjf0bG/6/bYhrOyq3L3hV9hdo/bimJ+irbDWsuXLP",
   otp_secret_key: "Wn7O/8DD+qxL0X4X7bvT90wOkVGcA90bIHww4twR03Ci//zq7PnMw8ypqyyT/b/C",
+  tumblr_api_key: "fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4",
   image_url_root: "/img",
   avatar_url_root: "/avatars",
   badge_url_root: "/media",
-  image_file_root: "priv/static/system/images"
+  image_file_root: "priv/static/system/images",
+  cdn_host: "",
+  proxy_host: nil
 
 config :philomena, :pow,
   user: Philomena.Users.User,
diff --git a/config/prod.secret.exs b/config/prod.secret.exs
index 5e0c2a00..a612dfab 100644
--- a/config/prod.secret.exs
+++ b/config/prod.secret.exs
@@ -19,10 +19,12 @@ config :philomena,
   password_pepper: System.get_env("PASSWORD_PEPPER"),
   avatar_url_root: System.get_env("AVATAR_URL_ROOT"),
   image_file_root: System.get_env("IMAGE_FILE_ROOT"),
+  tumblr_api_key: System.get_env("TUMBLR_API_KEY"),
   otp_secret_key: System.get_env("OTP_SECRET_KEY"),
   image_url_root: System.get_env("IMAGE_URL_ROOT"),
   badge_url_root: System.get_env("BADGE_URL_ROOT"),
   mailer_address: System.get_env("MAILER_ADDRESS"),
+  proxy_host: System.get_env("PROXY_HOST"),
   camo_host: System.get_env("CAMO_HOST"),
   camo_key: System.get_env("CAMO_KEY"),
   cdn_host: System.get_env("CDN_HOST")
diff --git a/lib/philomena/http.ex b/lib/philomena/http.ex
new file mode 100644
index 00000000..12705e89
--- /dev/null
+++ b/lib/philomena/http.ex
@@ -0,0 +1,17 @@
+defmodule Philomena.Http do
+  def get!(url, headers \\ [], options \\ []) do
+    options = Keyword.merge(options, proxy: proxy_host())
+
+    HTTPoison.get!(url, headers, options)
+  end
+
+  def head!(url, headers \\ [], options \\ []) do
+    options = Keyword.merge(options, proxy: proxy_host())
+
+    HTTPoison.head!(url, headers, options)
+  end
+
+  defp proxy_host do
+    Application.get_env(:philomena, :proxy_host)
+  end
+end
\ No newline at end of file
diff --git a/lib/philomena/processors/webm.ex b/lib/philomena/processors/webm.ex
index 6aa5b1ae..aad0923d 100644
--- a/lib/philomena/processors/webm.ex
+++ b/lib/philomena/processors/webm.ex
@@ -30,10 +30,10 @@ defmodule Philomena.Processors.Webm do
   end
   defp scale_if_smaller(file, palette, dimensions, {:full, _target_dim}) do
-    {webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
+    {_webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
 
     [
-      {:copy, webm, "full.webm"},
+      {:symlink_original, "full.webm"},
"full.webm"}, {:copy, mp4, "full.mp4"} ] end diff --git a/lib/philomena/scrapers.ex b/lib/philomena/scrapers.ex new file mode 100644 index 00000000..39668e5b --- /dev/null +++ b/lib/philomena/scrapers.ex @@ -0,0 +1,24 @@ +defmodule Philomena.Scrapers do + @scrapers [ + Philomena.Scrapers.Deviantart, + Philomena.Scrapers.Twitter, + Philomena.Scrapers.Tumblr, + Philomena.Scrapers.Raw + ] + + def scrape!(url) do + uri = URI.parse(url) + + @scrapers + |> Enum.find(& &1.can_handle?(uri, url)) + |> wrap() + |> Enum.map(& &1.scrape(uri, url)) + |> unwrap() + end + + defp wrap(nil), do: [] + defp wrap(res), do: [res] + + defp unwrap([result]), do: result + defp unwrap(_result), do: nil +end \ No newline at end of file diff --git a/lib/philomena/scrapers/deviantart.ex b/lib/philomena/scrapers/deviantart.ex new file mode 100644 index 00000000..b6c1841e --- /dev/null +++ b/lib/philomena/scrapers/deviantart.ex @@ -0,0 +1,135 @@ +defmodule Philomena.Scrapers.Deviantart do + @image_regex ~r|| + @source_regex ~r|| + @artist_regex ~r|https://www.deviantart.com/([^/]*)/art| + @serial_regex ~r|https://www.deviantart.com/(?:.*?)-(\d+)\z| + @cdnint_regex ~r|(https://images-wixmp-[0-9a-f]+.wixmp.com)(?:/intermediary)?/f/([^/]*)/([^/?]*)| + @png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)| + @jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)| + + @spec can_handle?(URI.t(), String.t()) :: true | false + def can_handle?(uri, _url) do + String.ends_with?(uri.host, "deviantart.com") + end + + # https://github.com/DeviantArt/DeviantArt-API/issues/153 + # + # Note that Erlang (and by extension Elixir) do not have any sort of + # reliable HTML/XML parsers that can accept untrusted input. As an example, + # xmerl is vulnerable to almost every XML attack which has ever been + # created, and also exposes the runtime to symbol DoS as an added bonus. + # + # So, regex it is. Eat dirt, deviantart. You don't deserve the respect + # artists give you. + def scrape(_uri, url) do + url + |> Philomena.Http.get!([], follow_redirect: true, max_redirect: 2) + |> extract_data!() + |> try_intermediary_hires!() + |> try_new_hires!() + |> try_old_hires!() + end + + defp extract_data!(%HTTPoison.Response{body: body, status_code: 200}) do + [image] = Regex.run(@image_regex, body, capture: :all_but_first) + [source] = Regex.run(@source_regex, body, capture: :all_but_first) + [artist] = Regex.run(@artist_regex, source, capture: :all_but_first) + + %{ + source_url: source, + author_name: artist, + images: [ + %{ + url: image, + camo_url: Camo.Image.image_url(image) + } + ] + } + end + + defp try_intermediary_hires!(%{images: [image]} = data) do + [domain, object_uuid, object_name] = Regex.run(@cdnint_regex, image.url, capture: :all_but_first) + + built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}" + + case Philomena.Http.head!(built_url) do + %HTTPoison.Response{status_code: 200} -> + # This is the high resolution URL. + + %{ + data | + images: [ + %{ + url: built_url, + camo_url: image.camo_url + } + ] + } + + _ -> + # Nothing to be found here, move along... 
+        data
+    end
+  end
+
+  defp try_new_hires!(%{images: [image]} = data) do
+    cond do
+      String.match?(image.url, @png_regex) ->
+        %{
+          data |
+          images: [
+            %{
+              url: String.replace(image.url, @png_regex, "\\1.png\\3"),
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      String.match?(image.url, @jpg_regex) ->
+        %{
+          data |
+          images: [
+            %{
+              url: String.replace(image.url, @jpg_regex, "\\g{1}100\\3"),
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      true ->
+        # Nothing to be found here, move along...
+        data
+    end
+  end
+
+  defp try_old_hires!(%{source_url: source, images: [image]} = data) do
+    [serial] = Regex.run(@serial_regex, source, capture: :all_but_first)
+    base36 =
+      serial
+      |> String.to_integer()
+      |> Integer.to_string(36)
+      |> String.downcase()
+
+    built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"
+
+    case Philomena.Http.get!(built_url) do
+      %HTTPoison.Response{status_code: 301, headers: headers} ->
+        # Location header provides URL of high res image.
+        {_location, link} = Enum.find(headers, fn {header, _val} -> header == "Location" end)
+
+        %{
+          data |
+          images: [
+            %{
+              url: link,
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      _ ->
+        # Nothing to be found here, move along...
+        data
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/philomena/scrapers/raw.ex b/lib/philomena/scrapers/raw.ex
new file mode 100644
index 00000000..548f34c5
--- /dev/null
+++ b/lib/philomena/scrapers/raw.ex
@@ -0,0 +1,30 @@
+defmodule Philomena.Scrapers.Raw do
+  @mime_types ["image/gif", "image/jpeg", "image/png", "image/svg", "image/svg+xml", "video/webm"]
+
+  @spec can_handle?(URI.t(), String.t()) :: true | false
+  def can_handle?(_uri, url) do
+    Philomena.Http.head!(url, [], max_body_length: 30_000_000)
+    |> case do
+      %HTTPoison.Response{status_code: 200, headers: headers} ->
+        headers
+        |> Enum.any?(fn {k, v} ->
+          String.downcase(k) == "content-type" and String.downcase(v) in @mime_types
+        end)
+
+      _ ->
+        false
+    end
+  end
+
+  def scrape(_uri, url) do
+    %{
+      source_url: url,
+      images: [
+        %{
+          url: url,
+          camo_url: Camo.Image.image_url(url)
+        }
+      ]
+    }
+  end
+end
\ No newline at end of file
diff --git a/lib/philomena/scrapers/tumblr.ex b/lib/philomena/scrapers/tumblr.ex
new file mode 100644
index 00000000..a720e932
--- /dev/null
+++ b/lib/philomena/scrapers/tumblr.ex
@@ -0,0 +1,97 @@
+defmodule Philomena.Scrapers.Tumblr do
+  @url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
+  @inline_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com\/[a-f\d]+\/tumblr(?:_inline)?_[a-z\d]+_\d+\.(?:png\|jpe?g\|gif)|i
+  @size_regex ~r|_(\d+)(\..+)\z|
+  @sizes [1280, 540, 500, 400, 250, 100, 75]
+  @tumblr_ranges [
+    InetCidr.parse("66.6.32.0/23"),
+    InetCidr.parse("66.6.44.0/24")
+  ]
+
+  @spec can_handle?(URI.t(), String.t()) :: true | false
+  def can_handle?(uri, url) do
+    String.match?(url, @url_regex) and tumblr_domain?(uri.host)
+  end
+
+  def scrape(uri, url) do
+    [post_id] = Regex.run(@url_regex, url, capture: :all_but_first)
+
+    api_url = "https://api.tumblr.com/v2/blog/#{uri.host}/posts/photo?id=#{post_id}&api_key=#{tumblr_api_key()}"
+
+    Philomena.Http.get!(api_url)
+    |> json!()
+    |> process_response!()
+  end
+
+  defp json!(%HTTPoison.Response{body: body, status_code: 200}),
+    do: Jason.decode!(body)
+
+  defp process_response!(%{"response" => %{"posts" => [post | _rest]}}),
+    do: process_post!(post)
+
+  defp process_post!(%{"type" => "photo"} = post) do
+    images =
+      post["photos"]
+      |> Enum.map(fn photo ->
+        image = upsize(photo["original_size"]["url"])
+
+        %{"url" => preview} =
+          Enum.find(photo["alt_sizes"], & &1["width"] == 400) || %{"url" => image}
+
+        %{url: image, camo_url: Camo.Image.image_url(preview)}
+      end)
+
+    add_meta(post, images)
+  end
+
+  defp process_post!(%{"type" => "text"} = post) do
+    images =
+      @inline_regex
+      |> Regex.scan(post["text"])
+      |> Enum.map(fn url ->
+        %{url: upsize(url), camo_url: Camo.Image.image_url(url)}
+      end)
+
+    add_meta(post, images)
+  end
+
+  defp upsize(image_url) do
+    @sizes
+    |> Enum.map(&String.replace(image_url, @size_regex, "_#{&1}\\2"))
+    |> Enum.find(&url_ok?/1)
+  end
+
+  defp url_ok?(url) do
+    match?(%HTTPoison.Response{status_code: 200}, Philomena.Http.head!(url))
+  end
+
+  defp add_meta(post, images) do
+    source = post["post_url"]
+    author = post["blog_name"]
+    description = post["summary"]
+
+    %{
+      source_url: source,
+      author_name: author,
+      description: description,
+      images: images
+    }
+  end
+
+  defp tumblr_domain?(host) do
+    host
+    |> String.to_charlist()
+    |> :inet_res.lookup(:in, :a)
+    |> case do
+      [address | _rest] ->
+        Enum.any?(@tumblr_ranges, &InetCidr.contains?(&1, address))
+
+      _ ->
+        false
+    end
+  end
+
+  defp tumblr_api_key do
+    Application.get_env(:philomena, :tumblr_api_key)
+  end
+end
\ No newline at end of file
diff --git a/lib/philomena/scrapers/twitter.ex b/lib/philomena/scrapers/twitter.ex
new file mode 100644
index 00000000..f036fbc1
--- /dev/null
+++ b/lib/philomena/scrapers/twitter.ex
@@ -0,0 +1,66 @@
+defmodule Philomena.Scrapers.Twitter do
+  @gt_regex ~r|document.cookie = decodeURIComponent\("gt=(\d+);|
+  @url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?|
+  @script_regex ~r|