add scrapers

byte[] 2019-11-28 12:12:10 -05:00
parent d3b45e303d
commit fbfa572a1e
11 changed files with 384 additions and 6 deletions

@@ -12,10 +12,13 @@ config :philomena,
   elasticsearch_url: "http://localhost:9200",
   password_pepper: "dn2e0EpZrvBLoxUM3gfQveBhjf0bG/6/bYhrOyq3L3hV9hdo/bimJ+irbDWsuXLP",
   otp_secret_key: "Wn7O/8DD+qxL0X4X7bvT90wOkVGcA90bIHww4twR03Ci//zq7PnMw8ypqyyT/b/C",
+  tumblr_api_key: "fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4",
   image_url_root: "/img",
   avatar_url_root: "/avatars",
   badge_url_root: "/media",
-  image_file_root: "priv/static/system/images"
+  image_file_root: "priv/static/system/images",
+  cdn_host: "",
+  proxy_host: nil

 config :philomena, :pow,
   user: Philomena.Users.User,
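
The new proxy_host key is read by lib/philomena/http.ex (added below) and handed to HTTPoison as its :proxy option, so all scraper traffic can be routed through an egress proxy when one is configured. As an illustrative sketch only (the proxy URL is hypothetical), a deployment might override it like so:

config :philomena,
  proxy_host: "http://scraper-egress.internal:3128"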

@@ -19,10 +19,12 @@ config :philomena,
   password_pepper: System.get_env("PASSWORD_PEPPER"),
   avatar_url_root: System.get_env("AVATAR_URL_ROOT"),
   image_file_root: System.get_env("IMAGE_FILE_ROOT"),
+  tumblr_api_key: System.get_env("TUMBLR_API_KEY"),
   otp_secret_key: System.get_env("OTP_SECRET_KEY"),
   image_url_root: System.get_env("IMAGE_URL_ROOT"),
   badge_url_root: System.get_env("BADGE_URL_ROOT"),
   mailer_address: System.get_env("MAILER_ADDRESS"),
+  proxy_host: System.get_env("PROXY_HOST"),
   camo_host: System.get_env("CAMO_HOST"),
   camo_key: System.get_env("CAMO_KEY"),
   cdn_host: System.get_env("CDN_HOST")

lib/philomena/http.ex (new file)

@@ -0,0 +1,17 @@
defmodule Philomena.Http do
  def get!(url, headers \\ [], options \\ []) do
    options = Keyword.merge(options, proxy: proxy_host())

    HTTPoison.get!(url, headers, options)
  end

  def head!(url, headers \\ [], options \\ []) do
    options = Keyword.merge(options, proxy: proxy_host())

    HTTPoison.head!(url, headers, options)
  end

  defp proxy_host do
    Application.get_env(:philomena, :proxy_host)
  end
end
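
Philomena.Http is a thin wrapper whose only job is to inject the configured proxy; callers otherwise use it exactly like HTTPoison. A minimal usage sketch (the URL is a placeholder):

# Behaves like HTTPoison.get!/3, but routed through :proxy_host when set.
%HTTPoison.Response{status_code: 200} =
  Philomena.Http.get!("https://example.com/image.png", [], follow_redirect: true)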

@@ -30,10 +30,10 @@ defmodule Philomena.Processors.Webm do
   end

   defp scale_if_smaller(file, palette, dimensions, {:full, _target_dim}) do
-    {webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
+    {_webm, mp4} = scale_videos(file, palette, dimensions, dimensions)

     [
-      {:copy, webm, "full.webm"},
+      {:symlink_original, "full.webm"},
       {:copy, mp4, "full.mp4"}
     ]
   end

lib/philomena/scrapers.ex (new file)

@@ -0,0 +1,24 @@
defmodule Philomena.Scrapers do
  @scrapers [
    Philomena.Scrapers.Deviantart,
    Philomena.Scrapers.Twitter,
    Philomena.Scrapers.Tumblr,
    Philomena.Scrapers.Raw
  ]

  def scrape!(url) do
    uri = URI.parse(url)

    @scrapers
    |> Enum.find(& &1.can_handle?(uri, url))
    |> wrap()
    |> Enum.map(& &1.scrape(uri, url))
    |> unwrap()
  end

  defp wrap(nil), do: []
  defp wrap(res), do: [res]

  defp unwrap([result]), do: result
  defp unwrap(_result), do: nil
end
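
scrape!/1 dispatches to the first module whose can_handle?/2 returns true, in the priority order given by @scrapers; the wrap/unwrap pair turns "no scraper matched" into nil rather than raising. An illustrative call (the URL is hypothetical):

# Deviantart matches on hostname, so this dispatches to Philomena.Scrapers.Deviantart.
Philomena.Scrapers.scrape!("https://www.deviantart.com/someartist/art/some-title-12345678")
# => a %{source_url: ..., author_name: ..., images: [...]} map, or nil if nothing matched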

@@ -0,0 +1,135 @@
defmodule Philomena.Scrapers.Deviantart do
  @image_regex ~r|<link data-rh="true" rel="preload" href="([^"]*)" as="image"/>|
  @source_regex ~r|<link data-rh="true" rel="canonical" href="([^"]*)"/>|
  @artist_regex ~r|https://www.deviantart.com/([^/]*)/art|
  @serial_regex ~r|https://www.deviantart.com/(?:.*?)-(\d+)\z|
  @cdnint_regex ~r|(https://images-wixmp-[0-9a-f]+.wixmp.com)(?:/intermediary)?/f/([^/]*)/([^/?]*)|
  @png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)|
  @jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)|

  @spec can_handle?(URI.t(), String.t()) :: true | false
  def can_handle?(uri, _url) do
    String.ends_with?(uri.host, "deviantart.com")
  end

  # https://github.com/DeviantArt/DeviantArt-API/issues/153
  #
  # Note that Erlang (and by extension Elixir) does not have any sort of
  # reliable HTML/XML parser that can accept untrusted input. As an example,
  # xmerl is vulnerable to almost every XML attack which has ever been
  # created, and also exposes the runtime to symbol DoS as an added bonus.
  #
  # So, regex it is. Eat dirt, deviantart. You don't deserve the respect
  # artists give you.
  def scrape(_uri, url) do
    url
    |> Philomena.Http.get!([], follow_redirect: true, max_redirect: 2)
    |> extract_data!()
    |> try_intermediary_hires!()
    |> try_new_hires!()
    |> try_old_hires!()
  end

  defp extract_data!(%HTTPoison.Response{body: body, status_code: 200}) do
    [image] = Regex.run(@image_regex, body, capture: :all_but_first)
    [source] = Regex.run(@source_regex, body, capture: :all_but_first)
    [artist] = Regex.run(@artist_regex, source, capture: :all_but_first)

    %{
      source_url: source,
      author_name: artist,
      images: [
        %{
          url: image,
          camo_url: Camo.Image.image_url(image)
        }
      ]
    }
  end

  defp try_intermediary_hires!(%{images: [image]} = data) do
    [domain, object_uuid, object_name] =
      Regex.run(@cdnint_regex, image.url, capture: :all_but_first)

    built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}"

    case Philomena.Http.head!(built_url) do
      %HTTPoison.Response{status_code: 200} ->
        # This is the high resolution URL.
        %{
          data |
          images: [
            %{
              url: built_url,
              camo_url: image.camo_url
            }
          ]
        }

      _ ->
        # Nothing to be found here, move along...
        data
    end
  end

  defp try_new_hires!(%{images: [image]} = data) do
    cond do
      String.match?(image.url, @png_regex) ->
        %{
          data |
          images: [
            %{
              url: String.replace(image.url, @png_regex, "\\1.png\\3"),
              camo_url: image.camo_url
            }
          ]
        }

      String.match?(image.url, @jpg_regex) ->
        %{
          data |
          images: [
            %{
              url: String.replace(image.url, @jpg_regex, "\\g{1}100\\3"),
              camo_url: image.camo_url
            }
          ]
        }

      true ->
        # Nothing to be found here, move along...
        data
    end
  end

  defp try_old_hires!(%{source_url: source, images: [image]} = data) do
    [serial] = Regex.run(@serial_regex, source, capture: :all_but_first)

    base36 =
      serial
      |> String.to_integer()
      |> Integer.to_string(36)
      |> String.downcase()

    built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"

    case Philomena.Http.get!(built_url) do
      %HTTPoison.Response{status_code: 301, headers: headers} ->
        # Location header provides URL of high res image.
        {_location, link} = Enum.find(headers, fn {header, _val} -> header == "Location" end)

        %{
          data |
          images: [
            %{
              url: link,
              camo_url: image.camo_url
            }
          ]
        }

      _ ->
        # Nothing to be found here, move along...
        data
    end
  end
end
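
The try_old_hires! probe relies on legacy DeviantArt orig* URLs encoding the post serial in base 36. Worked through on a hypothetical serial number:

# 12345678 (decimal) becomes "7clzi" (base 36), so the probe URL is
# http://orig01.deviantart.net/x_by_x-d7clzi.png
"12345678" |> String.to_integer() |> Integer.to_string(36) |> String.downcase()
# => "7clzi"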

@@ -0,0 +1,30 @@
defmodule Philomena.Scrapers.Raw do
  @mime_types ["image/gif", "image/jpeg", "image/png", "image/svg", "image/svg+xml", "video/webm"]

  @spec can_handle?(URI.t(), String.t()) :: true | false
  def can_handle?(_uri, url) do
    Philomena.Http.head!(url, [], max_body_length: 30_000_000)
    |> case do
      %HTTPoison.Response{status_code: 200, headers: headers} ->
        headers
        |> Enum.any?(fn {k, v} ->
          String.downcase(k) == "content-type" and String.downcase(v) in @mime_types
        end)

      _ ->
        false
    end
  end

  def scrape(_uri, url) do
    %{
      source_url: url,
      images: [
        %{
          url: url,
          camo_url: Camo.Image.image_url(url)
        }
      ]
    }
  end
end
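
Raw is the catch-all for direct links: it only claims a URL when a HEAD request returns 200 with an allow-listed content-type, which keeps arbitrary HTML pages from being scraped as images. For example (hypothetical URL, assuming the server reports image/png):

Philomena.Scrapers.Raw.can_handle?(nil, "https://example.com/picture.png")
# => true, given a 200 response with a content-type of image/png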

@@ -0,0 +1,97 @@
defmodule Philomena.Scrapers.Tumblr do
  @url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
  @inline_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com\/[a-f\d]+\/tumblr(?:_inline)?_[a-z\d]+_\d+\.(?:png\|jpe?g\|gif)|i
  @size_regex ~r|_(\d+)(\..+)\z|
  @sizes [1280, 540, 500, 400, 250, 100, 75]
  @tumblr_ranges [
    InetCidr.parse("66.6.32.0/23"),
    InetCidr.parse("66.6.44.0/24")
  ]

  @spec can_handle?(URI.t(), String.t()) :: true | false
  def can_handle?(uri, url) do
    String.match?(url, @url_regex) and tumblr_domain?(uri.host)
  end

  def scrape(uri, url) do
    [post_id] = Regex.run(@url_regex, url, capture: :all_but_first)

    api_url = "https://api.tumblr.com/v2/blog/#{uri.host}/posts/photo?id=#{post_id}&api_key=#{tumblr_api_key()}"

    Philomena.Http.get!(api_url)
    |> json!()
    |> process_response!()
  end

  defp json!(%HTTPoison.Response{body: body, status_code: 200}),
    do: Jason.decode!(body)

  defp process_response!(%{"response" => %{"posts" => [post | _rest]}}),
    do: process_post!(post)

  defp process_post!(%{"type" => "photo"} = post) do
    images =
      post["photos"]
      |> Enum.map(fn photo ->
        image = upsize(photo["original_size"]["url"])

        %{"url" => preview} =
          Enum.find(photo["alt_sizes"], & &1["width"] == 400) || %{"url" => image}

        %{url: image, camo_url: Camo.Image.image_url(preview)}
      end)

    add_meta(post, images)
  end

  defp process_post!(%{"type" => "text"} = post) do
    images =
      @inline_regex
      |> Regex.scan(post["text"])
      # Regex.scan/2 returns a list of match lists, so destructure the
      # full match out of each inner list.
      |> Enum.map(fn [url] ->
        %{url: upsize(url), camo_url: Camo.Image.image_url(url)}
      end)

    add_meta(post, images)
  end

  defp upsize(image_url) do
    @sizes
    |> Enum.map(&String.replace(image_url, @size_regex, "_#{&1}\\2"))
    |> Enum.find(&url_ok?/1)
  end

  defp url_ok?(url) do
    match?(%HTTPoison.Response{status_code: 200}, Philomena.Http.head!(url))
  end

  defp add_meta(post, images) do
    source = post["post_url"]
    author = post["blog_name"]
    description = post["summary"]

    %{
      source_url: source,
      author_name: author,
      description: description,
      images: images
    }
  end

  defp tumblr_domain?(host) do
    host
    |> String.to_charlist()
    |> :inet_res.lookup(:in, :a)
    |> case do
      [address | _rest] ->
        Enum.any?(@tumblr_ranges, &InetCidr.contains?(&1, address))

      _ ->
        false
    end
  end

  defp tumblr_api_key do
    Application.get_env(:philomena, :tumblr_api_key)
  end
end
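
upsize/1 rewrites the size suffix to each candidate in @sizes (largest first) and keeps the first URL whose HEAD request succeeds. The rewrite itself is a plain regex substitution; on a hypothetical media URL:

String.replace("https://66.media.tumblr.com/abcdef/tumblr_xyz_500.png", ~r|_(\d+)(\..+)\z|, "_1280\\2")
# => "https://66.media.tumblr.com/abcdef/tumblr_xyz_1280.png"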

@@ -0,0 +1,66 @@
defmodule Philomena.Scrapers.Twitter do
  @gt_regex ~r|document.cookie = decodeURIComponent\("gt=(\d+);|
  @url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?|
  @script_regex ~r|<script type="text/javascript" .*? src="(https://abs.twimg.com/responsive-web/web/main\.[\da-z]+\.js)">|
  @bearer_regex ~r|"(AAAAAAAAAAAAA[^"]*)"|
  @user_agent ["User-Agent": "Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"]

  @spec can_handle?(URI.t(), String.t()) :: true | false
  def can_handle?(_uri, url) do
    String.match?(url, @url_regex)
  end

  def scrape(_uri, url) do
    api_response!(url)
    |> extract_data()
  end

  defp extract_data(tweet) do
    images =
      tweet["entities"]["media"]
      |> Enum.map(&%{url: &1["media_url_https"], camo_url: Camo.Image.image_url(&1["media_url_https"])})

    %{
      source_url: tweet["url"],
      author_name: tweet["user"],
      description: tweet["text"],
      images: images
    }
  end

  # We'd like to use the API anonymously. In order to do this, we need to
  # extract the anonymous bearer token. Fortunately, this is pretty easy
  # to identify in the minified mobile script source.
  def api_response!(url) do
    [user, status_id] = Regex.run(@url_regex, url, capture: :all_but_first)

    mobile_url = "https://mobile.twitter.com/#{user}/status/#{status_id}"
    api_url = "https://api.twitter.com/2/timeline/conversation/#{status_id}.json"
    url = "https://twitter.com/#{user}/status/#{status_id}"

    {gt, bearer} =
      Philomena.Http.get!(mobile_url, @user_agent)
      |> Map.get(:body)
      |> extract_guest_token_and_bearer()

    Philomena.Http.get!(api_url, ["Authorization": "Bearer #{bearer}", "x-guest-token": gt])
    |> Map.get(:body)
    |> Jason.decode!()
    |> Map.get("globalObjects")
    |> Map.get("tweets")
    |> Map.get(status_id)
    |> Map.put("user", user)
    |> Map.put("url", url)
  end

  defp extract_guest_token_and_bearer(page) do
    [gt] = Regex.run(@gt_regex, page, capture: :all_but_first)
    [script] = Regex.run(@script_regex, page, capture: :all_but_first)

    %{body: body} = Philomena.Http.get!(script)

    [bearer] = Regex.run(@bearer_regex, body, capture: :all_but_first)

    {gt, bearer}
  end
end
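
Unlike the other scrapers, Twitter matches purely on URL shape, so can_handle?/2 needs no network round-trip. For example (the status URL is made up):

# Canonical and mobile.twitter.com status URLs both match @url_regex.
Philomena.Scrapers.Twitter.can_handle?(nil, "https://twitter.com/someuser/status/1199999999999999999")
# => true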

@@ -120,7 +120,7 @@ defmodule PhilomenaWeb.Router do
     # get "/:forum_id", ForumController, :show # impossible to do without constraints
     get "/:forum_id/:id", TopicController, :show
     get "/:forum_id/:id/:page", TopicController, :show
-    get "/:forum_id/:id/posts/:post_id", TopicController, :show
+    get "/:forum_id/:id/post/:post_id", TopicController, :show
   end

   # Other scopes may use custom stacks.

@@ -21,7 +21,11 @@ defmodule PhilomenaWeb.UserAttributionView do
   end

   def anonymous_avatar(_object, class \\ "avatar--100px") do
-    img_tag(Routes.static_path(PhilomenaWeb.Endpoint, "/images/no_avatar.svg"), class: class)
+    class = Enum.join(["image-constrained", class], " ")
+
+    content_tag :div, [class: class] do
+      img_tag(Routes.static_path(PhilomenaWeb.Endpoint, "/images/no_avatar.svg"))
+    end
   end

   def user_avatar(object, class \\ "avatar--100px")
@@ -54,7 +58,7 @@ defmodule PhilomenaWeb.UserAttributionView do
     do: [{"label--danger", "Site Administrator"} | labels]
   defp staff_role(labels, %{hide_default_role: false, role: "moderator"}),
     do: [{"label--success", "Site Moderator"} | labels]
-  defp staff_role(labels, %{hide_default_role: false, role: "assisant"}),
+  defp staff_role(labels, %{hide_default_role: false, role: "assistant"}),
     do: [{"label--purple", "Site Assistant"} | labels]
   defp staff_role(labels, _user),
     do: labels