From fbfa572a1e9ff0407d5dbd59ccc5e0ff2b257e33 Mon Sep 17 00:00:00 2001
From: "byte[]"
Date: Thu, 28 Nov 2019 12:12:10 -0500
Subject: [PATCH] add scrapers

---
 config/config.exs                    |   5 +-
 config/prod.secret.exs               |   2 +
 lib/philomena/http.ex                |  17 +++
 lib/philomena/processors/webm.ex     |   4 +-
 lib/philomena/scrapers.ex            |  24 ++++
 lib/philomena/scrapers/deviantart.ex | 135 ++++++++++++++++++
 lib/philomena/scrapers/raw.ex        |  30 ++++
 lib/philomena/scrapers/tumblr.ex     |  97 +++++++++++++
 lib/philomena/scrapers/twitter.ex    |  66 +++++++++
 lib/philomena_web/router.ex          |   2 +-
 .../views/user_attribution_view.ex   |   8 +-
 11 files changed, 384 insertions(+), 6 deletions(-)
 create mode 100644 lib/philomena/http.ex
 create mode 100644 lib/philomena/scrapers.ex
 create mode 100644 lib/philomena/scrapers/deviantart.ex
 create mode 100644 lib/philomena/scrapers/raw.ex
 create mode 100644 lib/philomena/scrapers/tumblr.ex
 create mode 100644 lib/philomena/scrapers/twitter.ex

diff --git a/config/config.exs b/config/config.exs
index f72ebe32..ed213b64 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -12,10 +12,13 @@ config :philomena,
   elasticsearch_url: "http://localhost:9200",
   password_pepper: "dn2e0EpZrvBLoxUM3gfQveBhjf0bG/6/bYhrOyq3L3hV9hdo/bimJ+irbDWsuXLP",
   otp_secret_key: "Wn7O/8DD+qxL0X4X7bvT90wOkVGcA90bIHww4twR03Ci//zq7PnMw8ypqyyT/b/C",
+  tumblr_api_key: "fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4",
   image_url_root: "/img",
   avatar_url_root: "/avatars",
   badge_url_root: "/media",
-  image_file_root: "priv/static/system/images"
+  image_file_root: "priv/static/system/images",
+  cdn_host: "",
+  proxy_host: nil
 
 config :philomena, :pow,
   user: Philomena.Users.User,
diff --git a/config/prod.secret.exs b/config/prod.secret.exs
index 5e0c2a00..a612dfab 100644
--- a/config/prod.secret.exs
+++ b/config/prod.secret.exs
@@ -19,10 +19,12 @@ config :philomena,
   password_pepper: System.get_env("PASSWORD_PEPPER"),
   avatar_url_root: System.get_env("AVATAR_URL_ROOT"),
   image_file_root: System.get_env("IMAGE_FILE_ROOT"),
+  tumblr_api_key: System.get_env("TUMBLR_API_KEY"),
   otp_secret_key: System.get_env("OTP_SECRET_KEY"),
   image_url_root: System.get_env("IMAGE_URL_ROOT"),
   badge_url_root: System.get_env("BADGE_URL_ROOT"),
   mailer_address: System.get_env("MAILER_ADDRESS"),
+  proxy_host: System.get_env("PROXY_HOST"),
   camo_host: System.get_env("CAMO_HOST"),
   camo_key: System.get_env("CAMO_KEY"),
   cdn_host: System.get_env("CDN_HOST")
diff --git a/lib/philomena/http.ex b/lib/philomena/http.ex
new file mode 100644
index 00000000..12705e89
--- /dev/null
+++ b/lib/philomena/http.ex
@@ -0,0 +1,17 @@
+defmodule Philomena.Http do
+  def get!(url, headers \\ [], options \\ []) do
+    options = Keyword.merge(options, proxy: proxy_host())
+
+    HTTPoison.get!(url, headers, options)
+  end
+
+  def head!(url, headers \\ [], options \\ []) do
+    options = Keyword.merge(options, proxy: proxy_host())
+
+    HTTPoison.head!(url, headers, options)
+  end
+
+  defp proxy_host do
+    Application.get_env(:philomena, :proxy_host)
+  end
+end
\ No newline at end of file
diff --git a/lib/philomena/processors/webm.ex b/lib/philomena/processors/webm.ex
index 6aa5b1ae..aad0923d 100644
--- a/lib/philomena/processors/webm.ex
+++ b/lib/philomena/processors/webm.ex
@@ -30,10 +30,10 @@ defmodule Philomena.Processors.Webm do
   end
   defp scale_if_smaller(file, palette, dimensions, {:full, _target_dim}) do
-    {webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
+    {_webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
 
     [
-      {:copy, webm, "full.webm"},
+      {:symlink_original, "full.webm"},
"full.webm"}, {:copy, mp4, "full.mp4"} ] end diff --git a/lib/philomena/scrapers.ex b/lib/philomena/scrapers.ex new file mode 100644 index 00000000..39668e5b --- /dev/null +++ b/lib/philomena/scrapers.ex @@ -0,0 +1,24 @@ +defmodule Philomena.Scrapers do + @scrapers [ + Philomena.Scrapers.Deviantart, + Philomena.Scrapers.Twitter, + Philomena.Scrapers.Tumblr, + Philomena.Scrapers.Raw + ] + + def scrape!(url) do + uri = URI.parse(url) + + @scrapers + |> Enum.find(& &1.can_handle?(uri, url)) + |> wrap() + |> Enum.map(& &1.scrape(uri, url)) + |> unwrap() + end + + defp wrap(nil), do: [] + defp wrap(res), do: [res] + + defp unwrap([result]), do: result + defp unwrap(_result), do: nil +end \ No newline at end of file diff --git a/lib/philomena/scrapers/deviantart.ex b/lib/philomena/scrapers/deviantart.ex new file mode 100644 index 00000000..b6c1841e --- /dev/null +++ b/lib/philomena/scrapers/deviantart.ex @@ -0,0 +1,135 @@ +defmodule Philomena.Scrapers.Deviantart do + @image_regex ~r|| + @source_regex ~r|| + @artist_regex ~r|https://www.deviantart.com/([^/]*)/art| + @serial_regex ~r|https://www.deviantart.com/(?:.*?)-(\d+)\z| + @cdnint_regex ~r|(https://images-wixmp-[0-9a-f]+.wixmp.com)(?:/intermediary)?/f/([^/]*)/([^/?]*)| + @png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)| + @jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)| + + @spec can_handle?(URI.t(), String.t()) :: true | false + def can_handle?(uri, _url) do + String.ends_with?(uri.host, "deviantart.com") + end + + # https://github.com/DeviantArt/DeviantArt-API/issues/153 + # + # Note that Erlang (and by extension Elixir) do not have any sort of + # reliable HTML/XML parsers that can accept untrusted input. As an example, + # xmerl is vulnerable to almost every XML attack which has ever been + # created, and also exposes the runtime to symbol DoS as an added bonus. + # + # So, regex it is. Eat dirt, deviantart. You don't deserve the respect + # artists give you. + def scrape(_uri, url) do + url + |> Philomena.Http.get!([], follow_redirect: true, max_redirect: 2) + |> extract_data!() + |> try_intermediary_hires!() + |> try_new_hires!() + |> try_old_hires!() + end + + defp extract_data!(%HTTPoison.Response{body: body, status_code: 200}) do + [image] = Regex.run(@image_regex, body, capture: :all_but_first) + [source] = Regex.run(@source_regex, body, capture: :all_but_first) + [artist] = Regex.run(@artist_regex, source, capture: :all_but_first) + + %{ + source_url: source, + author_name: artist, + images: [ + %{ + url: image, + camo_url: Camo.Image.image_url(image) + } + ] + } + end + + defp try_intermediary_hires!(%{images: [image]} = data) do + [domain, object_uuid, object_name] = Regex.run(@cdnint_regex, image.url, capture: :all_but_first) + + built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}" + + case Philomena.Http.head!(built_url) do + %HTTPoison.Response{status_code: 200} -> + # This is the high resolution URL. + + %{ + data | + images: [ + %{ + url: built_url, + camo_url: image.camo_url + } + ] + } + + _ -> + # Nothing to be found here, move along... 
+        data
+    end
+  end
+
+  defp try_new_hires!(%{images: [image]} = data) do
+    cond do
+      String.match?(image.url, @png_regex) ->
+        %{
+          data |
+          images: [
+            %{
+              url: String.replace(image.url, @png_regex, "\\1.png\\3"),
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      String.match?(image.url, @jpg_regex) ->
+        %{
+          data |
+          images: [
+            %{
+              url: String.replace(image.url, @jpg_regex, "\\g{1}100\\3"),
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      true ->
+        # Nothing to be found here, move along...
+        data
+    end
+  end
+
+  defp try_old_hires!(%{source_url: source, images: [image]} = data) do
+    [serial] = Regex.run(@serial_regex, source, capture: :all_but_first)
+    base36 =
+      serial
+      |> String.to_integer()
+      |> Integer.to_string(36)
+      |> String.downcase()
+
+    built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"
+
+    case Philomena.Http.get!(built_url) do
+      %HTTPoison.Response{status_code: 301, headers: headers} ->
+        # Location header provides URL of high res image.
+        {_location, link} = Enum.find(headers, fn {header, _val} -> header == "Location" end)
+
+        %{
+          data |
+          images: [
+            %{
+              url: link,
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      _ ->
+        # Nothing to be found here, move along...
+        data
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/philomena/scrapers/raw.ex b/lib/philomena/scrapers/raw.ex
new file mode 100644
index 00000000..548f34c5
--- /dev/null
+++ b/lib/philomena/scrapers/raw.ex
@@ -0,0 +1,30 @@
+defmodule Philomena.Scrapers.Raw do
+  @mime_types ["image/gif", "image/jpeg", "image/png", "image/svg", "image/svg+xml", "video/webm"]
+
+  @spec can_handle?(URI.t(), String.t()) :: true | false
+  def can_handle?(_uri, url) do
+    Philomena.Http.head!(url, [], max_body_length: 30_000_000)
+    |> case do
+      %HTTPoison.Response{status_code: 200, headers: headers} ->
+        headers
+        |> Enum.any?(fn {k, v} ->
+          String.downcase(k) == "content-type" and String.downcase(v) in @mime_types
+        end)
+
+      _ ->
+        false
+    end
+  end
+
+  def scrape(_uri, url) do
+    %{
+      source_url: url,
+      images: [
+        %{
+          url: url,
+          camo_url: Camo.Image.image_url(url)
+        }
+      ]
+    }
+  end
+end
\ No newline at end of file
diff --git a/lib/philomena/scrapers/tumblr.ex b/lib/philomena/scrapers/tumblr.ex
new file mode 100644
index 00000000..a720e932
--- /dev/null
+++ b/lib/philomena/scrapers/tumblr.ex
@@ -0,0 +1,97 @@
+defmodule Philomena.Scrapers.Tumblr do
+  @url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
+  @inline_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com\/[a-f\d]+\/tumblr(?:_inline)?_[a-z\d]+_\d+\.(?:png\|jpe?g\|gif)|i
+  @size_regex ~r|_(\d+)(\..+)\z|
+  @sizes [1280, 540, 500, 400, 250, 100, 75]
+  @tumblr_ranges [
+    InetCidr.parse("66.6.32.0/23"),
+    InetCidr.parse("66.6.44.0/24")
+  ]
+
+  @spec can_handle?(URI.t(), String.t()) :: true | false
+  def can_handle?(uri, url) do
+    String.match?(url, @url_regex) and tumblr_domain?(uri.host)
+  end
+
+  def scrape(uri, url) do
+    [post_id] = Regex.run(@url_regex, url, capture: :all_but_first)
+
+    api_url = "https://api.tumblr.com/v2/blog/#{uri.host}/posts/photo?id=#{post_id}&api_key=#{tumblr_api_key()}"
+
+    Philomena.Http.get!(api_url)
+    |> json!()
+    |> process_response!()
+  end
+
+  defp json!(%HTTPoison.Response{body: body, status_code: 200}),
+    do: Jason.decode!(body)
+
+  defp process_response!(%{"response" => %{"posts" => [post | _rest]}}),
+    do: process_post!(post)
+
+  defp process_post!(%{"type" => "photo"} = post) do
+    images =
+      post["photos"]
+      |> Enum.map(fn photo ->
+        image = upsize(photo["original_size"]["url"])
+
+        %{"url" => preview} =
+          Enum.find(photo["alt_sizes"], & &1["width"] == 400) || %{"url" => image}
+
+        %{url: image, camo_url: Camo.Image.image_url(preview)}
+      end)
+
+    add_meta(post, images)
+  end
+
+  defp process_post!(%{"type" => "text"} = post) do
+    images =
+      @inline_regex
+      |> Regex.scan(post["text"])
+      |> Enum.map(fn url ->
+        %{url: upsize(url), camo_url: Camo.Image.image_url(url)}
+      end)
+
+    add_meta(post, images)
+  end
+
+  defp upsize(image_url) do
+    @sizes
+    |> Enum.map(&String.replace(image_url, @size_regex, "_#{&1}\\2"))
+    |> Enum.find(&url_ok?/1)
+  end
+
+  defp url_ok?(url) do
+    match?(%HTTPoison.Response{status_code: 200}, Philomena.Http.head!(url))
+  end
+
+  defp add_meta(post, images) do
+    source = post["post_url"]
+    author = post["blog_name"]
+    description = post["summary"]
+
+    %{
+      source_url: source,
+      author_name: author,
+      description: description,
+      images: images
+    }
+  end
+
+  defp tumblr_domain?(host) do
+    host
+    |> String.to_charlist()
+    |> :inet_res.lookup(:in, :a)
+    |> case do
+      [address | _rest] ->
+        Enum.any?(@tumblr_ranges, &InetCidr.contains?(&1, address))
+
+      _ ->
+        false
+    end
+  end
+
+  defp tumblr_api_key do
+    Application.get_env(:philomena, :tumblr_api_key)
+  end
+end
\ No newline at end of file
diff --git a/lib/philomena/scrapers/twitter.ex b/lib/philomena/scrapers/twitter.ex
new file mode 100644
index 00000000..f036fbc1
--- /dev/null
+++ b/lib/philomena/scrapers/twitter.ex
@@ -0,0 +1,66 @@
+defmodule Philomena.Scrapers.Twitter do
+  @gt_regex ~r|document.cookie = decodeURIComponent\("gt=(\d+);|
+  @url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?|
+  @script_regex ~r|