add scrapers

byte[] 2019-11-28 12:12:10 -05:00
parent d3b45e303d
commit fbfa572a1e
11 changed files with 384 additions and 6 deletions

@@ -12,10 +12,13 @@ config :philomena,
   elasticsearch_url: "http://localhost:9200",
   password_pepper: "dn2e0EpZrvBLoxUM3gfQveBhjf0bG/6/bYhrOyq3L3hV9hdo/bimJ+irbDWsuXLP",
   otp_secret_key: "Wn7O/8DD+qxL0X4X7bvT90wOkVGcA90bIHww4twR03Ci//zq7PnMw8ypqyyT/b/C",
+  tumblr_api_key: "fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4",
   image_url_root: "/img",
   avatar_url_root: "/avatars",
   badge_url_root: "/media",
-  image_file_root: "priv/static/system/images"
+  image_file_root: "priv/static/system/images",
+  cdn_host: "",
+  proxy_host: nil

 config :philomena, :pow,
   user: Philomena.Users.User,
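
The new proxy_host key is read by lib/philomena/http.ex (added below) and handed to HTTPoison as its :proxy option, so all scraper traffic can be routed through an egress proxy when one is configured. As an illustrative sketch only (the proxy URL is hypothetical), a deployment might override it like so:

config :philomena,
  proxy_host: "http://scraper-egress.internal:3128"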

@@ -19,10 +19,12 @@ config :philomena,
   password_pepper: System.get_env("PASSWORD_PEPPER"),
   avatar_url_root: System.get_env("AVATAR_URL_ROOT"),
   image_file_root: System.get_env("IMAGE_FILE_ROOT"),
+  tumblr_api_key: System.get_env("TUMBLR_API_KEY"),
   otp_secret_key: System.get_env("OTP_SECRET_KEY"),
   image_url_root: System.get_env("IMAGE_URL_ROOT"),
   badge_url_root: System.get_env("BADGE_URL_ROOT"),
   mailer_address: System.get_env("MAILER_ADDRESS"),
+  proxy_host: System.get_env("PROXY_HOST"),
   camo_host: System.get_env("CAMO_HOST"),
   camo_key: System.get_env("CAMO_KEY"),
   cdn_host: System.get_env("CDN_HOST")

lib/philomena/http.ex (new file)

@@ -0,0 +1,17 @@
defmodule Philomena.Http do
  def get!(url, headers \\ [], options \\ []) do
    options = Keyword.merge(options, proxy: proxy_host())

    HTTPoison.get!(url, headers, options)
  end

  def head!(url, headers \\ [], options \\ []) do
    options = Keyword.merge(options, proxy: proxy_host())

    HTTPoison.head!(url, headers, options)
  end

  defp proxy_host do
    Application.get_env(:philomena, :proxy_host)
  end
end
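
Philomena.Http is a thin wrapper whose only job is to inject the configured proxy; callers otherwise use it exactly like HTTPoison. A minimal usage sketch (the URL is a placeholder):

# Behaves like HTTPoison.get!/3, but routed through :proxy_host when set.
%HTTPoison.Response{status_code: 200} =
  Philomena.Http.get!("https://example.com/image.png", [], follow_redirect: true)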

@@ -30,10 +30,10 @@ defmodule Philomena.Processors.Webm do
   end

   defp scale_if_smaller(file, palette, dimensions, {:full, _target_dim}) do
-    {webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
+    {_webm, mp4} = scale_videos(file, palette, dimensions, dimensions)

     [
-      {:copy, webm, "full.webm"},
+      {:symlink_original, "full.webm"},
       {:copy, mp4, "full.mp4"}
     ]
   end

lib/philomena/scrapers.ex (new file)

@@ -0,0 +1,24 @@
defmodule Philomena.Scrapers do
  @scrapers [
    Philomena.Scrapers.Deviantart,
    Philomena.Scrapers.Twitter,
    Philomena.Scrapers.Tumblr,
    Philomena.Scrapers.Raw
  ]

  def scrape!(url) do
    uri = URI.parse(url)

    @scrapers
    |> Enum.find(& &1.can_handle?(uri, url))
    |> wrap()
    |> Enum.map(& &1.scrape(uri, url))
    |> unwrap()
  end

  defp wrap(nil), do: []
  defp wrap(res), do: [res]

  defp unwrap([result]), do: result
  defp unwrap(_result), do: nil
end
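
scrape!/1 dispatches to the first module whose can_handle?/2 returns true, in the priority order given by @scrapers; the wrap/unwrap pair turns "no scraper matched" into nil rather than raising. An illustrative call (the URL is hypothetical):

# Deviantart matches on hostname, so this dispatches to Philomena.Scrapers.Deviantart.
Philomena.Scrapers.scrape!("https://www.deviantart.com/someartist/art/some-title-12345678")
# => a %{source_url: ..., author_name: ..., images: [...]} map, or nil if nothing matched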

@@ -0,0 +1,135 @@
defmodule Philomena.Scrapers.Deviantart do
  @image_regex ~r|<link data-rh="true" rel="preload" href="([^"]*)" as="image"/>|
  @source_regex ~r|<link data-rh="true" rel="canonical" href="([^"]*)"/>|
  @artist_regex ~r|https://www.deviantart.com/([^/]*)/art|
  @serial_regex ~r|https://www.deviantart.com/(?:.*?)-(\d+)\z|
  @cdnint_regex ~r|(https://images-wixmp-[0-9a-f]+.wixmp.com)(?:/intermediary)?/f/([^/]*)/([^/?]*)|
  @png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)|
  @jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)|

  @spec can_handle?(URI.t(), String.t()) :: true | false
  def can_handle?(uri, _url) do
    String.ends_with?(uri.host, "deviantart.com")
  end

  # https://github.com/DeviantArt/DeviantArt-API/issues/153
  #
  # Note that Erlang (and by extension Elixir) does not have any sort of
  # reliable HTML/XML parser that can accept untrusted input. As an example,
  # xmerl is vulnerable to almost every XML attack which has ever been
  # created, and also exposes the runtime to symbol DoS as an added bonus.
  #
  # So, regex it is. Eat dirt, deviantart. You don't deserve the respect
  # artists give you.
  def scrape(_uri, url) do
    url
    |> Philomena.Http.get!([], follow_redirect: true, max_redirect: 2)
    |> extract_data!()
    |> try_intermediary_hires!()
    |> try_new_hires!()
    |> try_old_hires!()
  end

  defp extract_data!(%HTTPoison.Response{body: body, status_code: 200}) do
    [image] = Regex.run(@image_regex, body, capture: :all_but_first)
    [source] = Regex.run(@source_regex, body, capture: :all_but_first)
    [artist] = Regex.run(@artist_regex, source, capture: :all_but_first)

    %{
      source_url: source,
      author_name: artist,
      images: [
        %{
          url: image,
          camo_url: Camo.Image.image_url(image)
        }
      ]
    }
  end

  defp try_intermediary_hires!(%{images: [image]} = data) do
    [domain, object_uuid, object_name] =
      Regex.run(@cdnint_regex, image.url, capture: :all_but_first)

    built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}"

    case Philomena.Http.head!(built_url) do
      %HTTPoison.Response{status_code: 200} ->
        # This is the high resolution URL.
        %{
          data |
          images: [
            %{
              url: built_url,
              camo_url: image.camo_url
            }
          ]
        }

      _ ->
        # Nothing to be found here, move along...
        data
    end
  end

  defp try_new_hires!(%{images: [image]} = data) do
    cond do
      String.match?(image.url, @png_regex) ->
        %{
          data |
          images: [
            %{
              url: String.replace(image.url, @png_regex, "\\1.png\\3"),
              camo_url: image.camo_url
            }
          ]
        }

      String.match?(image.url, @jpg_regex) ->
        %{
          data |
          images: [
            %{
              url: String.replace(image.url, @jpg_regex, "\\g{1}100\\3"),
              camo_url: image.camo_url
            }
          ]
        }

      true ->
        # Nothing to be found here, move along...
        data
    end
  end

  defp try_old_hires!(%{source_url: source, images: [image]} = data) do
    [serial] = Regex.run(@serial_regex, source, capture: :all_but_first)

    base36 =
      serial
      |> String.to_integer()
      |> Integer.to_string(36)
      |> String.downcase()

    built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"

    case Philomena.Http.get!(built_url) do
      %HTTPoison.Response{status_code: 301, headers: headers} ->
        # Location header provides URL of high res image.
        {_location, link} = Enum.find(headers, fn {header, _val} -> header == "Location" end)

        %{
          data |
          images: [
            %{
              url: link,
              camo_url: image.camo_url
            }
          ]
        }

      _ ->
        # Nothing to be found here, move along...
        data
    end
  end
end
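
The try_old_hires! probe relies on legacy DeviantArt orig* URLs encoding the post serial in base 36. Worked through on a hypothetical serial number:

# 12345678 (decimal) becomes "7clzi" (base 36), so the probe URL is
# http://orig01.deviantart.net/x_by_x-d7clzi.png
"12345678" |> String.to_integer() |> Integer.to_string(36) |> String.downcase()
# => "7clzi"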

@@ -0,0 +1,30 @@
defmodule Philomena.Scrapers.Raw do
  @mime_types ["image/gif", "image/jpeg", "image/png", "image/svg", "image/svg+xml", "video/webm"]

  @spec can_handle?(URI.t(), String.t()) :: true | false
  def can_handle?(_uri, url) do
    Philomena.Http.head!(url, [], max_body_length: 30_000_000)
    |> case do
      %HTTPoison.Response{status_code: 200, headers: headers} ->
        headers
        |> Enum.any?(fn {k, v} ->
          String.downcase(k) == "content-type" and String.downcase(v) in @mime_types
        end)

      _ ->
        false
    end
  end

  def scrape(_uri, url) do
    %{
      source_url: url,
      images: [
        %{
          url: url,
          camo_url: Camo.Image.image_url(url)
        }
      ]
    }
  end
end
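
Raw is the catch-all for direct links: it only claims a URL when a HEAD request returns 200 with an allow-listed content-type, which keeps arbitrary HTML pages from being scraped as images. For example (hypothetical URL, assuming the server reports image/png):

Philomena.Scrapers.Raw.can_handle?(nil, "https://example.com/picture.png")
# => true, given a 200 response with a content-type of image/png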

@@ -0,0 +1,97 @@
defmodule Philomena.Scrapers.Tumblr do
  @url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
  @inline_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com\/[a-f\d]+\/tumblr(?:_inline)?_[a-z\d]+_\d+\.(?:png\|jpe?g\|gif)|i
  @size_regex ~r|_(\d+)(\..+)\z|
  @sizes [1280, 540, 500, 400, 250, 100, 75]
  @tumblr_ranges [
    InetCidr.parse("66.6.32.0/23"),
    InetCidr.parse("66.6.44.0/24")
  ]

  @spec can_handle?(URI.t(), String.t()) :: true | false
  def can_handle?(uri, url) do
    String.match?(url, @url_regex) and tumblr_domain?(uri.host)
  end

  def scrape(uri, url) do
    [post_id] = Regex.run(@url_regex, url, capture: :all_but_first)

    api_url = "https://api.tumblr.com/v2/blog/#{uri.host}/posts/photo?id=#{post_id}&api_key=#{tumblr_api_key()}"

    Philomena.Http.get!(api_url)
    |> json!()
    |> process_response!()
  end

  defp json!(%HTTPoison.Response{body: body, status_code: 200}),
    do: Jason.decode!(body)

  defp process_response!(%{"response" => %{"posts" => [post | _rest]}}),
    do: process_post!(post)

  defp process_post!(%{"type" => "photo"} = post) do
    images =
      post["photos"]
      |> Enum.map(fn photo ->
        image = upsize(photo["original_size"]["url"])

        %{"url" => preview} =
          Enum.find(photo["alt_sizes"], & &1["width"] == 400) || %{"url" => image}

        %{url: image, camo_url: Camo.Image.image_url(preview)}
      end)

    add_meta(post, images)
  end

  defp process_post!(%{"type" => "text"} = post) do
    images =
      @inline_regex
      |> Regex.scan(post["text"])
      # Regex.scan/2 returns a list of match lists, so destructure the
      # full match out of each inner list.
      |> Enum.map(fn [url] ->
        %{url: upsize(url), camo_url: Camo.Image.image_url(url)}
      end)

    add_meta(post, images)
  end

  defp upsize(image_url) do
    @sizes
    |> Enum.map(&String.replace(image_url, @size_regex, "_#{&1}\\2"))
    |> Enum.find(&url_ok?/1)
  end

  defp url_ok?(url) do
    match?(%HTTPoison.Response{status_code: 200}, Philomena.Http.head!(url))
  end

  defp add_meta(post, images) do
    source = post["post_url"]
    author = post["blog_name"]
    description = post["summary"]

    %{
      source_url: source,
      author_name: author,
      description: description,
      images: images
    }
  end

  defp tumblr_domain?(host) do
    host
    |> String.to_charlist()
    |> :inet_res.lookup(:in, :a)
    |> case do
      [address | _rest] ->
        Enum.any?(@tumblr_ranges, &InetCidr.contains?(&1, address))

      _ ->
        false
    end
  end

  defp tumblr_api_key do
    Application.get_env(:philomena, :tumblr_api_key)
  end
end
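
upsize/1 rewrites the size suffix to each candidate in @sizes (largest first) and keeps the first URL whose HEAD request succeeds. The rewrite itself is a plain regex substitution; on a hypothetical media URL:

String.replace("https://66.media.tumblr.com/abcdef/tumblr_xyz_500.png", ~r|_(\d+)(\..+)\z|, "_1280\\2")
# => "https://66.media.tumblr.com/abcdef/tumblr_xyz_1280.png"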

@@ -0,0 +1,66 @@
defmodule Philomena.Scrapers.Twitter do
  @gt_regex ~r|document.cookie = decodeURIComponent\("gt=(\d+);|
  @url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?|
  @script_regex ~r|<script type="text/javascript" .*? src="(https://abs.twimg.com/responsive-web/web/main\.[\da-z]+\.js)">|
  @bearer_regex ~r|"(AAAAAAAAAAAAA[^"]*)"|
  @user_agent ["User-Agent": "Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"]

  @spec can_handle?(URI.t(), String.t()) :: true | false
  def can_handle?(_uri, url) do
    String.match?(url, @url_regex)
  end

  def scrape(_uri, url) do
    api_response!(url)
    |> extract_data()
  end

  defp extract_data(tweet) do
    images =
      tweet["entities"]["media"]
      |> Enum.map(&%{url: &1["media_url_https"], camo_url: Camo.Image.image_url(&1["media_url_https"])})

    %{
      source_url: tweet["url"],
      author_name: tweet["user"],
      description: tweet["text"],
      images: images
    }
  end

  # We'd like to use the API anonymously. In order to do this, we need to
  # extract the anonymous bearer token. Fortunately, this is pretty easy
  # to identify in the minified mobile script source.
  def api_response!(url) do
    [user, status_id] = Regex.run(@url_regex, url, capture: :all_but_first)

    mobile_url = "https://mobile.twitter.com/#{user}/status/#{status_id}"
    api_url = "https://api.twitter.com/2/timeline/conversation/#{status_id}.json"
    url = "https://twitter.com/#{user}/status/#{status_id}"

    {gt, bearer} =
      Philomena.Http.get!(mobile_url, @user_agent)
      |> Map.get(:body)
      |> extract_guest_token_and_bearer()

    Philomena.Http.get!(api_url, ["Authorization": "Bearer #{bearer}", "x-guest-token": gt])
    |> Map.get(:body)
    |> Jason.decode!()
    |> Map.get("globalObjects")
    |> Map.get("tweets")
    |> Map.get(status_id)
    |> Map.put("user", user)
    |> Map.put("url", url)
  end

  defp extract_guest_token_and_bearer(page) do
    [gt] = Regex.run(@gt_regex, page, capture: :all_but_first)
    [script] = Regex.run(@script_regex, page, capture: :all_but_first)

    %{body: body} = Philomena.Http.get!(script)

    [bearer] = Regex.run(@bearer_regex, body, capture: :all_but_first)

    {gt, bearer}
  end
end
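
Unlike the other scrapers, Twitter matches purely on URL shape, so can_handle?/2 needs no network round-trip. For example (the status URL is made up):

# Canonical and mobile.twitter.com status URLs both match @url_regex.
Philomena.Scrapers.Twitter.can_handle?(nil, "https://twitter.com/someuser/status/1199999999999999999")
# => true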

@@ -120,7 +120,7 @@ defmodule PhilomenaWeb.Router do
     # get "/:forum_id", ForumController, :show # impossible to do without constraints
     get "/:forum_id/:id", TopicController, :show
     get "/:forum_id/:id/:page", TopicController, :show
-    get "/:forum_id/:id/posts/:post_id", TopicController, :show
+    get "/:forum_id/:id/post/:post_id", TopicController, :show
   end

   # Other scopes may use custom stacks.

@@ -21,7 +21,11 @@ defmodule PhilomenaWeb.UserAttributionView do
   end

   def anonymous_avatar(_object, class \\ "avatar--100px") do
-    img_tag(Routes.static_path(PhilomenaWeb.Endpoint, "/images/no_avatar.svg"), class: class)
+    class = Enum.join(["image-constrained", class], " ")
+
+    content_tag :div, [class: class] do
+      img_tag(Routes.static_path(PhilomenaWeb.Endpoint, "/images/no_avatar.svg"))
+    end
   end

   def user_avatar(object, class \\ "avatar--100px")
@@ -54,7 +58,7 @@ defmodule PhilomenaWeb.UserAttributionView do
     do: [{"label--danger", "Site Administrator"} | labels]
   defp staff_role(labels, %{hide_default_role: false, role: "moderator"}),
     do: [{"label--success", "Site Moderator"} | labels]
-  defp staff_role(labels, %{hide_default_role: false, role: "assisant"}),
+  defp staff_role(labels, %{hide_default_role: false, role: "assistant"}),
     do: [{"label--purple", "Site Assistant"} | labels]
   defp staff_role(labels, _user),
     do: labels