add scrapers
commit fbfa572a1e (parent d3b45e303d)
11 changed files with 384 additions and 6 deletions

@@ -12,10 +12,13 @@ config :philomena,
   elasticsearch_url: "http://localhost:9200",
   password_pepper: "dn2e0EpZrvBLoxUM3gfQveBhjf0bG/6/bYhrOyq3L3hV9hdo/bimJ+irbDWsuXLP",
   otp_secret_key: "Wn7O/8DD+qxL0X4X7bvT90wOkVGcA90bIHww4twR03Ci//zq7PnMw8ypqyyT/b/C",
+  tumblr_api_key: "fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4",
   image_url_root: "/img",
   avatar_url_root: "/avatars",
   badge_url_root: "/media",
-  image_file_root: "priv/static/system/images"
+  image_file_root: "priv/static/system/images",
+  cdn_host: "",
+  proxy_host: nil
 
 config :philomena, :pow,
   user: Philomena.Users.User,

@@ -19,10 +19,12 @@ config :philomena,
   password_pepper: System.get_env("PASSWORD_PEPPER"),
   avatar_url_root: System.get_env("AVATAR_URL_ROOT"),
   image_file_root: System.get_env("IMAGE_FILE_ROOT"),
+  tumblr_api_key: System.get_env("TUMBLR_API_KEY"),
   otp_secret_key: System.get_env("OTP_SECRET_KEY"),
   image_url_root: System.get_env("IMAGE_URL_ROOT"),
   badge_url_root: System.get_env("BADGE_URL_ROOT"),
   mailer_address: System.get_env("MAILER_ADDRESS"),
+  proxy_host: System.get_env("PROXY_HOST"),
   camo_host: System.get_env("CAMO_HOST"),
   camo_key: System.get_env("CAMO_KEY"),
   cdn_host: System.get_env("CDN_HOST")
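
Both blocks feed the same application environment, which the new modules below read back with Application.get_env/2. A quick sanity check in iex, assuming the development defaults above:

iex> Application.get_env(:philomena, :proxy_host)
nil
iex> Application.get_env(:philomena, :tumblr_api_key)
"fuiKNFp9vQFvjLNvx4sUwti4Yb5yGutBN4Xh10LXZhhRKjWlV4"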

lib/philomena/http.ex (new file, +17)
@@ -0,0 +1,17 @@
+defmodule Philomena.Http do
+  def get!(url, headers \\ [], options \\ []) do
+    options = Keyword.merge(options, proxy: proxy_host())
+
+    HTTPoison.get!(url, headers, options)
+  end
+
+  def head!(url, headers \\ [], options \\ []) do
+    options = Keyword.merge(options, proxy: proxy_host())
+
+    HTTPoison.head!(url, headers, options)
+  end
+
+  defp proxy_host do
+    Application.get_env(:philomena, :proxy_host)
+  end
+end
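
Every scraper below goes through this wrapper, so the configured :proxy_host is threaded into each outbound request; with the default of nil, the request goes out directly. A minimal usage sketch (the URL is illustrative):

iex> %HTTPoison.Response{status_code: code} =
...>   Philomena.Http.get!("https://example.com/", [], follow_redirect: true)
iex> code
200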

@@ -30,10 +30,10 @@ defmodule Philomena.Processors.Webm do
   end
 
   defp scale_if_smaller(file, palette, dimensions, {:full, _target_dim}) do
-    {webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
+    {_webm, mp4} = scale_videos(file, palette, dimensions, dimensions)
 
     [
-      {:copy, webm, "full.webm"},
+      {:symlink_original, "full.webm"},
       {:copy, mp4, "full.mp4"}
     ]
   end

lib/philomena/scrapers.ex (new file, +24)
@@ -0,0 +1,24 @@
+defmodule Philomena.Scrapers do
+  @scrapers [
+    Philomena.Scrapers.Deviantart,
+    Philomena.Scrapers.Twitter,
+    Philomena.Scrapers.Tumblr,
+    Philomena.Scrapers.Raw
+  ]
+
+  def scrape!(url) do
+    uri = URI.parse(url)
+
+    @scrapers
+    |> Enum.find(& &1.can_handle?(uri, url))
+    |> wrap()
+    |> Enum.map(& &1.scrape(uri, url))
+    |> unwrap()
+  end
+
+  defp wrap(nil), do: []
+  defp wrap(res), do: [res]
+
+  defp unwrap([result]), do: result
+  defp unwrap(_result), do: nil
+end
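
The dispatcher returns the first matching scraper's result map, or nil when no scraper claims the URL. The shape of a successful call, sketched under the assumption that the (hypothetical) URL HEADs as image/png and therefore falls through to the Raw scraper (camo URL elided):

iex> Philomena.Scrapers.scrape!("https://example.com/image.png")
%{
  source_url: "https://example.com/image.png",
  images: [%{url: "https://example.com/image.png", camo_url: "..."}]
}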

lib/philomena/scrapers/deviantart.ex (new file, +135)
@@ -0,0 +1,135 @@
+defmodule Philomena.Scrapers.Deviantart do
+  @image_regex ~r|<link data-rh="true" rel="preload" href="([^"]*)" as="image"/>|
+  @source_regex ~r|<link data-rh="true" rel="canonical" href="([^"]*)"/>|
+  @artist_regex ~r|https://www.deviantart.com/([^/]*)/art|
+  @serial_regex ~r|https://www.deviantart.com/(?:.*?)-(\d+)\z|
+  @cdnint_regex ~r|(https://images-wixmp-[0-9a-f]+.wixmp.com)(?:/intermediary)?/f/([^/]*)/([^/?]*)|
+  @png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)|
+  @jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)|
+
+  @spec can_handle?(URI.t(), String.t()) :: true | false
+  def can_handle?(uri, _url) do
+    String.ends_with?(uri.host, "deviantart.com")
+  end
+
+  # https://github.com/DeviantArt/DeviantArt-API/issues/153
+  #
+  # Note that Erlang (and by extension Elixir) has no reliable HTML/XML
+  # parser that can accept untrusted input. As an example, xmerl is
+  # vulnerable to almost every XML attack ever created, and also exposes
+  # the runtime to atom DoS as an added bonus.
+  #
+  # So, regex it is. Eat dirt, deviantart. You don't deserve the respect
+  # artists give you.
+  def scrape(_uri, url) do
+    url
+    |> Philomena.Http.get!([], follow_redirect: true, max_redirect: 2)
+    |> extract_data!()
+    |> try_intermediary_hires!()
+    |> try_new_hires!()
+    |> try_old_hires!()
+  end
+
+  defp extract_data!(%HTTPoison.Response{body: body, status_code: 200}) do
+    [image] = Regex.run(@image_regex, body, capture: :all_but_first)
+    [source] = Regex.run(@source_regex, body, capture: :all_but_first)
+    [artist] = Regex.run(@artist_regex, source, capture: :all_but_first)
+
+    %{
+      source_url: source,
+      author_name: artist,
+      images: [
+        %{
+          url: image,
+          camo_url: Camo.Image.image_url(image)
+        }
+      ]
+    }
+  end
+
+  defp try_intermediary_hires!(%{images: [image]} = data) do
+    [domain, object_uuid, object_name] = Regex.run(@cdnint_regex, image.url, capture: :all_but_first)
+
+    built_url = "#{domain}/intermediary/f/#{object_uuid}/#{object_name}"
+
+    case Philomena.Http.head!(built_url) do
+      %HTTPoison.Response{status_code: 200} ->
+        # This is the high resolution URL.
+
+        %{
+          data |
+          images: [
+            %{
+              url: built_url,
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      _ ->
+        # Nothing to be found here, move along...
+        data
+    end
+  end
+
+  defp try_new_hires!(%{images: [image]} = data) do
+    cond do
+      String.match?(image.url, @png_regex) ->
+        %{
+          data |
+          images: [
+            %{
+              url: String.replace(image.url, @png_regex, "\\1.png\\3"),
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      String.match?(image.url, @jpg_regex) ->
+        %{
+          data |
+          images: [
+            %{
+              url: String.replace(image.url, @jpg_regex, "\\g{1}100\\3"),
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      true ->
+        # Nothing to be found here, move along...
+        data
+    end
+  end
+
+  defp try_old_hires!(%{source_url: source, images: [image]} = data) do
+    [serial] = Regex.run(@serial_regex, source, capture: :all_but_first)
+    base36 =
+      serial
+      |> String.to_integer()
+      |> Integer.to_string(36)
+      |> String.downcase()
+
+    built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"
+
+    case Philomena.Http.get!(built_url) do
+      %HTTPoison.Response{status_code: 301, headers: headers} ->
+        # Location header provides URL of high res image.
+        {_location, link} = Enum.find(headers, fn {header, _val} -> header == "Location" end)
+
+        %{
+          data |
+          images: [
+            %{
+              url: link,
+              camo_url: image.camo_url
+            }
+          ]
+        }
+
+      _ ->
+        # Nothing to be found here, move along...
+        data
+    end
+  end
+end
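
try_old_hires! leans on DeviantArt's legacy URL scheme, where a deviation's decimal serial number appears base36-encoded after the "-d" suffix. The conversion is plain integer arithmetic; for an illustrative serial of 12345678:

iex> "12345678" |> String.to_integer() |> Integer.to_string(36) |> String.downcase()
"7clzi"

which yields a built URL ending in "-d7clzi.png".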

lib/philomena/scrapers/raw.ex (new file, +30)
@@ -0,0 +1,30 @@
+defmodule Philomena.Scrapers.Raw do
+  @mime_types ["image/gif", "image/jpeg", "image/png", "image/svg", "image/svg+xml", "video/webm"]
+
+  @spec can_handle?(URI.t(), String.t()) :: true | false
+  def can_handle?(_uri, url) do
+    Philomena.Http.head!(url, [], max_body_length: 30_000_000)
+    |> case do
+      %HTTPoison.Response{status_code: 200, headers: headers} ->
+        headers
+        |> Enum.any?(fn {k, v} ->
+          String.downcase(k) == "content-type" and String.downcase(v) in @mime_types
+        end)
+
+      _ ->
+        false
+    end
+  end
+
+  def scrape(_uri, url) do
+    %{
+      source_url: url,
+      images: [
+        %{
+          url: url,
+          camo_url: Camo.Image.image_url(url)
+        }
+      ]
+    }
+  end
+end
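
The can_handle?/2 probe accepts a URL only when its HEAD response carries a whitelisted content-type header; the scan is case-insensitive on both header name and value. This self-contained check mirrors it (sample headers illustrative):

iex> headers = [{"Server", "nginx"}, {"Content-Type", "image/PNG"}]
iex> Enum.any?(headers, fn {k, v} ->
...>   String.downcase(k) == "content-type" and String.downcase(v) in ["image/png"]
...> end)
true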

lib/philomena/scrapers/tumblr.ex (new file, +97)
@@ -0,0 +1,97 @@
+defmodule Philomena.Scrapers.Tumblr do
+  @url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
+  @inline_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com\/[a-f\d]+\/tumblr(?:_inline)?_[a-z\d]+_\d+\.(?:png\|jpe?g\|gif)|i
+  @size_regex ~r|_(\d+)(\..+)\z|
+  @sizes [1280, 540, 500, 400, 250, 100, 75]
+  @tumblr_ranges [
+    InetCidr.parse("66.6.32.0/23"),
+    InetCidr.parse("66.6.44.0/24")
+  ]
+
+  @spec can_handle?(URI.t(), String.t()) :: true | false
+  def can_handle?(uri, url) do
+    String.match?(url, @url_regex) and tumblr_domain?(uri.host)
+  end
+
+  def scrape(uri, url) do
+    [post_id] = Regex.run(@url_regex, url, capture: :all_but_first)
+
+    api_url = "https://api.tumblr.com/v2/blog/#{uri.host}/posts/photo?id=#{post_id}&api_key=#{tumblr_api_key()}"
+
+    Philomena.Http.get!(api_url)
+    |> json!()
+    |> process_response!()
+  end
+
+  defp json!(%HTTPoison.Response{body: body, status_code: 200}),
+    do: Jason.decode!(body)
+
+  defp process_response!(%{"response" => %{"posts" => [post | _rest]}}),
+    do: process_post!(post)
+
+  defp process_post!(%{"type" => "photo"} = post) do
+    images =
+      post["photos"]
+      |> Enum.map(fn photo ->
+        image = upsize(photo["original_size"]["url"])
+
+        %{"url" => preview} =
+          Enum.find(photo["alt_sizes"], & &1["width"] == 400) || %{"url" => image}
+
+        %{url: image, camo_url: Camo.Image.image_url(preview)}
+      end)
+
+    add_meta(post, images)
+  end
+
+  defp process_post!(%{"type" => "text"} = post) do
+    images =
+      @inline_regex
+      |> Regex.scan(post["text"])
+      |> Enum.map(fn [url] ->
+        %{url: upsize(url), camo_url: Camo.Image.image_url(url)}
+      end)
+
+    add_meta(post, images)
+  end
+
+  defp upsize(image_url) do
+    @sizes
+    |> Enum.map(&String.replace(image_url, @size_regex, "_#{&1}\\2"))
+    |> Enum.find(&url_ok?/1)
+  end
+
+  defp url_ok?(url) do
+    match?(%HTTPoison.Response{status_code: 200}, Philomena.Http.head!(url))
+  end
+
+  defp add_meta(post, images) do
+    source = post["post_url"]
+    author = post["blog_name"]
+    description = post["summary"]
+
+    %{
+      source_url: source,
+      author_name: author,
+      description: description,
+      images: images
+    }
+  end
+
+  defp tumblr_domain?(host) do
+    host
+    |> String.to_charlist()
+    |> :inet_res.lookup(:in, :a)
+    |> case do
+      [address | _rest] ->
+        Enum.any?(@tumblr_ranges, &InetCidr.contains?(&1, address))
+
+      _ ->
+        false
+    end
+  end
+
+  defp tumblr_api_key do
+    Application.get_env(:philomena, :tumblr_api_key)
+  end
+end
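
upsize/1 rewrites the size suffix of a media URL for every known size, largest first, and keeps the first candidate whose HEAD request answers 200. The substitution itself is a single regex replace (sample URL illustrative):

iex> url = "https://66.media.tumblr.com/abcd/tumblr_xyz_500.png"
iex> String.replace(url, ~r|_(\d+)(\..+)\z|, "_1280\\2")
"https://66.media.tumblr.com/abcd/tumblr_xyz_1280.png"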

lib/philomena/scrapers/twitter.ex (new file, +66)
@@ -0,0 +1,66 @@
+defmodule Philomena.Scrapers.Twitter do
+  @gt_regex ~r|document.cookie = decodeURIComponent\("gt=(\d+);|
+  @url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?|
+  @script_regex ~r|<script type="text/javascript" .*? src="(https://abs.twimg.com/responsive-web/web/main\.[\da-z]+\.js)">|
+  @bearer_regex ~r|"(AAAAAAAAAAAAA[^"]*)"|
+  @user_agent ["User-Agent": "Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"]
+
+  @spec can_handle?(URI.t(), String.t()) :: true | false
+  def can_handle?(_uri, url) do
+    String.match?(url, @url_regex)
+  end
+
+  def scrape(_uri, url) do
+    api_response!(url)
+    |> extract_data()
+  end
+
+  defp extract_data(tweet) do
+    images =
+      tweet["entities"]["media"]
+      |> Enum.map(&%{url: &1["media_url_https"], camo_url: Camo.Image.image_url(&1["media_url_https"])})
+
+    %{
+      source_url: tweet["url"],
+      author_name: tweet["user"],
+      description: tweet["text"],
+      images: images
+    }
+  end
+
+  # We'd like to use the API anonymously. In order to do this, we need to
+  # extract the anonymous bearer token. Fortunately, this is pretty easy
+  # to identify in the minified mobile script source.
+  def api_response!(url) do
+    [user, status_id] = Regex.run(@url_regex, url, capture: :all_but_first)
+
+    mobile_url = "https://mobile.twitter.com/#{user}/status/#{status_id}"
+    api_url = "https://api.twitter.com/2/timeline/conversation/#{status_id}.json"
+    url = "https://twitter.com/#{user}/status/#{status_id}"
+
+    {gt, bearer} =
+      Philomena.Http.get!(mobile_url, @user_agent)
+      |> Map.get(:body)
+      |> extract_guest_token_and_bearer()
+
+    Philomena.Http.get!(api_url, ["Authorization": "Bearer #{bearer}", "x-guest-token": gt])
+    |> Map.get(:body)
+    |> Jason.decode!()
+    |> Map.get("globalObjects")
+    |> Map.get("tweets")
+    |> Map.get(status_id)
+    |> Map.put("user", user)
+    |> Map.put("url", url)
+  end
+
+  defp extract_guest_token_and_bearer(page) do
+    [gt] = Regex.run(@gt_regex, page, capture: :all_but_first)
+    [script] = Regex.run(@script_regex, page, capture: :all_but_first)
+
+    %{body: body} = Philomena.Http.get!(script)
+
+    [bearer] = Regex.run(@bearer_regex, body, capture: :all_but_first)
+
+    {gt, bearer}
+  end
+end
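
Anonymous API access takes two preparatory requests: the mobile page yields a guest token, and the main.*.js bundle it references yields the static bearer token. The guest-token step reduces to one Regex.run against the page source (sample input illustrative):

iex> page = ~s|document.cookie = decodeURIComponent("gt=1234567890;|
iex> Regex.run(~r|document.cookie = decodeURIComponent\("gt=(\d+);|, page, capture: :all_but_first)
["1234567890"]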

@@ -120,7 +120,7 @@ defmodule PhilomenaWeb.Router do
     # get "/:forum_id", ForumController, :show # impossible to do without constraints
     get "/:forum_id/:id", TopicController, :show
     get "/:forum_id/:id/:page", TopicController, :show
-    get "/:forum_id/:id/posts/:post_id", TopicController, :show
+    get "/:forum_id/:id/post/:post_id", TopicController, :show
   end
 
   # Other scopes may use custom stacks.

@@ -21,7 +21,11 @@ defmodule PhilomenaWeb.UserAttributionView do
   end
 
   def anonymous_avatar(_object, class \\ "avatar--100px") do
-    img_tag(Routes.static_path(PhilomenaWeb.Endpoint, "/images/no_avatar.svg"), class: class)
+    class = Enum.join(["image-constrained", class], " ")
+
+    content_tag :div, [class: class] do
+      img_tag(Routes.static_path(PhilomenaWeb.Endpoint, "/images/no_avatar.svg"))
+    end
   end
 
   def user_avatar(object, class \\ "avatar--100px")
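
The anonymous avatar is now wrapped in a sizing div rather than having the class applied to the img tag itself. Rendered, the new markup comes out roughly as follows (a sketch; attribute order and any cache-digested asset path may differ):

iex> PhilomenaWeb.UserAttributionView.anonymous_avatar(nil) |> Phoenix.HTML.safe_to_string()
"<div class=\"image-constrained avatar--100px\"><img src=\"/images/no_avatar.svg\"></div>"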

@@ -54,7 +58,7 @@ defmodule PhilomenaWeb.UserAttributionView do
     do: [{"label--danger", "Site Administrator"} | labels]
   defp staff_role(labels, %{hide_default_role: false, role: "moderator"}),
     do: [{"label--success", "Site Moderator"} | labels]
-  defp staff_role(labels, %{hide_default_role: false, role: "assisant"}),
+  defp staff_role(labels, %{hide_default_role: false, role: "assistant"}),
     do: [{"label--purple", "Site Assistant"} | labels]
   defp staff_role(labels, _user),
     do: labels