2024-05-25 03:15:05 +02:00
|
|
|
defmodule PhilomenaProxy.Scrapers.Tumblr do
|
|
|
|
@moduledoc false
|
|
|
|
|
|
|
|
alias PhilomenaProxy.Scrapers.Scraper
|
|
|
|
alias PhilomenaProxy.Scrapers
|
|
|
|
|
|
|
|
@behaviour Scraper
|
|
|
|
|
2019-11-28 18:12:10 +01:00
|
|
|
# Matches an http(s) URL whose path contains `/image/<digits>` or
# `/post/<digits>`, where the digits are followed by end-of-string or one of
# `/`, `?`, `#`. The single capture group is the numeric id.
@url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
|
2020-08-28 19:51:26 +02:00
|
|
|
# Matches (case-insensitively) image URLs on `media.tumblr.com`, optionally
# prefixed by a numeric subdomain, shaped like
# `/<hex>/<hex>-<hex>/s<digits>x<digits>/<hex>.<png|jpg|jpeg|gif>`.
# It is scanned over the body of text posts to collect embedded images.
@media_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com/[a-f\d]+/[a-f\d]+-[a-f\d]+/s\d+x\d+/[a-f\d]+\.(?:png\|jpe?g\|gif)|i
|
2019-11-28 18:12:10 +01:00
|
|
|
# Matches a trailing `_<digits>` size suffix at the end of a URL.
# Group 1 is the digits; group 2 is the dot and everything after it, which
# upsize/1 re-inserts unchanged via the `\\2` backreference.
@size_regex ~r|_(\d+)(\..+)\z|
|
|
|
|
# Size suffixes substituted into image URLs by upsize/1, in the order they are
# probed (largest first). NOTE(review): presumably pixel widths — confirm.
@sizes [1280, 540, 500, 400, 250, 100, 75]
|
|
|
|
# CIDR blocks checked by tumblr_domain?/1. The strings are parsed once, when
# the module is compiled, so lookups only pay for the containment test.
@tumblr_ranges Enum.map(
                 [
                   "66.6.32.0/24",
                   "66.6.33.0/24",
                   "66.6.44.0/24",
                   "74.114.152.0/24",
                   "74.114.153.0/24",
                   "74.114.154.0/24",
                   "74.114.155.0/24"
                 ],
                 &InetCidr.parse_cidr!/1
               )
|
|
|
|
|
|
|
|
@doc """
Reports whether this scraper should handle the given URL.

Returns `true` only when `url` matches the post/image permalink pattern and
the first A record found for `uri.host` lies inside one of the known Tumblr
address ranges. The DNS lookup is skipped when the URL pattern does not match.
"""
@spec can_handle?(URI.t(), String.t()) :: boolean()
def can_handle?(uri, url) do
  Regex.match?(@url_regex, url) and tumblr_domain?(uri.host)
end
|
|
|
|
|
2024-05-25 03:15:05 +02:00
|
|
|
@doc """
Fetches the post referenced by `url` from the Tumblr v2 API and converts it
into a scrape result.

The post id is taken from `url`, and the blog is identified by `uri.host`.
Raises if `url` does not match the permalink pattern, if the API does not
answer with status 200, or if the response has an unexpected shape.
"""
@spec scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
def scrape(uri, url) do
  # Assertive match: a URL without a post id is a caller bug at this point.
  [post_id] = Regex.run(@url_regex, url, capture: :all_but_first)

  "https://api.tumblr.com/v2/blog/#{uri.host}/posts/photo?id=#{post_id}&api_key=#{tumblr_api_key()}"
  |> PhilomenaProxy.Http.get()
  |> json!()
  |> process_response!()
end
|
|
|
|
|
2020-09-10 05:12:54 +02:00
|
|
|
# Decodes the JSON body of a successful (status 200) HTTP response.
# Any other result has no matching clause and raises FunctionClauseError.
defp json!({:ok, %Tesla.Env{status: 200, body: raw_body}}) do
  Jason.decode!(raw_body)
end
|
|
|
|
|
|
|
|
# Extracts the first post from a decoded API response and processes it.
# A response without a non-empty "posts" list has no matching clause.
defp process_response!(%{"response" => %{"posts" => [first_post | _others]}}) do
  process_post!(first_post)
end
|
|
|
|
|
|
|
|
# Photo posts: one image entry per element of post["photos"].
# The full-size URL comes from upsize/1 applied to the photo's original size;
# the preview is the alt size whose width is 400, or the full-size URL when no
# such alt size exists. Only the preview is routed through Camo.
defp process_post!(%{"type" => "photo"} = post) do
  images =
    for photo <- post["photos"] do
      full_url = upsize(photo["original_size"]["url"])
      width_400 = Enum.find(photo["alt_sizes"], fn alt -> alt["width"] == 400 end)
      %{"url" => preview_url} = width_400 || %{"url" => full_url}

      %{url: full_url, camo_url: PhilomenaProxy.Camo.image_url(preview_url)}
    end

  add_meta(post, images)
end
|
|
|
|
|
|
|
|
# Text posts: every media URL found in post["body"] becomes an image entry,
# with the same URL used for both the image and its Camo preview.
defp process_post!(%{"type" => "text"} = post) do
  matches = Regex.scan(@media_regex, post["body"])

  images =
    Enum.map(matches, fn [media_url | _groups] ->
      %{url: media_url, camo_url: PhilomenaProxy.Camo.image_url(media_url)}
    end)

  add_meta(post, images)
end
|
|
|
|
|
|
|
|
# Picks the largest reachable rendition of an image URL.
#
# Each entry of @sizes replaces the `_<digits>` suffix matched by @size_regex,
# and the resulting candidates are probed in that order with url_ok?/1; the
# first one that answers 200 is returned.
#
# When the URL carries no size suffix, every substitution yields the same
# string, so the candidates are de-duplicated to avoid repeating an identical
# HEAD request. If no candidate is reachable, the original URL is returned
# rather than nil, so callers never build an image entry whose :url is nil.
defp upsize(image_url) do
  @sizes
  |> Enum.map(&String.replace(image_url, @size_regex, "_#{&1}\\2"))
  |> Enum.uniq()
  |> Enum.find(image_url, &url_ok?/1)
end
|
|
|
|
|
|
|
|
# True when a HEAD request to `url` succeeds with status 200; any other
# status or an error result counts as not OK.
defp url_ok?(url) do
  case PhilomenaProxy.Http.head(url) do
    {:ok, %Tesla.Env{status: 200}} -> true
    _otherwise -> false
  end
end
|
|
|
|
|
|
|
|
# Builds the scrape result map: source URL, author name and description are
# read from the post's "post_url", "blog_name" and "summary" keys (nil when
# absent), and `images` is passed through unchanged.
defp add_meta(post, images) do
  %{
    source_url: post["post_url"],
    author_name: post["blog_name"],
    description: post["summary"],
    images: images
  }
end
|
|
|
|
|
|
|
|
# Resolves `host` to its A records and reports whether the first address
# returned falls inside any of @tumblr_ranges. A host with no A records is
# not considered a Tumblr domain.
defp tumblr_domain?(host) do
  # :inet_res is an Erlang module and expects a charlist hostname.
  a_records = :inet_res.lookup(String.to_charlist(host), :in, :a)

  case a_records do
    [first_ip | _remaining] ->
      Enum.any?(@tumblr_ranges, fn range -> InetCidr.contains?(range, first_ip) end)

    _ ->
      false
  end
end
|
|
|
|
|
|
|
|
# Reads the Tumblr API key from the :philomena application environment at
# call time; returns nil when the key is not configured.
defp tumblr_api_key, do: Application.get_env(:philomena, :tumblr_api_key)
|
2020-01-11 05:20:19 +01:00
|
|
|
end
|