fix tumblr scrapes for text posts

This commit is contained in:
byte[] 2020-08-28 13:51:26 -04:00
parent 6a3d292983
commit 2347d018ca

View file

@@ -1,6 +1,6 @@
defmodule Philomena.Scrapers.Tumblr do
@url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
@inline_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com\/[a-f\d]+\/tumblr(?:_inline)?_[a-z\d]+_\d+\.(?:png\|jpe?g\|gif)|i
@media_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com/[a-f\d]+/[a-f\d]+-[a-f\d]+/s\d+x\d+/[a-f\d]+\.(?:png\|jpe?g\|gif)|i
@size_regex ~r|_(\d+)(\..+)\z|
@sizes [1280, 540, 500, 400, 250, 100, 75]
@tumblr_ranges [
@@ -54,10 +54,10 @@ defmodule Philomena.Scrapers.Tumblr do
defp process_post!(%{"type" => "text"} = post) do
images =
@inline_regex
|> Regex.scan(post["text"])
|> Enum.map(fn url ->
%{url: upsize(url), camo_url: Camo.Image.image_url(url)}
@media_regex
|> Regex.scan(post["body"])
|> Enum.map(fn [url | _captures] ->
%{url: url, camo_url: Camo.Image.image_url(url)}
end)
add_meta(post, images)