fix tumblr scrapes for text posts

2025-04-21 02:33:59 +02:00 · 2020-08-28 13:51:26 -04:00 · 2020-08-28 13:51:26 -04:00 · 2347d018ca
commit 2347d018ca
parent 6a3d292983
1 changed file with 5 additions and 5 deletions
--- a/lib/philomena/scrapers/tumblr.ex
+++ b/lib/philomena/scrapers/tumblr.ex
@@ -1,6 +1,6 @@
 defmodule Philomena.Scrapers.Tumblr do
  @url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
-  @inline_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com\/[a-f\d]+\/tumblr(?:_inline)?_[a-z\d]+_\d+\.(?:png\|jpe?g\|gif)|i
+  @media_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com/[a-f\d]+/[a-f\d]+-[a-f\d]+/s\d+x\d+/[a-f\d]+\.(?:png\|jpe?g\|gif)|i
  @size_regex ~r|_(\d+)(\..+)\z|
  @sizes [1280, 540, 500, 400, 250, 100, 75]
  @tumblr_ranges [
@@ -54,10 +54,10 @@ defmodule Philomena.Scrapers.Tumblr do

  defp process_post!(%{"type" => "text"} = post) do
    images =
-      @inline_regex
-      |> Regex.scan(post["text"])
-      |> Enum.map(fn url ->
-        %{url: upsize(url), camo_url: Camo.Image.image_url(url)}
+      @media_regex
+      |> Regex.scan(post["body"])
+      |> Enum.map(fn [url | _captures] ->
+        %{url: url, camo_url: Camo.Image.image_url(url)}
      end)

    add_meta(post, images)