From a08510f21efa7ec3a32082f70d4420b64b73c156 Mon Sep 17 00:00:00 2001
From: Yay295
Date: Mon, 28 Oct 2024 15:13:42 -0500
Subject: [PATCH] Add Bluesky scraper (#290)

* add bluesky scraper
* use com.atproto.sync.getBlob to get original bluesky image
* ignore data after bluesky post id
* fix json access and missing function
* fix bluesky fullsize image regex
* use string keys when reading decoded json
* accept a did in the profile url and posts without images
---
 lib/philomena_proxy/scrapers.ex         |  1 +
 lib/philomena_proxy/scrapers/bluesky.ex | 74 +++++++++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 lib/philomena_proxy/scrapers/bluesky.ex

diff --git a/lib/philomena_proxy/scrapers.ex b/lib/philomena_proxy/scrapers.ex
index a96f0817..08674d44 100644
--- a/lib/philomena_proxy/scrapers.ex
+++ b/lib/philomena_proxy/scrapers.ex
@@ -21,6 +21,7 @@ defmodule PhilomenaProxy.Scrapers do
   }
 
   @scrapers [
+    PhilomenaProxy.Scrapers.Bluesky,
     PhilomenaProxy.Scrapers.Deviantart,
     PhilomenaProxy.Scrapers.Pillowfort,
     PhilomenaProxy.Scrapers.Twitter,
diff --git a/lib/philomena_proxy/scrapers/bluesky.ex b/lib/philomena_proxy/scrapers/bluesky.ex
new file mode 100644
index 00000000..598d1470
--- /dev/null
+++ b/lib/philomena_proxy/scrapers/bluesky.ex
@@ -0,0 +1,74 @@
+defmodule PhilomenaProxy.Scrapers.Bluesky do
+  @moduledoc false
+
+  alias PhilomenaProxy.Scrapers.Scraper
+  alias PhilomenaProxy.Scrapers
+
+  @behaviour Scraper
+
+  # Captures the profile segment (handle or DID) and the post ID from a post URL.
+  @url_regex ~r|https://bsky\.app/profile/([^/]+)/post/([^/?#]+)|
+
+  # Captures the two path segments after "plain/" in a fullsize image URL; they are
+  # substituted as the did and cid parameters of the getBlob URL below.
+  @fullsize_image_regex ~r|.*/img/feed_fullsize/plain/([^/]+)/([^@]+).*|
+  @blob_image_url_pattern "https://bsky.social/xrpc/com.atproto.sync.getBlob/?did=\\1&cid=\\2"
+
+  @spec can_handle?(URI.t(), String.t()) :: boolean()
+  def can_handle?(_uri, url) do
+    String.match?(url, @url_regex)
+  end
+
+  @spec scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
+  def scrape(_uri, url) do
+    [handle, id] = Regex.run(@url_regex, url, capture: :all_but_first)
+
+    did = resolve_did(handle)
+
+    api_url_get_posts =
+      "https://public.api.bsky.app/xrpc/app.bsky.feed.getPosts?uris=at://#{did}/app.bsky.feed.post/#{id}"
+
+    # Jason decodes object keys as strings, so all lookups use string keys.
+    post_json =
+      api_url_get_posts
+      |> PhilomenaProxy.Http.get()
+      |> json!()
+      |> Map.fetch!("posts")
+      |> hd()
+
+    # A post without an image embed has no "images" list; report no images instead of crashing.
+    images = get_in(post_json, ["embed", "images"]) || []
+
+    %{
+      source_url: url,
+      author_name: post_json["author"]["handle"],
+      description: post_json["record"]["text"],
+      images: Enum.map(images, &image_entry/1)
+    }
+  end
+
+  # The profile segment may already be a DID, in which case no lookup is needed.
+  defp resolve_did("did:" <> _rest = did), do: did
+
+  defp resolve_did(handle) do
+    api_url_resolve_handle =
+      "https://public.api.bsky.app/xrpc/com.atproto.identity.resolveHandle?handle=#{handle}"
+
+    api_url_resolve_handle
+    |> PhilomenaProxy.Http.get()
+    |> json!()
+    |> Map.fetch!("did")
+  end
+
+  # Builds one image entry: the fullsize URL is rewritten to the getBlob endpoint and the
+  # thumbnail URL is passed through PhilomenaProxy.Camo.image_url/1.
+  defp image_entry(image) do
+    %{
+      url: String.replace(image["fullsize"], @fullsize_image_regex, @blob_image_url_pattern),
+      camo_url: PhilomenaProxy.Camo.image_url(image["thumb"])
+    }
+  end
+
+  # Accepts only a 200 response; any other result raises FunctionClauseError.
+  defp json!({:ok, %{body: body, status: 200}}), do: Jason.decode!(body)
+end