philomena/lib/philomena_proxy/scrapers.ex
tantabus-oneiros 1a90f8cdc4 Add CivitAI post scraper
Added a scraper for CivitAI posts. Only supports post URLs (not single images) due to API limitations. The API also does not provide the "description" visible on the site itself. Meta information such as model, prompt, and seed is not used, as we do not (yet) have a way to store or show it neatly.
2025-03-14 21:26:25 +01:00


defmodule PhilomenaProxy.Scrapers do
  @moduledoc """
  Scrape utilities to facilitate uploading media from other websites.
  """

  @typedoc "The URL to fetch, as a string."
  @type url :: String.t()

  @typedoc "An individual image in a list associated with a scrape result."
  @type image_result :: %{
          url: url(),
          camo_url: url()
        }

  @typedoc "Result of a successful scrape."
  @type scrape_result :: %{
          source_url: url(),
          description: String.t() | nil,
          author_name: String.t() | nil,
          images: [image_result()]
        }
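
  # Scrapers are tried in list order: the first module whose can_handle?/2
  # returns a truthy value handles the URL, so the generic Raw scraper is
  # listed last as the fallback.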
  @scrapers [
    PhilomenaProxy.Scrapers.Bluesky,
    PhilomenaProxy.Scrapers.Civitai,
    PhilomenaProxy.Scrapers.Deviantart,
    PhilomenaProxy.Scrapers.Pillowfort,
    PhilomenaProxy.Scrapers.Twitter,
    PhilomenaProxy.Scrapers.Tumblr,
    PhilomenaProxy.Scrapers.Raw
  ]
  @doc """
  Scrape a URL for content.

  The scrape result is intended for serialization to JSON.

  ## Examples

      iex> PhilomenaProxy.Scrapers.scrape!("http://example.org/image-page")
      %{
        source_url: "http://example.org/image-page",
        description: "Test",
        author_name: "myself",
        images: [
          %{
            url: "http://example.org/image.png",
            camo_url: "http://example.net/UT2YIjkWDas6CQBmQcYlcNGmKfQ/aHR0cDovL2V4YW1wbGUub3JnL2ltY"
          }
        ]
      }

      iex> PhilomenaProxy.Scrapers.scrape!("http://example.org/nonexistent-path")
      nil

  """
  @spec scrape!(url()) :: scrape_result() | nil
  def scrape!(url) do
    uri = URI.parse(url)

    cond do
      is_nil(uri.host) ->
        # Scraping without a hostname doesn't make sense because the proxy cannot
        # fetch it, and some scrapers may test properties of the hostname.
        nil

      true ->
        # Find the first scraper which can handle the URL and process it, or return nil.
        Enum.find_value(@scrapers, nil, fn scraper ->
          scraper.can_handle?(uri, url) && scraper.scrape(uri, url)
        end)
    end
  end
end
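
# Each module listed in @scrapers is expected to export the two functions
# invoked from scrape!/1: can_handle?(uri, url), returning a truthy value
# when the module recognizes the URL, and scrape(uri, url), returning a
# scrape_result(). Below is a minimal sketch of such a module, with a
# hypothetical host check and placeholder result values; a real scraper
# would call the site's API or fetch the page, and route image URLs
# through the camo proxy to produce camo_url.

defmodule PhilomenaProxy.Scrapers.Example do
  @spec can_handle?(URI.t(), PhilomenaProxy.Scrapers.url()) :: boolean()
  def can_handle?(uri, _url), do: uri.host == "example.org"

  @spec scrape(URI.t(), PhilomenaProxy.Scrapers.url()) ::
          PhilomenaProxy.Scrapers.scrape_result()
  def scrape(_uri, url) do
    %{
      source_url: url,
      description: nil,
      author_name: nil,
      images: [
        # Placeholder values: a real scraper extracts the media URL from the
        # page or API response and generates a proxied camo_url for it.
        %{url: url, camo_url: url}
      ]
    }
  end
end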