Added a scraper for CivitAI posts. Only supports post URLs (not single images) due to API limitations. The API also does not provide the "description" visible on the site itself. Meta information like model, prompt, and seed is not used, as we do not (yet) have a way to store or show this neatly.
74 lines
2 KiB
Elixir
defmodule PhilomenaProxy.Scrapers do
  @moduledoc """
  Scrape utilities to facilitate uploading media from other websites.
  """

  @typedoc "The URL to fetch, as a string."
  @type url :: String.t()

  @typedoc "An individual image in a list associated with a scrape result."
  @type image_result :: %{
          url: url(),
          camo_url: url()
        }

  @typedoc "Result of a successful scrape."
  @type scrape_result :: %{
          source_url: url(),
          description: String.t() | nil,
          author_name: String.t() | nil,
          images: [image_result()]
        }

  @scrapers [
    PhilomenaProxy.Scrapers.Bluesky,
    PhilomenaProxy.Scrapers.Civitai,
    PhilomenaProxy.Scrapers.Deviantart,
    PhilomenaProxy.Scrapers.Pillowfort,
    PhilomenaProxy.Scrapers.Twitter,
    PhilomenaProxy.Scrapers.Tumblr,
    PhilomenaProxy.Scrapers.Raw
  ]

  @doc """
  Scrape a URL for content.

  The scrape result is intended for serialization to JSON.

  ## Examples

      iex> PhilomenaProxy.Scrapers.scrape!("http://example.org/image-page")
      %{
        source_url: "http://example.org/image-page",
        description: "Test",
        author_name: "myself",
        images: [
          %{
            url: "http://example.org/image.png",
            camo_url: "http://example.net/UT2YIjkWDas6CQBmQcYlcNGmKfQ/aHR0cDovL2V4YW1wbGUub3JnL2ltY"
          }
        ]
      }

      iex> PhilomenaProxy.Scrapers.scrape!("http://example.org/nonexistent-path")
      nil

  """
  @spec scrape!(url()) :: scrape_result() | nil
  def scrape!(url) do
    uri = URI.parse(url)

    cond do
      is_nil(uri.host) ->
        # Scraping without a hostname doesn't make sense because the proxy cannot fetch it, and
        # some scrapers may test properties of the hostname.
        nil

      true ->
        # Find the first scraper which can handle the URL and process it, or return nil.
        Enum.find_value(@scrapers, nil, fn scraper ->
          scraper.can_handle?(uri, url) && scraper.scrape(uri, url)
        end)
    end
  end
end
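Each module in the @scrapers list above is expected to implement the two callbacks that scrape!/1 invokes: can_handle?/2 and scrape/2. The behaviour definition is not part of this file, so the following is a minimal sketch of what a scraper along the lines of the new Civitai module might look like. The host check, path pattern, and the fetch_post_json/1 helper are illustrative assumptions, not the actual implementation.

# Minimal sketch of an individual scraper module, inferred from the calling
# convention in scrape!/1 above. The host check, path pattern, and the
# fetch_post_json/1 helper are hypothetical stand-ins for illustration only.
defmodule PhilomenaProxy.Scrapers.ExampleCivitai do
  @spec can_handle?(URI.t(), String.t()) :: boolean()
  def can_handle?(uri, _url) do
    # Per the commit message, only post URLs are supported, so claim only
    # paths that look like posts on the civitai.com host.
    uri.host == "civitai.com" and String.starts_with?(uri.path || "", "/posts/")
  end

  @spec scrape(URI.t(), String.t()) :: map()
  def scrape(_uri, url) do
    post = fetch_post_json(url)

    %{
      source_url: url,
      # The API does not expose the on-site description (see commit message).
      description: nil,
      author_name: post["username"],
      images:
        Enum.map(post["items"], fn item ->
          # A real scraper would route the image URL through the site's camo
          # proxy to produce camo_url; it is passed through unchanged here.
          %{url: item["url"], camo_url: item["url"]}
        end)
    }
  end

  # Hypothetical helper standing in for the HTTP request to the CivitAI API.
  defp fetch_post_json(_url) do
    %{"username" => nil, "items" => []}
  end
end

Registering such a module is then just a matter of adding it to the @scrapers list; scrape!/1 tries each entry in order and returns the first result from a scraper that claims the URL.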