From f2eec71ec5debc0cb3c6ffbbd0a336ebd4e73fa4 Mon Sep 17 00:00:00 2001
From: Chaska <166928710+chaskayote@users.noreply.github.com>
Date: Tue, 23 Apr 2024 01:08:00 -0500
Subject: [PATCH] Trying to add Pixiv scraper

---
 lib/philomena/scrapers.ex       |  1 +
 lib/philomena/scrapers/pixiv.ex | 68 +++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 lib/philomena/scrapers/pixiv.ex

diff --git a/lib/philomena/scrapers.ex b/lib/philomena/scrapers.ex
index d0ac22ae..fe8657f2 100644
--- a/lib/philomena/scrapers.ex
+++ b/lib/philomena/scrapers.ex
@@ -7,6 +7,7 @@ defmodule Philomena.Scrapers do
     Philomena.Scrapers.Inkbunny,
     Philomena.Scrapers.E621,
     Philomena.Scrapers.Furaffinity,
+    Philomena.Scrapers.Pixiv,
     Philomena.Scrapers.Raw
   ]
 
diff --git a/lib/philomena/scrapers/pixiv.ex b/lib/philomena/scrapers/pixiv.ex
new file mode 100644
index 00000000..fbb2e24c
--- /dev/null
+++ b/lib/philomena/scrapers/pixiv.ex
@@ -0,0 +1,68 @@
+defmodule Philomena.Scrapers.Pixiv do
+  @moduledoc """
+  Scraper for Pixiv artwork pages.
+
+  Recognises artwork URLs, fetches the submission from Pixiv's
+  `touch/ajax/illust/details` endpoint and builds the scrape result map.
+  """
+
+  # Accept the `www.` host (the one the details endpoint below is served
+  # from) and URLs without the `/en/` locale segment; requiring the bare
+  # domain plus `/en/` rejected otherwise valid artwork links.
+  @url_regex ~r|\Ahttps?://(?:www\.)?pixiv\.net/(?:en/)?artworks/([0-9]+)|
+
+  @doc """
+  Returns `true` when `url` is a Pixiv artwork URL this scraper understands.
+  """
+  @spec can_handle?(URI.t(), String.t()) :: boolean()
+  def can_handle?(_uri, url) do
+    String.match?(url, @url_regex)
+  end
+
+  @doc """
+  Fetches the submission referenced by `url` and returns its scrape data.
+
+  The result is a map with `:source_url`, `:author_name`, `:description` and
+  a one-element `:images` list. Raises a `MatchError` when `url` is not a
+  Pixiv artwork URL or the details request does not come back as
+  `{:ok, %Tesla.Env{status: 200}}`.
+  """
+  @spec scrape(URI.t(), String.t()) :: map()
+  def scrape(_uri, url) do
+    [_, submission_id] = Regex.run(@url_regex, url, capture: :all)
+    api_url = "https://www.pixiv.net/touch/ajax/illust/details?illust_id=#{submission_id}"
+    {:ok, %Tesla.Env{status: 200, body: body}} = Philomena.Http.get(api_url)
+
+    # NOTE(review): this reads `illust_details` / `author_details` from the
+    # top level of the response and treats `manga_a` as a single map; the
+    # payload shape is not visible here, so confirm against a live response.
+    submission = Jason.decode!(body)
+    illust = submission["illust_details"]
+
+    %{
+      source_url: url,
+      author_name: submission["author_details"]["user_account"],
+      description: clean_description(illust["comment"]),
+      images: [
+        %{
+          url: "#{illust["manga_a"]["url_big"]}",
+          camo_url: Camo.Image.image_url(illust["manga_a"]["url"])
+        }
+      ]
+    }
+  end
+
+  # Strips HTML from the submission comment and tidies the leftover
+  # whitespace. A missing comment becomes an empty description rather than
+  # being handed to the sanitizer as `nil`.
+  defp clean_description(nil), do: ""
+
+  defp clean_description(comment) do
+    comment
+    |> HtmlSanitizeEx.strip_tags()
+    |> String.replace(~r/ +/, " ")
+    |> String.replace(~r/\n \n +/, "\n")
+    |> String.replace(~r/\n /, "\n")
+    |> String.trim()
+  end
+end