From f2eec71ec5debc0cb3c6ffbbd0a336ebd4e73fa4 Mon Sep 17 00:00:00 2001
From: Chaska <166928710+chaskayote@users.noreply.github.com>
Date: Tue, 23 Apr 2024 01:08:00 -0500
Subject: [PATCH] Trying to add Pixiv scraper

---
 lib/philomena/scrapers.ex       |  1 +
 lib/philomena/scrapers/pixiv.ex | 35 +++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 lib/philomena/scrapers/pixiv.ex

diff --git a/lib/philomena/scrapers.ex b/lib/philomena/scrapers.ex
index d0ac22ae..fe8657f2 100644
--- a/lib/philomena/scrapers.ex
+++ b/lib/philomena/scrapers.ex
@@ -7,6 +7,7 @@ defmodule Philomena.Scrapers do
     Philomena.Scrapers.Inkbunny,
     Philomena.Scrapers.E621,
     Philomena.Scrapers.Furaffinity,
+    Philomena.Scrapers.Pixiv,
     Philomena.Scrapers.Raw
   ]
 
diff --git a/lib/philomena/scrapers/pixiv.ex b/lib/philomena/scrapers/pixiv.ex
new file mode 100644
index 00000000..fbb2e24c
--- /dev/null
+++ b/lib/philomena/scrapers/pixiv.ex
@@ -0,0 +1,68 @@
+defmodule Philomena.Scrapers.Pixiv do
+  @moduledoc """
+  Scraper for Pixiv artwork pages.
+
+  Recognises artwork URLs on `pixiv.net`, fetches the submission details from
+  Pixiv's touch AJAX endpoint and maps them onto the scraper result map.
+  """
+
+  # Accepts both the bare and the `www.` host, with or without the `/en/`
+  # language segment. The only capture group is the numeric artwork id.
+  @url_regex ~r|\Ahttps?://(?:www\.)?pixiv\.net/(?:en/)?artworks/([0-9]+)|
+
+  @doc """
+  Returns `true` when `url` is a Pixiv artwork URL this scraper understands.
+  """
+  @spec can_handle?(URI.t(), String.t()) :: boolean()
+  def can_handle?(_uri, url) do
+    String.match?(url, @url_regex)
+  end
+
+  @doc """
+  Fetches the submission behind `url` and returns its scrape result.
+
+  The result is a map with `:source_url`, `:author_name`, `:description` and a
+  single-element `:images` list (`:url` and `:camo_url`).
+
+  Raises `MatchError` when `url` is not a Pixiv artwork URL or when the HTTP
+  request does not come back as a 200 response, and `Jason.DecodeError` when
+  the response body is not valid JSON.
+  """
+  @spec scrape(URI.t(), String.t()) :: map()
+  def scrape(_uri, url) do
+    [submission_id] = Regex.run(@url_regex, url, capture: :all_but_first)
+    api_url = "https://www.pixiv.net/touch/ajax/illust/details?illust_id=#{submission_id}"
+    {:ok, %Tesla.Env{status: 200, body: body}} = Philomena.Http.get(api_url)
+
+    submission = Jason.decode!(body)
+    # NOTE(review): `manga_a` is read as a map with `url`/`url_big` keys; confirm
+    # that shape against a live response from the endpoint.
+    illust = submission["illust_details"]
+    image = illust["manga_a"]
+
+    %{
+      source_url: url,
+      author_name: submission["author_details"]["user_account"],
+      description: clean_description(illust["comment"]),
+      images: [
+        %{
+          url: "#{image["url_big"]}",
+          camo_url: Camo.Image.image_url(image["url"])
+        }
+      ]
+    }
+  end
+
+  # Strips HTML tags from the comment, collapses runs of spaces, drops a space
+  # that follows a newline, and trims the result. A missing comment becomes "".
+  defp clean_description(nil), do: ""
+
+  defp clean_description(comment) do
+    comment
+    |> HtmlSanitizeEx.strip_tags()
+    |> String.replace(~r/  +/, " ")
+    |> String.replace(~r/\n \n +/, "\n")
+    |> String.replace(~r/\n /, "\n")
+    |> String.trim()
+  end
+end