Feature/main/246 scrape pillowfort (#139)

This commit is contained in:
Erhannis 2021-12-19 22:39:40 -05:00 committed by GitHub
parent faa9b0784a
commit 12ce4f82e1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 51 additions and 0 deletions

View file

@ -1,6 +1,7 @@
defmodule Philomena.Scrapers do
@scrapers [
Philomena.Scrapers.Deviantart,
Philomena.Scrapers.Pillowfort,
Philomena.Scrapers.Twitter,
Philomena.Scrapers.Tumblr,
Philomena.Scrapers.Raw

View file

@ -0,0 +1,50 @@
defmodule Philomena.Scrapers.Pillowfort do
  @moduledoc """
  Scraper for posts hosted on `www.pillowfort.social`.

  `can_handle?/2` recognises post URLs; `scrape/2` requests the post's
  `/json` endpoint and converts the decoded response into a map holding the
  source URL, author name, description and image list.
  """

  # Captures the numeric post id from a Pillowfort post URL.
  @url_regex ~r|\Ahttps?://www\.pillowfort\.social/posts/([0-9]+)|

  # The `s` (dotall) modifier lets `.` match newlines, so a tag whose
  # attributes are broken across several lines is stripped as well.
  @html_tag_regex ~r|<.+?>|s

  @doc """
  Returns `true` when `url` starts with a Pillowfort post URL
  (`http(s)://www.pillowfort.social/posts/<digits>`), `false` otherwise.

  The parsed `URI` argument is ignored.
  """
  @spec can_handle?(URI.t(), String.t()) :: boolean()
  def can_handle?(_uri, url) do
    String.match?(url, @url_regex)
  end

  @doc """
  Fetches the JSON for the post referenced by `url` and returns a map with the
  keys `:source_url`, `:author_name`, `:description` and `:images`.

  Each entry of `:images` is a map with `:url` and `:camo_url`.

  Raises `MatchError` when `url` does not match the Pillowfort post pattern,
  and `FunctionClauseError` when the HTTP request does not yield
  `{:ok, %Tesla.Env{status: 200}}`.
  """
  @spec scrape(URI.t(), String.t()) :: map()
  def scrape(_uri, url) do
    [post_id] = Regex.run(@url_regex, url, capture: :all_but_first)

    "https://www.pillowfort.social/posts/#{post_id}/json"
    |> Philomena.Http.get()
    |> json!()
    |> process_response!(url)
  end

  # Decodes the body of a successful (status 200) response. Any other result
  # intentionally has no matching clause and therefore raises.
  defp json!({:ok, %Tesla.Env{body: body, status: 200}}),
    do: Jason.decode!(body)

  # Builds the scraper result map from the decoded post JSON.
  defp process_response!(post_json, url) do
    # A missing or null "media" key is treated as "no images" instead of
    # crashing inside Enum.map/2.
    images =
      Enum.map(post_json["media"] || [], fn media ->
        %{
          url: media["url"],
          camo_url: Camo.Image.image_url(media["small_image_url"])
        }
      end)

    %{
      source_url: url,
      author_name: post_json["username"],
      # Title and content are each optional; whichever are present are joined
      # with a horizontal-rule style separator.
      description: Enum.join(title(post_json) ++ content(post_json), "\n\n---\n\n"),
      images: images
    }
  end

  # Returns the tag-stripped title as a one-element list, or [] when the post
  # has no usable title.
  defp title(%{"title" => title}) when title not in [nil, ""], do: [remove_html_tags(title)]
  defp title(_), do: []

  # Returns the tag-stripped content as a one-element list, or [] when the
  # post has no usable content.
  defp content(%{"content" => content}) when content not in [nil, ""],
    do: [remove_html_tags(content)]

  defp content(_), do: []

  defp remove_html_tags(text) do
    # The markup parser won't render these tags, so remove them
    String.replace(text, @html_tag_regex, "")
  end
end