Merge pull request #268 from philomena-dev/proxy-namespace

Split out HTTP client interaction into PhilomenaProxy namespace
liamwhite 2024-06-02 19:33:45 -04:00 committed by GitHub
commit 5729cac98b
21 changed files with 294 additions and 113 deletions

View file

@@ -1,8 +0,0 @@
defmodule Camo.Image do
@doc """
Convert a potentially untrusted external image URL into a trusted one
loaded through a gocamo proxy (specified by the environment).
"""
@spec image_url(String.t()) :: String.t()
def image_url(input), do: Philomena.Native.camo_image_url(input)
end

View file

@@ -1,7 +1,7 @@
defmodule Philomena.ArtistLinks.AutomaticVerifier do
def check_link(artist_link, recheck_time) do
artist_link.uri
|> Philomena.Http.get()
|> PhilomenaProxy.Http.get()
|> contains_verification_code?(artist_link.verification_code)
|> case do
true ->

View file

@@ -4,7 +4,7 @@ defmodule Philomena.Channels.PicartoChannel do
@spec live_channels(DateTime.t()) :: map()
def live_channels(now) do
@api_online
|> Philomena.Http.get()
|> PhilomenaProxy.Http.get()
|> case do
{:ok, %Tesla.Env{body: body, status: 200}} ->
body

View file

@@ -4,7 +4,7 @@ defmodule Philomena.Channels.PiczelChannel do
@spec live_channels(DateTime.t()) :: map()
def live_channels(now) do
@api_online
|> Philomena.Http.get()
|> PhilomenaProxy.Http.get()
|> case do
{:ok, %Tesla.Env{body: body, status: 200}} ->
body

View file

@@ -1,46 +0,0 @@
defmodule Philomena.Http do
def get(url, headers \\ [], options \\ []) do
Tesla.get(client(headers), url, opts: [adapter: adapter_opts(options)])
end
def head(url, headers \\ [], options \\ []) do
Tesla.head(client(headers), url, opts: [adapter: adapter_opts(options)])
end
def post(url, body, headers \\ [], options \\ []) do
Tesla.post(client(headers), url, body, opts: [adapter: adapter_opts(options)])
end
defp adapter_opts(opts) do
opts = Keyword.merge(opts, max_body: 125_000_000, inet6: true)
case Application.get_env(:philomena, :proxy_host) do
nil ->
opts
url ->
Keyword.merge(opts, proxy: proxy_opts(URI.parse(url)))
end
end
defp proxy_opts(%{host: host, port: port, scheme: "https"}),
do: {:https, host, port, [transport_opts: [inet6: true]]}
defp proxy_opts(%{host: host, port: port, scheme: "http"}),
do: {:http, host, port, [transport_opts: [inet6: true]]}
defp client(headers) do
Tesla.client(
[
{Tesla.Middleware.FollowRedirects, max_redirects: 1},
{Tesla.Middleware.Headers,
[
{"User-Agent",
"Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0"}
| headers
]}
],
Tesla.Adapter.Mint
)
end
end

View file

@@ -1,25 +0,0 @@
defmodule Philomena.Scrapers do
@scrapers [
Philomena.Scrapers.Deviantart,
Philomena.Scrapers.Pillowfort,
Philomena.Scrapers.Twitter,
Philomena.Scrapers.Tumblr,
Philomena.Scrapers.Raw
]
def scrape!(url) do
uri = URI.parse(url)
@scrapers
|> Enum.find(& &1.can_handle?(uri, url))
|> wrap()
|> Enum.map(& &1.scrape(uri, url))
|> unwrap()
end
defp wrap(nil), do: []
defp wrap(res), do: [res]
defp unwrap([result]), do: result
defp unwrap(_result), do: nil
end

View file

@@ -0,0 +1,24 @@
defmodule PhilomenaProxy.Camo do
@moduledoc """
Image proxying utilities.
"""
@doc """
Convert a potentially untrusted external image URL into a trusted one
loaded through a gocamo proxy (specified by the environment).
Configuration is read from environment variables at runtime by Philomena.
config :philomena,
camo_host: System.get_env("CAMO_HOST"),
camo_key: System.get_env("CAMO_KEY"),
## Example
iex> PhilomenaProxy.Camo.image_url("https://example.org/img/view/2024/1/1/1.png")
"https://example.net/L5MqSmYq1ZEqiBGGvsvSDpILyJI/aHR0cHM6Ly9leGFtcGxlLm9yZy9pbWcvdmlldy8yMDI0LzEvMS8xLnBuZwo"
"""
@spec image_url(String.t()) :: String.t()
def image_url(input), do: Philomena.Native.camo_image_url(input)
end
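
For reference, a gocamo-style proxy URL is conventionally built by HMAC-signing the upstream URL with the shared key and appending an encoded copy of the URL itself. The following is a hypothetical pure-Elixir sketch of what the native camo_image_url/1 might compute (the real implementation sits behind Philomena.Native, outside this commit), assuming base64url encoding for both the digest and the URL:

defmodule CamoUrlSketch do
  # Assumed scheme: https://<camo_host>/<base64url(HMAC-SHA1(camo_key, url))>/<base64url(url)>
  def image_url(url) do
    host = Application.fetch_env!(:philomena, :camo_host)
    key = Application.fetch_env!(:philomena, :camo_key)

    # Sign the upstream URL so the proxy only fetches what the app vouched for.
    digest =
      :crypto.mac(:hmac, :sha, key, url)
      |> Base.url_encode64(padding: false)

    "https://#{host}/#{digest}/#{Base.url_encode64(url, padding: false)}"
  end
end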

lib/philomena_proxy/http.ex (new file, 107 lines)
View file

@@ -0,0 +1,107 @@
defmodule PhilomenaProxy.Http do
@moduledoc """
HTTP client implementation.
This applies the Philomena User-Agent header, and optionally proxies traffic through a SOCKS5
HTTP proxy to allow the application to connect when the local network is restricted.
If a proxy host is not specified in the configuration, then a proxy is not used and external
traffic originates from the same network as the application.
Proxy options are read from environment variables at runtime by Philomena.
config :philomena,
proxy_host: System.get_env("PROXY_HOST"),
"""
@type url :: String.t()
@type header_list :: [{String.t(), String.t()}]
@type body :: binary()
@type client_options :: keyword()
@doc ~S"""
Perform an HTTP GET request.
## Example
iex> PhilomenaProxy.Http.get("http://example.com", [{"authorization", "Bearer #{token}"}])
{:ok, %Tesla.Env{...}}
iex> PhilomenaProxy.Http.get("http://nonexistent.example.com")
{:error, %Mint.TransportError{reason: :nxdomain}}
"""
@spec get(url(), header_list(), client_options()) :: Tesla.Env.result()
def get(url, headers \\ [], options \\ []) do
Tesla.get(client(headers), url, opts: [adapter: adapter_opts(options)])
end
@doc ~S"""
Perform an HTTP HEAD request.
## Example
iex> PhilomenaProxy.Http.head("http://example.com", [{"authorization", "Bearer #{token}"}])
{:ok, %Tesla.Env{...}}
iex> PhilomenaProxy.Http.head("http://nonexistent.example.com")
{:error, %Mint.TransportError{reason: :nxdomain}}
"""
@spec head(url(), header_list(), client_options()) :: Tesla.Env.result()
def head(url, headers \\ [], options \\ []) do
Tesla.head(client(headers), url, opts: [adapter: adapter_opts(options)])
end
@doc ~S"""
Perform an HTTP POST request.
## Example
iex> PhilomenaProxy.Http.post("http://example.com", "", [{"authorization", "Bearer #{token}"}])
{:ok, %Tesla.Env{...}}
iex> PhilomenaProxy.Http.post("http://nonexistent.example.com", "")
{:error, %Mint.TransportError{reason: :nxdomain}}
"""
@spec post(url(), body(), header_list(), client_options()) :: Tesla.Env.result()
def post(url, body, headers \\ [], options \\ []) do
Tesla.post(client(headers), url, body, opts: [adapter: adapter_opts(options)])
end
defp adapter_opts(opts) do
opts = Keyword.merge(opts, max_body: 125_000_000, inet6: true)
case Application.get_env(:philomena, :proxy_host) do
nil ->
opts
url ->
Keyword.merge(opts, proxy: proxy_opts(URI.parse(url)))
end
end
defp proxy_opts(%{host: host, port: port, scheme: "https"}),
do: {:https, host, port, [transport_opts: [inet6: true]]}
defp proxy_opts(%{host: host, port: port, scheme: "http"}),
do: {:http, host, port, [transport_opts: [inet6: true]]}
defp client(headers) do
Tesla.client(
[
{Tesla.Middleware.FollowRedirects, max_redirects: 1},
{Tesla.Middleware.Headers,
[
{"User-Agent",
"Mozilla/5.0 (X11; Philomena; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0"}
| headers
]}
],
Tesla.Adapter.Mint
)
end
end
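
A minimal usage sketch for the new module; the proxy address below is an invented example standing in for whatever PROXY_HOST resolves to:

# With no :proxy_host configured, adapter_opts/1 above leaves the request
# direct; setting one routes it through the parsed http/https proxy.
Application.put_env(:philomena, :proxy_host, "http://localhost:8118")

case PhilomenaProxy.Http.get("https://example.com/status.json", [{"accept", "application/json"}]) do
  {:ok, %Tesla.Env{status: 200, body: body}} -> {:ok, body}
  {:ok, %Tesla.Env{status: status}} -> {:error, {:unexpected_status, status}}
  {:error, reason} -> {:error, reason}
end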

View file

@@ -0,0 +1,71 @@
defmodule PhilomenaProxy.Scrapers do
@moduledoc """
Scrape utilities to facilitate uploading media from other websites.
"""
# The URL to fetch, as a string.
@type url :: String.t()
# An individual image in a list associated with a scrape result.
@type image_result :: %{
url: url(),
camo_url: url()
}
# Result of a successful scrape.
@type scrape_result :: %{
source_url: url(),
description: String.t() | nil,
author_name: String.t() | nil,
images: [image_result()]
}
@scrapers [
PhilomenaProxy.Scrapers.Deviantart,
PhilomenaProxy.Scrapers.Pillowfort,
PhilomenaProxy.Scrapers.Twitter,
PhilomenaProxy.Scrapers.Tumblr,
PhilomenaProxy.Scrapers.Raw
]
@doc """
Scrape a URL for content.
The scrape result is intended for serialization to JSON.
## Examples
iex> PhilomenaProxy.Scrapers.scrape!("http://example.org/image-page")
%{
source_url: "http://example.org/image-page",
description: "Test",
author_name: "myself",
images: [
%{
url: "http://example.org/image.png"
camo_url: "http://example.net/UT2YIjkWDas6CQBmQcYlcNGmKfQ/aHR0cDovL2V4YW1wbGUub3JnL2ltY"
}
]
}
iex> PhilomenaProxy.Scrapers.scrape!("http://example.org/nonexistent-path")
nil
"""
@spec scrape!(url()) :: scrape_result() | nil
def scrape!(url) do
uri = URI.parse(url)
@scrapers
|> Enum.find(& &1.can_handle?(uri, url))
|> wrap()
|> Enum.map(& &1.scrape(uri, url))
|> unwrap()
end
defp wrap(nil), do: []
defp wrap(res), do: [res]
defp unwrap([result]), do: result
defp unwrap(_result), do: nil
end
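
The wrap/unwrap pair is a point-free way of running at most one scraper and collapsing "no match" to nil. An equivalent, more explicit formulation (a sketch for clarity, not part of the commit):

def scrape!(url) do
  uri = URI.parse(url)

  # Dispatch to the first scraper that claims the URL, or nil if none does.
  case Enum.find(@scrapers, & &1.can_handle?(uri, url)) do
    nil -> nil
    scraper -> scraper.scrape(uri, url)
  end
end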

View file

@@ -1,4 +1,11 @@
defmodule Philomena.Scrapers.Deviantart do
defmodule PhilomenaProxy.Scrapers.Deviantart do
@moduledoc false
alias PhilomenaProxy.Scrapers.Scraper
alias PhilomenaProxy.Scrapers
@behaviour Scraper
@image_regex ~r|data-rh="true" rel="preload" href="([^"]*)" as="image"|
@source_regex ~r|rel="canonical" href="([^"]*)"|
@artist_regex ~r|https://www.deviantart.com/([^/]*)/art|
@@ -7,7 +14,7 @@ defmodule Philomena.Scrapers.Deviantart do
@png_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.png/v1/fill/[0-9a-z_,]+/[0-9a-z_\-]+)(\.png)(.*)|
@jpg_regex ~r|(https://[0-9a-z\-\.]+(?:/intermediary)?/f/[0-9a-f\-]+/[0-9a-z\-]+\.jpg/v1/fill/w_[0-9]+,h_[0-9]+,q_)([0-9]+)(,[a-z]+\/[a-z0-6_\-]+\.jpe?g.*)|
@spec can_handle?(URI.t(), String.t()) :: true | false
@spec can_handle?(URI.t(), String.t()) :: boolean()
def can_handle?(uri, _url) do
String.ends_with?(uri.host, "deviantart.com")
end
@@ -21,6 +28,7 @@ defmodule Philomena.Scrapers.Deviantart do
#
# So, regex it is. Eat dirt, deviantart. You don't deserve the respect
# artists give you.
@spec scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
def scrape(_uri, url) do
url
|> follow_redirect(2)
@@ -38,10 +46,11 @@ defmodule Philomena.Scrapers.Deviantart do
%{
source_url: source,
author_name: artist,
description: "",
images: [
%{
url: image,
camo_url: Camo.Image.image_url(image)
camo_url: PhilomenaProxy.Camo.image_url(image)
}
]
}
@@ -51,7 +60,7 @@ defmodule Philomena.Scrapers.Deviantart do
with [domain, object_uuid, object_name] <-
Regex.run(@cdnint_regex, image.url, capture: :all_but_first),
built_url <- "#{domain}/intermediary/f/#{object_uuid}/#{object_name}",
{:ok, %Tesla.Env{status: 200}} <- Philomena.Http.head(built_url) do
{:ok, %Tesla.Env{status: 200}} <- PhilomenaProxy.Http.head(built_url) do
# This is the high resolution URL.
%{
data
@@ -110,7 +119,7 @@ defmodule Philomena.Scrapers.Deviantart do
built_url = "http://orig01.deviantart.net/x_by_x-d#{base36}.png"
case Philomena.Http.get(built_url) do
case PhilomenaProxy.Http.get(built_url) do
{:ok, %Tesla.Env{status: 301, headers: headers}} ->
# Location header provides URL of high res image.
{_location, link} = Enum.find(headers, fn {header, _val} -> header == "location" end)
@@ -135,7 +144,7 @@ defmodule Philomena.Scrapers.Deviantart do
defp follow_redirect(_url, 0), do: nil
defp follow_redirect(url, max_times) do
case Philomena.Http.get(url) do
case PhilomenaProxy.Http.get(url) do
{:ok, %Tesla.Env{headers: headers, status: code}} when code in [301, 302] ->
location = Enum.find_value(headers, &location_header/1)
follow_redirect(location, max_times - 1)

View file

@@ -1,4 +1,11 @@
defmodule Philomena.Scrapers.Pillowfort do
defmodule PhilomenaProxy.Scrapers.Pillowfort do
@moduledoc false
alias PhilomenaProxy.Scrapers.Scraper
alias PhilomenaProxy.Scrapers
@behaviour Scraper
@url_regex ~r|\Ahttps?://www\.pillowfort\.social/posts/([0-9]+)|
@spec can_handle?(URI.t(), String.t()) :: boolean()
@@ -6,12 +13,13 @@ defmodule Philomena.Scrapers.Pillowfort do
String.match?(url, @url_regex)
end
@spec scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
def scrape(_uri, url) do
[post_id] = Regex.run(@url_regex, url, capture: :all_but_first)
api_url = "https://www.pillowfort.social/posts/#{post_id}/json"
Philomena.Http.get(api_url)
PhilomenaProxy.Http.get(api_url)
|> json!()
|> process_response!(url)
end
@@ -25,7 +33,7 @@ defmodule Philomena.Scrapers.Pillowfort do
|> Enum.map(
&%{
url: &1["url"],
camo_url: Camo.Image.image_url(&1["small_image_url"])
camo_url: PhilomenaProxy.Camo.image_url(&1["small_image_url"])
}
)

View file

@@ -1,9 +1,16 @@
defmodule Philomena.Scrapers.Raw do
defmodule PhilomenaProxy.Scrapers.Raw do
@moduledoc false
alias PhilomenaProxy.Scrapers.Scraper
alias PhilomenaProxy.Scrapers
@behaviour Scraper
@mime_types ["image/gif", "image/jpeg", "image/png", "image/svg", "image/svg+xml", "video/webm"]
@spec can_handle?(URI.t(), String.t()) :: true | false
@spec can_handle?(URI.t(), String.t()) :: boolean()
def can_handle?(_uri, url) do
Philomena.Http.head(url)
PhilomenaProxy.Http.head(url)
|> case do
{:ok, %Tesla.Env{status: 200, headers: headers}} ->
headers
@@ -16,13 +23,16 @@ defmodule Philomena.Scrapers.Raw do
end
end
@spec scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
def scrape(_uri, url) do
%{
source_url: url,
author_name: "",
description: "",
images: [
%{
url: url,
camo_url: Camo.Image.image_url(url)
camo_url: PhilomenaProxy.Camo.image_url(url)
}
]
}

View file

@@ -0,0 +1,11 @@
defmodule PhilomenaProxy.Scrapers.Scraper do
@moduledoc false
alias PhilomenaProxy.Scrapers
# Return whether the given URL can be parsed by the scraper
@callback can_handle?(URI.t(), Scrapers.url()) :: boolean()
# Collect upload information from the URL
@callback scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
end
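
To illustrate the contract, here is a hypothetical scraper conforming to the behaviour; the module name and URL scheme are invented for this sketch:

defmodule PhilomenaProxy.Scrapers.ExampleBooru do
  @moduledoc false

  alias PhilomenaProxy.Scrapers.Scraper
  alias PhilomenaProxy.Scrapers

  @behaviour Scraper

  @url_regex ~r|\Ahttps?://booru\.example/posts/(\d+)|

  @spec can_handle?(URI.t(), Scrapers.url()) :: boolean()
  def can_handle?(_uri, url), do: String.match?(url, @url_regex)

  @spec scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
  def scrape(_uri, url) do
    # A real scraper would fetch and parse the page via PhilomenaProxy.Http here.
    %{
      source_url: url,
      description: nil,
      author_name: nil,
      images: [%{url: url, camo_url: PhilomenaProxy.Camo.image_url(url)}]
    }
  end
end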

View file

@@ -1,4 +1,11 @@
defmodule Philomena.Scrapers.Tumblr do
defmodule PhilomenaProxy.Scrapers.Tumblr do
@moduledoc false
alias PhilomenaProxy.Scrapers.Scraper
alias PhilomenaProxy.Scrapers
@behaviour Scraper
@url_regex ~r|\Ahttps?://(?:.*)/(?:image\|post)/(\d+)(?:\z\|[/?#])|
@media_regex ~r|https?://(?:\d+\.)?media\.tumblr\.com/[a-f\d]+/[a-f\d]+-[a-f\d]+/s\d+x\d+/[a-f\d]+\.(?:png\|jpe?g\|gif)|i
@size_regex ~r|_(\d+)(\..+)\z|
@@ -18,13 +25,14 @@ defmodule Philomena.Scrapers.Tumblr do
String.match?(url, @url_regex) and tumblr_domain?(uri.host)
end
@spec scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
def scrape(uri, url) do
[post_id] = Regex.run(@url_regex, url, capture: :all_but_first)
api_url =
"https://api.tumblr.com/v2/blog/#{uri.host}/posts/photo?id=#{post_id}&api_key=#{tumblr_api_key()}"
Philomena.Http.get(api_url)
PhilomenaProxy.Http.get(api_url)
|> json!()
|> process_response!()
end
@@ -44,7 +52,7 @@ defmodule Philomena.Scrapers.Tumblr do
%{"url" => preview} =
Enum.find(photo["alt_sizes"], &(&1["width"] == 400)) || %{"url" => image}
%{url: image, camo_url: Camo.Image.image_url(preview)}
%{url: image, camo_url: PhilomenaProxy.Camo.image_url(preview)}
end)
add_meta(post, images)
@@ -55,7 +63,7 @@ defmodule Philomena.Scrapers.Tumblr do
@media_regex
|> Regex.scan(post["body"])
|> Enum.map(fn [url | _captures] ->
%{url: url, camo_url: Camo.Image.image_url(url)}
%{url: url, camo_url: PhilomenaProxy.Camo.image_url(url)}
end)
add_meta(post, images)
@@ -68,7 +76,7 @@ defmodule Philomena.Scrapers.Tumblr do
end
defp url_ok?(url) do
match?({:ok, %Tesla.Env{status: 200}}, Philomena.Http.head(url))
match?({:ok, %Tesla.Env{status: 200}}, PhilomenaProxy.Http.head(url))
end
defp add_meta(post, images) do

View file

@@ -1,16 +1,24 @@
defmodule Philomena.Scrapers.Twitter do
defmodule PhilomenaProxy.Scrapers.Twitter do
@moduledoc false
alias PhilomenaProxy.Scrapers.Scraper
alias PhilomenaProxy.Scrapers
@behaviour Scraper
@url_regex ~r|\Ahttps?://(?:mobile\.)?(?:twitter\|x).com/([A-Za-z\d_]+)/status/([\d]+)/?|
@spec can_handle?(URI.t(), String.t()) :: true | false
@spec can_handle?(URI.t(), String.t()) :: boolean()
def can_handle?(_uri, url) do
String.match?(url, @url_regex)
end
@spec scrape(URI.t(), Scrapers.url()) :: Scrapers.scrape_result()
def scrape(_uri, url) do
[user, status_id] = Regex.run(@url_regex, url, capture: :all_but_first)
api_url = "https://api.fxtwitter.com/#{user}/status/#{status_id}"
{:ok, %Tesla.Env{status: 200, body: body}} = Philomena.Http.get(api_url)
{:ok, %Tesla.Env{status: 200, body: body}} = PhilomenaProxy.Http.get(api_url)
json = Jason.decode!(body)
tweet = json["tweet"]
@@ -19,7 +27,7 @@ defmodule Philomena.Scrapers.Twitter do
Enum.map(tweet["media"]["photos"], fn p ->
%{
url: "#{p["url"]}:orig",
camo_url: Camo.Image.image_url(p["url"])
camo_url: PhilomenaProxy.Camo.image_url(p["url"])
}
end)

View file

@@ -1,7 +1,7 @@
defmodule PhilomenaWeb.Image.ScrapeController do
use PhilomenaWeb, :controller
alias Philomena.Scrapers
alias PhilomenaProxy.Scrapers
def create(conn, params) do
result =

View file

@@ -31,7 +31,7 @@ defmodule PhilomenaWeb.CheckCaptchaPlug do
defp valid_solution?(%{"h-captcha-response" => captcha_token}) do
{:ok, %{body: body, status: 200}} =
Philomena.Http.post(
PhilomenaProxy.Http.post(
"https://hcaptcha.com/siteverify",
URI.encode_query(%{"response" => captcha_token, "secret" => hcaptcha_secret_key()}),
[{"Content-Type", "application/x-www-form-urlencoded"}]

View file

@@ -35,7 +35,7 @@ defmodule PhilomenaWeb.CompromisedPasswordCheckPlug do
:crypto.hash(:sha, password)
|> Base.encode16()
case Philomena.Http.get(make_api_url(prefix)) do
case PhilomenaProxy.Http.get(make_api_url(prefix)) do
{:ok, %Tesla.Env{body: body, status: 200}} -> String.contains?(body, rest)
_ -> false
end
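
This hunk is a k-anonymity range check: only the first five hex characters of the password's SHA-1 digest leave the server, and the response is scanned locally for the remainder. A self-contained sketch of the whole check, assuming make_api_url/1 targets the Have I Been Pwned range API:

defp password_compromised?(password) do
  # Split the hex digest into the prefix (sent upstream) and the rest (kept local).
  <<prefix::binary-size(5), rest::binary>> =
    :crypto.hash(:sha, password) |> Base.encode16()

  case PhilomenaProxy.Http.get("https://api.pwnedpasswords.com/range/#{prefix}") do
    {:ok, %Tesla.Env{status: 200, body: body}} -> String.contains?(body, rest)
    _ -> false
  end
end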

View file

@@ -15,7 +15,7 @@ defmodule PhilomenaWeb.ScraperPlug do
%{"scraper_cache" => url} when not is_nil(url) and url != "" ->
url
|> Philomena.Http.get()
|> PhilomenaProxy.Http.get()
|> maybe_fixup_params(url, opts, conn)
_ ->

View file

@@ -4,20 +4,24 @@ defmodule PhilomenaWeb.ChannelView do
def channel_image(%{type: "LivestreamChannel", short_name: short_name}) do
now = DateTime.utc_now() |> DateTime.to_unix(:microsecond)
Camo.Image.image_url(
PhilomenaProxy.Camo.image_url(
"https://thumbnail.api.livestream.com/thumbnail?name=#{short_name}&rand=#{now}"
)
end
def channel_image(%{type: "PicartoChannel", thumbnail_url: thumbnail_url}),
do: Camo.Image.image_url(thumbnail_url || "https://picarto.tv/images/missingthumb.jpg")
do:
PhilomenaProxy.Camo.image_url(thumbnail_url || "https://picarto.tv/images/missingthumb.jpg")
def channel_image(%{type: "PiczelChannel", remote_stream_id: remote_stream_id}),
do: Camo.Image.image_url("https://piczel.tv/api/thumbnail/stream_#{remote_stream_id}.jpg")
do:
PhilomenaProxy.Camo.image_url(
"https://piczel.tv/api/thumbnail/stream_#{remote_stream_id}.jpg"
)
def channel_image(%{type: "TwitchChannel", short_name: short_name}),
do:
Camo.Image.image_url(
PhilomenaProxy.Camo.image_url(
"https://static-cdn.jtvnw.net/previews-ttv/live_user_#{String.downcase(short_name)}-320x180.jpg"
)
end

View file

@@ -52,7 +52,7 @@ for image_def <- resources["remote_images"] do
now = DateTime.utc_now() |> DateTime.to_unix(:microsecond)
IO.puts "Fetching #{image_def["url"]} ..."
{:ok, %{body: body}} = Philomena.Http.get(image_def["url"])
{:ok, %{body: body}} = PhilomenaProxy.Http.get(image_def["url"])
File.write!(file, body)