various scraper fixes

This commit is contained in:
byte[] 2019-12-18 18:51:02 -05:00
parent d6e89a1449
commit db1787aa45
5 changed files with 31 additions and 7 deletions

View file

@@ -20,6 +20,7 @@ function setupImageUpload() {
const [ fileField, remoteUrl, scraperError ] = $$('.js-scraper', form);
const [ sourceEl, tagsEl, descrEl ] = $$('.js-image-input', form);
const fetchButton = $('#js-scraper-preview');
if (!fetchButton) return;
function showImages(images) {
clearEl(imgPreviews);

View file

@@ -2,7 +2,7 @@ defmodule Philomena.Scrapers do
@scrapers [
Philomena.Scrapers.Deviantart,
Philomena.Scrapers.Twitter,
#Philomena.Scrapers.Tumblr, # blocked on benoitc/hackney#566
Philomena.Scrapers.Tumblr,
Philomena.Scrapers.Raw
]
@@ -21,4 +21,4 @@ defmodule Philomena.Scrapers do
# Reduce the collected scraper results to a single value: exactly one
# result is unwrapped and returned bare; zero or many results become nil.
defp unwrap(results) do
  case results do
    [only] -> only
    _ -> nil
  end
end
end
end

View file

@@ -23,7 +23,7 @@ defmodule Philomena.Scrapers.Deviantart do
# artists give you.
def scrape(_uri, url) do
url
|> Philomena.Http.get!([], follow_redirect: true, max_redirect: 2)
|> follow_redirect(2)
|> extract_data!()
|> try_intermediary_hires!()
|> try_new_hires!()
@@ -132,4 +132,21 @@ defmodule Philomena.Scrapers.Deviantart do
data
end
end
end
# Workaround for benoitc/hackney#273: hackney's built-in redirect
# following is unreliable here, so 301/302 responses are chased manually.
#
# Returns the first non-redirect response, or nil when the redirect
# budget is exhausted or a redirect response carries no Location header
# (previously that case crashed by fetching a nil URL).
defp follow_redirect(_url, 0), do: nil
defp follow_redirect(nil, _max_times), do: nil

defp follow_redirect(url, max_times) do
  case Philomena.Http.get!(url) do
    %HTTPoison.Response{headers: headers, status_code: code} when code in [301, 302] ->
      headers
      |> Enum.find_value(&location_header/1)
      |> follow_redirect(max_times - 1)

    response ->
      response
  end
end
# Pull the redirect target out of a single response-header tuple,
# accepting either capitalization of the Location header name.
defp location_header({name, target}) when name in ["Location", "location"], do: target
defp location_header(_header), do: nil
end

View file

@@ -17,7 +17,7 @@ defmodule Philomena.Scrapers.Twitter do
defp extract_data(tweet) do
images =
tweet["entities"]["media"]
|> Enum.map(&%{url: &1["media_url_https"], camo_url: Camo.Image.image_url(&1["media_url_https"])})
|> Enum.map(&%{url: &1["media_url_https"] <> ":orig", camo_url: Camo.Image.image_url(&1["media_url_https"])})
%{
source_url: tweet["url"],
@@ -62,4 +62,4 @@ defmodule Philomena.Scrapers.Twitter do
{gt, bearer}
end
end
end

View file

@@ -2,7 +2,13 @@ defmodule PhilomenaWeb.ScraperPlug do
# Plug init/1 callback: the options are used as-is at call time,
# so no compile-time transformation is performed.
def init(opts) do
  opts
end
def call(conn, opts) do
params_name = Keyword.get(opts, :params_name, "image")
params_key = Keyword.get(opts, :params_key, "image")
case conn.params do
%{^params_name => %{^params_key => %Plug.Upload{}}} ->
conn
%{"scraper_cache" => url} ->
Philomena.Http.get!(url, [], max_body_length: 30_000_000)
|> maybe_fixup_params(opts, conn)
@@ -36,4 +36,4 @@ defmodule PhilomenaWeb.ScraperPlug do
%{conn | params: updated_params}
end
# Fallback clause: when the preceding clause does not match the scrape
# response (its head is outside this view — presumably a successful
# download; verify against the full module), pass the conn through
# unchanged rather than rewriting its params.
defp maybe_fixup_params(_response, _opts, conn), do: conn
end
end