From db1787aa457a67a7e57e4369fe4bf101bee5a5d1 Mon Sep 17 00:00:00 2001 From: "byte[]" Date: Wed, 18 Dec 2019 18:51:02 -0500 Subject: [PATCH] various scraper fixes --- assets/js/upload.js | 1 + lib/philomena/scrapers.ex | 4 ++-- lib/philomena/scrapers/deviantart.ex | 21 +++++++++++++++++++-- lib/philomena/scrapers/twitter.ex | 4 ++-- lib/philomena_web/plugs/scraper_plug.ex | 8 +++++++- 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/assets/js/upload.js b/assets/js/upload.js index 4f1a0d57..f6368568 100644 --- a/assets/js/upload.js +++ b/assets/js/upload.js @@ -20,6 +20,7 @@ function setupImageUpload() { const [ fileField, remoteUrl, scraperError ] = $$('.js-scraper', form); const [ sourceEl, tagsEl, descrEl ] = $$('.js-image-input', form); const fetchButton = $('#js-scraper-preview'); + if (!fetchButton) return; function showImages(images) { clearEl(imgPreviews); diff --git a/lib/philomena/scrapers.ex b/lib/philomena/scrapers.ex index 91c07571..ed88a9bb 100644 --- a/lib/philomena/scrapers.ex +++ b/lib/philomena/scrapers.ex @@ -2,7 +2,7 @@ defmodule Philomena.Scrapers do @scrapers [ Philomena.Scrapers.Deviantart, Philomena.Scrapers.Twitter, - #Philomena.Scrapers.Tumblr, # blocked on benoitc/hackney#566 + Philomena.Scrapers.Tumblr, Philomena.Scrapers.Raw ] @@ -21,4 +21,4 @@ defmodule Philomena.Scrapers do defp unwrap([result]), do: result defp unwrap(_result), do: nil -end \ No newline at end of file +end diff --git a/lib/philomena/scrapers/deviantart.ex b/lib/philomena/scrapers/deviantart.ex index b6c1841e..f86b93df 100644 --- a/lib/philomena/scrapers/deviantart.ex +++ b/lib/philomena/scrapers/deviantart.ex @@ -23,7 +23,7 @@ defmodule Philomena.Scrapers.Deviantart do # artists give you. def scrape(_uri, url) do url - |> Philomena.Http.get!([], follow_redirect: true, max_redirect: 2) + |> follow_redirect(2) |> extract_data!() |> try_intermediary_hires!() |> try_new_hires!() @@ -132,4 +132,21 @@ defmodule Philomena.Scrapers.Deviantart do data end end -end \ No newline at end of file + + # Workaround for benoitc/hackney#273 + defp follow_redirect(_url, 0), do: nil + defp follow_redirect(url, max_times) do + case Philomena.Http.get!(url) do + %HTTPoison.Response{headers: headers, status_code: code} when code in [301, 302] -> + location = Enum.find_value(headers, &location_header/1) + follow_redirect(location, max_times - 1) + + response -> + response + end + end + + defp location_header({"Location", location}), do: location + defp location_header({"location", location}), do: location + defp location_header(_), do: nil +end diff --git a/lib/philomena/scrapers/twitter.ex b/lib/philomena/scrapers/twitter.ex index b319a8c0..31ffc594 100644 --- a/lib/philomena/scrapers/twitter.ex +++ b/lib/philomena/scrapers/twitter.ex @@ -17,7 +17,7 @@ defmodule Philomena.Scrapers.Twitter do defp extract_data(tweet) do images = tweet["entities"]["media"] - |> Enum.map(&%{url: &1["media_url_https"], camo_url: Camo.Image.image_url(&1["media_url_https"])}) + |> Enum.map(&%{url: &1["media_url_https"] <> ":orig", camo_url: Camo.Image.image_url(&1["media_url_https"])}) %{ source_url: tweet["url"], @@ -62,4 +62,4 @@ defmodule Philomena.Scrapers.Twitter do {gt, bearer} end -end \ No newline at end of file +end diff --git a/lib/philomena_web/plugs/scraper_plug.ex b/lib/philomena_web/plugs/scraper_plug.ex index 24d61544..a094ec9f 100644 --- a/lib/philomena_web/plugs/scraper_plug.ex +++ b/lib/philomena_web/plugs/scraper_plug.ex @@ -2,7 +2,13 @@ defmodule PhilomenaWeb.ScraperPlug do def init(opts), do: opts def call(conn, opts) do + params_name = Keyword.get(opts, :params_name, "image") + params_key = Keyword.get(opts, :params_key, "image") + case conn.params do + %{^params_name => %{^params_key => %Plug.Upload{}}} -> + conn + %{"scraper_cache" => url} -> Philomena.Http.get!(url, [], max_body_length: 30_000_000) |> maybe_fixup_params(opts, conn) @@ -36,4 +42,4 @@ defmodule PhilomenaWeb.ScraperPlug do %{conn | params: updated_params} end defp maybe_fixup_params(_response, _opts, conn), do: conn -end \ No newline at end of file +end