mirror of
https://github.com/philomena-dev/philomena.git
synced 2025-02-01 03:46:44 +01:00
various scraper fixes
This commit is contained in:
parent
d6e89a1449
commit
db1787aa45
5 changed files with 31 additions and 7 deletions
|
@ -20,6 +20,7 @@ function setupImageUpload() {
|
||||||
const [ fileField, remoteUrl, scraperError ] = $$('.js-scraper', form);
|
const [ fileField, remoteUrl, scraperError ] = $$('.js-scraper', form);
|
||||||
const [ sourceEl, tagsEl, descrEl ] = $$('.js-image-input', form);
|
const [ sourceEl, tagsEl, descrEl ] = $$('.js-image-input', form);
|
||||||
const fetchButton = $('#js-scraper-preview');
|
const fetchButton = $('#js-scraper-preview');
|
||||||
|
if (!fetchButton) return;
|
||||||
|
|
||||||
function showImages(images) {
|
function showImages(images) {
|
||||||
clearEl(imgPreviews);
|
clearEl(imgPreviews);
|
||||||
|
|
|
@ -2,7 +2,7 @@ defmodule Philomena.Scrapers do
|
||||||
@scrapers [
|
@scrapers [
|
||||||
Philomena.Scrapers.Deviantart,
|
Philomena.Scrapers.Deviantart,
|
||||||
Philomena.Scrapers.Twitter,
|
Philomena.Scrapers.Twitter,
|
||||||
#Philomena.Scrapers.Tumblr, # blocked on benoitc/hackney#566
|
Philomena.Scrapers.Tumblr,
|
||||||
Philomena.Scrapers.Raw
|
Philomena.Scrapers.Raw
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,7 @@ defmodule Philomena.Scrapers.Deviantart do
|
||||||
# artists give you.
|
# artists give you.
|
||||||
def scrape(_uri, url) do
|
def scrape(_uri, url) do
|
||||||
url
|
url
|
||||||
|> Philomena.Http.get!([], follow_redirect: true, max_redirect: 2)
|
|> follow_redirect(2)
|
||||||
|> extract_data!()
|
|> extract_data!()
|
||||||
|> try_intermediary_hires!()
|
|> try_intermediary_hires!()
|
||||||
|> try_new_hires!()
|
|> try_new_hires!()
|
||||||
|
@ -132,4 +132,21 @@ defmodule Philomena.Scrapers.Deviantart do
|
||||||
data
|
data
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Workaround for benoitc/hackney#273: redirects are followed by hand, one
# request per hop.
#
# Fetches `url` and, while the response is an HTTP redirect, requests the
# target named by its Location header. At most `max_times` requests are made.
#
# Returns the first non-redirect response, or nil when the request budget is
# exhausted or a redirect response carries no Location header.
defp follow_redirect(_url, 0), do: nil

defp follow_redirect(url, max_times) do
  case Philomena.Http.get!(url) do
    %HTTPoison.Response{headers: headers, status_code: code}
    when code in [301, 302, 303, 307, 308] ->
      case Enum.find_value(headers, &location_header/1) do
        # A redirect without a target cannot be followed; give up the same
        # way as when the request budget runs out instead of requesting nil.
        nil ->
          nil

        location ->
          # Location may be a relative reference, so resolve it against the
          # URL that was just requested. Absolute targets pass through.
          url
          |> URI.merge(location)
          |> URI.to_string()
          |> follow_redirect(max_times - 1)
      end

    response ->
      response
  end
end
|
||||||
|
|
||||||
|
# Returns the value of a `{name, value}` header tuple when the header is
# Location, and nil for anything else, which makes it usable as the callback
# of Enum.find_value/2 over a header list.
#
# HTTP header names are case-insensitive, so the name is compared after
# lowercasing rather than against a fixed set of spellings.
defp location_header({name, location}) when is_binary(name) do
  if String.downcase(name) == "location", do: location
end

defp location_header(_), do: nil
|
||||||
end
|
end
|
|
@ -17,7 +17,7 @@ defmodule Philomena.Scrapers.Twitter do
|
||||||
defp extract_data(tweet) do
|
defp extract_data(tweet) do
|
||||||
images =
|
images =
|
||||||
tweet["entities"]["media"]
|
tweet["entities"]["media"]
|
||||||
|> Enum.map(&%{url: &1["media_url_https"], camo_url: Camo.Image.image_url(&1["media_url_https"])})
|
|> Enum.map(&%{url: &1["media_url_https"] <> ":orig", camo_url: Camo.Image.image_url(&1["media_url_https"])})
|
||||||
|
|
||||||
%{
|
%{
|
||||||
source_url: tweet["url"],
|
source_url: tweet["url"],
|
||||||
|
|
|
@ -2,7 +2,13 @@ defmodule PhilomenaWeb.ScraperPlug do
|
||||||
def init(opts), do: opts
|
def init(opts), do: opts
|
||||||
|
|
||||||
def call(conn, opts) do
|
def call(conn, opts) do
|
||||||
|
params_name = Keyword.get(opts, :params_name, "image")
|
||||||
|
params_key = Keyword.get(opts, :params_key, "image")
|
||||||
|
|
||||||
case conn.params do
|
case conn.params do
|
||||||
|
%{^params_name => %{^params_key => %Plug.Upload{}}} ->
|
||||||
|
conn
|
||||||
|
|
||||||
%{"scraper_cache" => url} ->
|
%{"scraper_cache" => url} ->
|
||||||
Philomena.Http.get!(url, [], max_body_length: 30_000_000)
|
Philomena.Http.get!(url, [], max_body_length: 30_000_000)
|
||||||
|> maybe_fixup_params(opts, conn)
|
|> maybe_fixup_params(opts, conn)
|
||||||
|
|
Loading…
Reference in a new issue