Mirror of https://github.com/philomena-dev/philomena.git, synced 2025-02-01 03:46:44 +01:00
various scraper fixes
This commit is contained in:
parent d6e89a1449
commit db1787aa45
5 changed files with 31 additions and 7 deletions

@@ -20,6 +20,7 @@ function setupImageUpload() {
  const [ fileField, remoteUrl, scraperError ] = $$('.js-scraper', form);
  const [ sourceEl, tagsEl, descrEl ] = $$('.js-image-input', form);
  const fetchButton = $('#js-scraper-preview');
  if (!fetchButton) return;

  function showImages(images) {
    clearEl(imgPreviews);

@@ -2,7 +2,7 @@ defmodule Philomena.Scrapers do
   @scrapers [
     Philomena.Scrapers.Deviantart,
     Philomena.Scrapers.Twitter,
-    #Philomena.Scrapers.Tumblr, # blocked on benoitc/hackney#566
+    Philomena.Scrapers.Tumblr,
     Philomena.Scrapers.Raw
   ]

@@ -23,7 +23,7 @@ defmodule Philomena.Scrapers.Deviantart do
   # artists give you.
   def scrape(_uri, url) do
     url
-    |> Philomena.Http.get!([], follow_redirect: true, max_redirect: 2)
+    |> follow_redirect(2)
     |> extract_data!()
     |> try_intermediary_hires!()
     |> try_new_hires!()

@@ -132,4 +132,21 @@ defmodule Philomena.Scrapers.Deviantart do
       data
     end
   end
+
+  # Workaround for benoitc/hackney#273
+  defp follow_redirect(_url, 0), do: nil
+  defp follow_redirect(url, max_times) do
+    case Philomena.Http.get!(url) do
+      %HTTPoison.Response{headers: headers, status_code: code} when code in [301, 302] ->
+        location = Enum.find_value(headers, &location_header/1)
+        follow_redirect(location, max_times - 1)
+
+      response ->
+        response
+    end
+  end
+
+  defp location_header({"Location", location}), do: location
+  defp location_header({"location", location}), do: location
+  defp location_header(_), do: nil
 end
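
An aside, not part of the commit: the two location_header/1 clauses exist because the Location header name can arrive in either case depending on the server, so both spellings are matched before giving up. A tiny, self-contained illustration of the same lookup, using a fabricated header list:

  # Illustrative only; the header values below are made up.
  headers = [{"content-type", "text/html"}, {"location", "https://example.com/next"}]

  Enum.find_value(headers, fn
    {"Location", location} -> location
    {"location", location} -> location
    _ -> nil
  end)
  # => "https://example.com/next"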

@@ -17,7 +17,7 @@ defmodule Philomena.Scrapers.Twitter do
   defp extract_data(tweet) do
     images =
       tweet["entities"]["media"]
-      |> Enum.map(&%{url: &1["media_url_https"], camo_url: Camo.Image.image_url(&1["media_url_https"])})
+      |> Enum.map(&%{url: &1["media_url_https"] <> ":orig", camo_url: Camo.Image.image_url(&1["media_url_https"])})

     %{
       source_url: tweet["url"],
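
A brief note, also not part of the diff: for this older pbs.twimg.com URL style, appending the ":orig" size suffix asks Twitter's CDN for the originally uploaded resolution rather than the default resized rendition, which is why the scraper now builds the URL that way. A made-up example of the transformation:

  # Illustrative only; the media URL is fabricated.
  url = "https://pbs.twimg.com/media/AbCdEf123456.jpg"
  url <> ":orig"
  # => "https://pbs.twimg.com/media/AbCdEf123456.jpg:orig"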

@@ -2,7 +2,13 @@ defmodule PhilomenaWeb.ScraperPlug do
   def init(opts), do: opts

   def call(conn, opts) do
+    params_name = Keyword.get(opts, :params_name, "image")
+    params_key = Keyword.get(opts, :params_key, "image")
+
     case conn.params do
+      %{^params_name => %{^params_key => %Plug.Upload{}}} ->
+        conn
+
       %{"scraper_cache" => url} ->
         Philomena.Http.get!(url, [], max_body_length: 30_000_000)
         |> maybe_fixup_params(opts, conn)
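
A hypothetical usage sketch, not taken from the repository: the option names are the ones the plug reads above, but the controller, route, and parameter layout are invented for illustration.

  defmodule MyAppWeb.BadgeController do
    use Phoenix.Controller

    # Look for a direct upload under params["badge"]["icon"] instead of the
    # default params["image"]["image"]; otherwise fall through to the
    # scraper_cache handling shown in the diff.
    plug PhilomenaWeb.ScraperPlug, params_name: "badge", params_key: "icon"

    def create(conn, _params) do
      text(conn, "ok")
    end
  end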