Mirror of https://github.com/philomena-dev/philomena.git, synced 2024-11-23 20:18:00 +01:00
switch to maintained twitter scraper implementation (#187)
parent 95fae63e0a
commit 9ece7c327c
1 changed file with 12 additions and 63 deletions
@@ -1,8 +1,5 @@
 defmodule Philomena.Scrapers.Twitter do
   @url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?|
-  @script_regex ~r|="(https://abs.twimg.com/responsive-web/client-web(?:-legacy)?/main\.[\da-z]+\.js)"|
-  @bearer_regex ~r|"(AAAAAAAAAAAAA[^"]*)"|
-  @activate_url "https://api.twitter.com/1.1/guest/activate.json"
 
   @spec can_handle?(URI.t(), String.t()) :: true | false
   def can_handle?(_uri, url) do
@@ -10,69 +7,21 @@ defmodule Philomena.Scrapers.Twitter do
   end
 
   def scrape(_uri, url) do
-    api_response!(url)
-    |> extract_data()
-  end
-
-  defp extract_data(tweet) do
-    images =
-      tweet["entities"]["media"]
-      |> Enum.map(
-        &%{
-          url: &1["media_url_https"] <> "?format=jpg&name=4096x4096",
-          camo_url: Camo.Image.image_url(&1["media_url_https"])
-        }
-      )
-
-    %{
-      source_url: tweet["url"],
-      author_name: tweet["user"],
-      description: tweet["text"] || tweet["full_text"],
-      images: images
-    }
-  end
-
-  # We'd like to use the API anonymously. In order to do this, we need to
-  # extract the anonymous bearer token. Fortunately, this is pretty easy
-  # to identify in the minified mobile script source.
-  def api_response!(url) do
     [user, status_id] = Regex.run(@url_regex, url, capture: :all_but_first)
 
-    page_url = "https://twitter.com/#{user}/status/#{status_id}"
+    image_url = "https://d.fxtwitter.com/#{user}/status/#{status_id}.jpg"
 
-    api_url =
-      "https://api.twitter.com/2/timeline/conversation/#{status_id}.json?tweet_mode=extended"
+    {:ok, %Tesla.Env{status: 200}} = Philomena.Http.head(image_url)
 
-    url = "https://twitter.com/#{user}/status/#{status_id}"
-
-    {gt, bearer} =
-      Philomena.Http.get(page_url)
-      |> extract_guest_token_and_bearer()
-
-    {:ok, api_resp} =
-      Philomena.Http.get(api_url, [{"Authorization", "Bearer #{bearer}"}, {"x-guest-token", gt}])
-
-    api_resp
-    |> Map.get(:body)
-    |> Jason.decode!()
-    |> Map.get("globalObjects")
-    |> Map.get("tweets")
-    |> Map.get(status_id)
-    |> Map.put("user", user)
-    |> Map.put("url", url)
-  end
-
-  defp extract_guest_token_and_bearer({:ok, %Tesla.Env{body: page}}) do
-    [script | _] = Regex.run(@script_regex, page, capture: :all_but_first)
-    {:ok, %{body: body}} = Philomena.Http.get(script)
-
-    [bearer] = Regex.run(@bearer_regex, body, capture: :all_but_first)
-
-    {:ok, %{body: body}} =
-      Philomena.Http.post(@activate_url, nil, [{"Authorization", "Bearer #{bearer}"}])
-
-    gt = Map.fetch!(Jason.decode!(body), "guest_token")
-
-    {gt, bearer}
+    %{
+      source_url: "https://twitter.com/#{user}/status/#{status_id}",
+      author_name: user,
+      images: [
+        %{
+          url: image_url,
+          camo_url: Camo.Image.image_url(image_url)
+        }
+      ]
+    }
   end
 end
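For context, a minimal sketch of how the rewritten scraper is exercised after this commit. The module, functions, and the shape of the returned map come from the diff above; the username and status id are made-up placeholders, and the call only succeeds in a running Philomena instance whose HTTP client can reach d.fxtwitter.com.

# Hypothetical usage; "some_user" and the status id are placeholder values.
url = "https://twitter.com/some_user/status/1234567890"
uri = URI.parse(url)

# can_handle?/2 accepts twitter.com status URLs, per @url_regex.
true = Philomena.Scrapers.Twitter.can_handle?(uri, url)

# scrape/2 builds the d.fxtwitter.com image URL, issues a HEAD request to
# confirm the media exists (the {:ok, %Tesla.Env{status: 200}} match raises
# otherwise), and returns a metadata map along the lines of:
#
#   %{
#     source_url: "https://twitter.com/some_user/status/1234567890",
#     author_name: "some_user",
#     images: [
#       %{
#         url: "https://d.fxtwitter.com/some_user/status/1234567890.jpg",
#         camo_url: "..."  # Camo-proxied copy of the same image
#       }
#     ]
#   }
result = Philomena.Scrapers.Twitter.scrape(uri, url)

Compared with the removed guest-token flow, Twitter session handling is delegated entirely to the maintained fxtwitter service, so the scraper needs only a single HEAD request and no bearer or guest tokens.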