mirror of
https://github.com/philomena-dev/philomena.git
synced 2024-11-27 13:47:58 +01:00
fix twitter scraper
This commit is contained in:
parent
b33dece707
commit
d0e0f3dc00
1 changed files with 6 additions and 6 deletions
|
@@ -1,5 +1,5 @@
|
||||||
defmodule Philomena.Scrapers.Twitter do
|
defmodule Philomena.Scrapers.Twitter do
|
||||||
@gt_regex ~r|document.cookie = decodeURIComponent\("gt=(\d+);|
|
@gt_regex ~r|gt=(\d+?);|
|
||||||
@url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?|
|
@url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?|
|
||||||
@script_regex ~r|<script type="text/javascript" .*? src="(https://abs.twimg.com/responsive-web/web/main\.[\da-z]+\.js)">|
|
@script_regex ~r|<script type="text/javascript" .*? src="(https://abs.twimg.com/responsive-web/web/main\.[\da-z]+\.js)">|
|
||||||
@bearer_regex ~r|"(AAAAAAAAAAAAA[^"]*)"|
|
@bearer_regex ~r|"(AAAAAAAAAAAAA[^"]*)"|
|
||||||
|
@@ -38,7 +38,7 @@ defmodule Philomena.Scrapers.Twitter do
|
||||||
def api_response!(url) do
|
def api_response!(url) do
|
||||||
[user, status_id] = Regex.run(@url_regex, url, capture: :all_but_first)
|
[user, status_id] = Regex.run(@url_regex, url, capture: :all_but_first)
|
||||||
|
|
||||||
mobile_url = "https://mobile.twitter.com/#{user}/status/#{status_id}"
|
page_url = "https://twitter.com/#{user}/status/#{status_id}"
|
||||||
|
|
||||||
api_url =
|
api_url =
|
||||||
"https://api.twitter.com/2/timeline/conversation/#{status_id}.json?tweet_mode=extended"
|
"https://api.twitter.com/2/timeline/conversation/#{status_id}.json?tweet_mode=extended"
|
||||||
|
@@ -46,8 +46,7 @@ defmodule Philomena.Scrapers.Twitter do
|
||||||
url = "https://twitter.com/#{user}/status/#{status_id}"
|
url = "https://twitter.com/#{user}/status/#{status_id}"
|
||||||
|
|
||||||
{gt, bearer} =
|
{gt, bearer} =
|
||||||
Philomena.Http.get!(mobile_url)
|
Philomena.Http.get!(page_url)
|
||||||
|> Map.get(:body)
|
|
||||||
|> extract_guest_token_and_bearer()
|
|> extract_guest_token_and_bearer()
|
||||||
|
|
||||||
Philomena.Http.get!(api_url, [{"Authorization", "Bearer #{bearer}"}, {"x-guest-token", gt}])
|
Philomena.Http.get!(api_url, [{"Authorization", "Bearer #{bearer}"}, {"x-guest-token", gt}])
|
||||||
|
@@ -60,8 +59,9 @@ defmodule Philomena.Scrapers.Twitter do
|
||||||
|> Map.put("url", url)
|
|> Map.put("url", url)
|
||||||
end
|
end
|
||||||
|
|
||||||
defp extract_guest_token_and_bearer(page) do
|
defp extract_guest_token_and_bearer(%Tesla.Env{body: page, headers: headers}) do
|
||||||
[gt] = Regex.run(@gt_regex, page, capture: :all_but_first)
|
[{_, gt}] = Enum.filter(headers, fn {k, v} -> k == "set-cookie" and String.starts_with?(v, "gt=") end)
|
||||||
|
[gt] = Regex.run(@gt_regex, gt, capture: :all_but_first)
|
||||||
[script] = Regex.run(@script_regex, page, capture: :all_but_first)
|
[script] = Regex.run(@script_regex, page, capture: :all_but_first)
|
||||||
|
|
||||||
%{body: body} = Philomena.Http.get!(script)
|
%{body: body} = Philomena.Http.get!(script)
|
||||||
|
|
Loading…
Reference in a new issue