#!/usr/bin/env python3
"""
Web resource printer.
"""
#
# Fimfarchive, preserves stories from Fimfiction.
# Copyright (C) 2023 Joakim Soderlund
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from io import BytesIO
from json import dumps
from multiprocessing import Pool
from sys import argv
from typing import Dict, Iterator, List, Tuple
from urllib.parse import urlparse, parse_qs
from zipfile import ZipFile

from bs4 import BeautifulSoup

from fimfarchive.fetchers import FimfarchiveFetcher
from fimfarchive.stories import Story
from fimfarchive.utils import tqdm


def parse(data: bytes) -> Iterator[str]:
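    """
    Yields image sources and link targets from an HTML document.
    """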
    dom = BeautifulSoup(data, features='html.parser')

    for tag in dom.find_all('img'):
        if 'src' in tag.attrs:
            yield tag.attrs['src']

    for tag in dom.find_all('a'):
        if 'href' in tag.attrs:
            yield tag.attrs['href']


def clean(urls: Iterator[str]) -> Iterator[str]:
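    """
    Normalizes URLs, unwrapping known proxy and redirect wrappers.
    """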
    for url in urls:
        # Strip leaked BBCode tags, markup, and line breaks.
        url = url.split('[/')[0]
        url = url.split('">')[0]
        url = url.replace('\n', '')
        url = url.replace('\r', '')

        if not url.strip():
            continue

        # Report unparseable URLs as error:// markers and move on.
        try:
            parts = urlparse(url)
            query = parse_qs(parts.query)
            lhost = parts.netloc.lower()
        except Exception as e:
            yield f'error://{e!r}'
            continue

        # Pass Imgur links through untouched; unwrap camo, Google,
        # and Bing redirect parameters to recover the target URL.
        if 'imgur' in lhost:
            yield url
        elif 'camo' in lhost and 'url' in query:
            yield from query['url']
        elif 'google' in lhost and 'q' in query:
            yield from query['q']
        elif 'google' in lhost and 'imgurl' in query:
            yield from query['imgurl']
        elif 'bing' in lhost and 'mediaurl' in query:
            yield from query['mediaurl']
        else:
            yield url


def entries(source: ZipFile) -> Iterator[Tuple[str, List[str]]]:
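    """
    Yields (filename, URLs) pairs for each HTML document in an archive.
    """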
    for info in source.infolist():
        data = source.read(info)
        name = info.filename

        # Only HTML documents are scanned for URLs.
        if name.endswith('.htm') or name.endswith('.html'):
            try:
                yield name, list(clean(parse(data)))
            except Exception as e:
                yield name, [f'error://{e!r}']


def mapping(data: bytes) -> List[Tuple[str, List[str]]]:
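    """
    Collects (filename, URLs) pairs from a zipped story's raw bytes.
    """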
    with ZipFile(BytesIO(data)) as source:
        return list(entries(source))


def extract(fetcher: FimfarchiveFetcher) -> Iterator[bytes]:
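    """
    Yields the raw archive bytes of each story in the fetcher.
    """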
    for story in fetcher:
        yield story.data


if __name__ == '__main__':
    fetcher = FimfarchiveFetcher(argv[1])
    progbar = tqdm(total=len(fetcher))

    # Parse story archives on four worker processes; imap_unordered
    # streams results back in completion order.
    with Pool(4) as pool:
        loader = extract(fetcher)
        mapper = pool.imap_unordered(mapping, loader)

        for results in mapper:
            progbar.update(1)

            for name, urls in results:
                # Skip documents that produced no URLs to avoid
                # printing blank lines.
                if urls:
                    print('\n'.join(urls))
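
# Example invocation, assuming the fimfarchive package is importable
# and a release archive has been downloaded (the filename below is
# hypothetical):
#
#     python -m fimfarchive.urlizer fimfarchive-release.zip > urls.txt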