TEMPORARY COMMIT - DO NOT MERGE

commit c4bc99877a
parent a9e71a898f
Author: Joakim Soderlund
Date:   2021-08-20 23:37:40 +02:00

3 changed files with 121 additions and 0 deletions


requirements.txt

@@ -1,5 +1,6 @@
 arrow
 bbcode
+beautifulsoup4
 blinker
 flake8
 importlib_resources


setup.py

@@ -87,6 +87,7 @@ setup(
     install_requires=(
         'arrow',
         'bbcode',
+        'beautifulsoup4',
         'blinker',
         'importlib_resources',
         'jinja2',

urlizer.py (new file, 119 lines)

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""
Prints web resources referenced by stories in a Fimfarchive release.
"""
#
# Fimfarchive, preserves stories from Fimfiction.
# Copyright (C) 2023 Joakim Soderlund
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from io import BytesIO
from multiprocessing import Pool
from sys import argv
from typing import Iterator, List, Tuple
from urllib.parse import urlparse, parse_qs
from zipfile import ZipFile

from bs4 import BeautifulSoup

from fimfarchive.fetchers import FimfarchiveFetcher
from fimfarchive.utils import tqdm


def parse(data: bytes) -> Iterator[str]:
    """
    Yields image and link URLs found in an HTML document.
    """
    dom = BeautifulSoup(data, features='html.parser')

    for tag in dom.find_all('img'):
        if 'src' in tag.attrs:
            yield tag.attrs['src']

    for tag in dom.find_all('a'):
        if 'href' in tag.attrs:
            yield tag.attrs['href']


def clean(urls: Iterator[str]) -> Iterator[str]:
    """
    Strips markup residue from URLs and unwraps proxied or
    redirected URLs into their embedded targets.
    """
    for url in urls:
        # Drop trailing BBCode and HTML residue, such as '[/img]'
        # or '">', along with stray line breaks.
        url = url.split('[/')[0]
        url = url.split('">')[0]
        url = url.replace('\n', '')
        url = url.replace('\r', '')

        if not url.strip():
            continue

        try:
            parts = urlparse(url)
            query = parse_qs(parts.query)
            lhost = parts.netloc.lower()
        except Exception as e:
            yield f'error://{e!r}'
            continue

        if 'imgur' in lhost:
            yield url
        elif 'camo' in lhost and 'url' in query:
            yield from query['url']
        elif 'google' in lhost and 'q' in query:
            yield from query['q']
        elif 'google' in lhost and 'imgurl' in query:
            yield from query['imgurl']
        elif 'bing' in lhost and 'mediaurl' in query:
            yield from query['mediaurl']
        else:
            yield url
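
# Illustrative note: parse_qs percent-decodes query values, so a proxied
# URL such as (hypothetical host and path)
#
#   https://camo.example.com/abc?url=https%3A%2F%2Fexample.com%2Fpony.png
#
# is unwrapped by clean() into its embedded target:
#
#   https://example.com/pony.png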


def entries(source: ZipFile) -> Iterator[Tuple[str, List[str]]]:
    """
    Yields the name and cleaned URLs of each HTML file in an archive.
    """
    for info in source.infolist():
        name = info.filename

        if not name.endswith(('.htm', '.html')):
            continue

        try:
            yield name, list(clean(parse(source.read(info))))
        except Exception as e:
            yield name, [f'error://{e!r}']


def mapping(data: bytes) -> List[Tuple[str, List[str]]]:
    """
    Collects URLs from a story archive passed as raw bytes.

    Taking bytes rather than an open ZipFile keeps the argument
    picklable for the worker pool.
    """
    with ZipFile(BytesIO(data)) as source:
        return list(entries(source))


def extract(fetcher: FimfarchiveFetcher) -> Iterator[bytes]:
    """
    Yields the raw archive data of each story.
    """
    for story in fetcher:
        yield story.data


if __name__ == '__main__':
    fetcher = FimfarchiveFetcher(argv[1])
    progbar = tqdm(total=len(fetcher))

    # Parse story archives in parallel since HTML parsing is CPU-bound.
    with Pool(4) as pool:
        loader = extract(fetcher)
        mapper = pool.imap_unordered(mapping, loader)

        for results in mapper:
            progbar.update(1)

            for name, urls in results:
                # Skip files without URLs to avoid printing blank lines.
                if urls:
                    print('\n'.join(urls))