TEMPORARY COMMIT - DO NOT MERGE

2025-03-18 17:27:12 +01:00 · 2021-08-20 23:37:40 +02:00 · 2021-08-20 23:37:40 +02:00 · c4bc99877a
commit c4bc99877a
parent a9e71a898f
3 changed files with 121 additions and 0 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,6 @@
 arrow
 bbcode
+beautifulsoup4
 blinker
 flake8
 importlib_resources
--- a/setup.py
+++ b/setup.py
@ -87,6 +87,7 @@ setup(
    install_requires=(
        'arrow',
        'bbcode',
+        'beautifulsoup4',
        'blinker',
        'importlib_resources',
        'jinja2',
--- a/urlizer.py
+++ b/urlizer.py
@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""
+Web resource printer.
+"""
+
+
+#
+# Fimfarchive, preserves stories from Fimfiction.
+# Copyright (C) 2023  Joakim Soderlund
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+
+from io import BytesIO
+from json import dumps
+from multiprocessing import Pool
+from sys import argv
+from typing import Dict, Iterator, List, Tuple
+from urllib.parse import urlparse, parse_qs
+from zipfile import ZipFile
+
+from bs4 import BeautifulSoup
+
+from fimfarchive.fetchers import FimfarchiveFetcher
+from fimfarchive.stories import Story
+from fimfarchive.utils import tqdm
+
+
+def parse(data: bytes) -> Iterator[str]:
+    """
+    Yields the link targets found in an HTML document.
+
+    The `src` attribute of every `img` tag is yielded first, followed
+    by the `href` attribute of every `a` tag. Tags which lack the
+    attribute in question are skipped.
+
+    Args:
+        data: Raw HTML markup.
+
+    Returns:
+        An iterator over the attribute values, as found in the markup.
+    """
+    dom = BeautifulSoup(data, features='html.parser')
+
+    # Order matters: all images are reported before any anchors.
+    for element, attribute in (('img', 'src'), ('a', 'href')):
+        for tag in dom.find_all(element):
+            attrs = tag.attrs
+
+            if attribute in attrs:
+                yield attrs[attribute]
+
+
+def clean(urls: Iterator[str]) -> Iterator[str]:
+    """
+    Tidies up raw link targets and unwraps known redirect URLs.
+
+    Each URL is cut off at the first `[/` or `">` sequence, and has its
+    line breaks removed. URLs that are blank afterwards are dropped.
+    If the URL cannot be parsed, an `error://` entry containing the
+    repr of the exception is yielded in its place.
+
+    Hosts containing `imgur` are yielded unchanged. For hosts matching
+    a redirect rule, the values of the matching query parameter are
+    yielded instead of the URL. Everything else is yielded unchanged.
+
+    Args:
+        urls: Raw link targets.
+
+    Returns:
+        An iterator over the cleaned URLs.
+    """
+    # Pairs of (host fragment, query parameter holding the real target).
+    # Only the first matching rule applies, so the order is significant.
+    rules = (
+        ('camo', 'url'),
+        ('google', 'q'),
+        ('google', 'imgurl'),
+        ('bing', 'mediaurl'),
+    )
+
+    for raw in urls:
+        url = raw.partition('[/')[0].partition('">')[0]
+        url = url.replace('\n', '').replace('\r', '')
+
+        if not url.strip():
+            continue
+
+        try:
+            parsed = urlparse(url)
+            params = parse_qs(parsed.query)
+            host = parsed.netloc.lower()
+        except Exception as e:
+            yield f'error://{e!r}'
+            continue
+
+        # Checked before the rules, so these hosts are never unwrapped.
+        if 'imgur' in host:
+            yield url
+            continue
+
+        for fragment, key in rules:
+            if fragment in host and key in params:
+                yield from params[key]
+                break
+        else:
+            yield url
+
+
+def entries(source: ZipFile) -> Iterator[Tuple[str, List[str]]]:
+    """
+    Yields the URLs referenced by each HTML file in an archive.
+
+    Only members whose names end in `.htm` or `.html` are read, so
+    other members are never decompressed. A failure to parse or clean
+    a document is reported as a single `error://` entry for that file
+    instead of being raised.
+
+    Args:
+        source: Open ZIP archive to scan.
+
+    Returns:
+        An iterator over (file name, list of URLs) pairs.
+    """
+    for info in source.infolist():
+        name = info.filename
+
+        if not name.endswith(('.htm', '.html')):
+            continue
+
+        # Reading is deliberately left unguarded. A member that cannot
+        # be read is an archive problem, not a markup problem.
+        data = source.read(info)
+
+        try:
+            urls = list(clean(parse(data)))
+        except Exception as e:
+            urls = [f'error://{e!r}']
+
+        # Yield outside of the try block so that exceptions thrown into
+        # the generator by its consumer are not reported as parse errors.
+        yield name, urls
+
+
+def mapping(data: bytes) -> List[Tuple[str, List[str]]]:
+    """
+    Collects the URLs referenced by the HTML files of a ZIP archive.
+
+    The result is fully materialized before the archive is closed.
+
+    Args:
+        data: Raw bytes of the ZIP archive.
+
+    Returns:
+        A list of (file name, list of URLs) pairs.
+    """
+    buffer = BytesIO(data)
+
+    with ZipFile(buffer) as archive:
+        return [*entries(archive)]
+
+
+def extract(fetcher: FimfarchiveFetcher) -> Iterator[bytes]:
+    """
+    Yields the `data` attribute of each story produced by a fetcher.
+
+    Args:
+        fetcher: Fetcher to iterate over.
+
+    Returns:
+        An iterator over the data of each story.
+    """
+    # Kept as a generator function so that the fetcher is not touched
+    # until the first item is requested.
+    yield from (story.data for story in fetcher)
+
+
+if __name__ == '__main__':
+    # The first command line argument is handed to the fetcher as is.
+    fetcher = FimfarchiveFetcher(argv[1])
+    progbar = tqdm(total=len(fetcher))
+
+    with Pool(4) as pool:
+        # Results arrive in completion order, not in fetcher order.
+        for results in pool.imap_unordered(mapping, extract(fetcher)):
+            # One tick per story, regardless of how many files it holds.
+            progbar.update(1)
+
+            for _, urls in results:
+                print("\n".join(urls))