TEMPORARY COMMIT - DO NOT MERGE

Joakim Soderlund 2021-08-20 23:37:40 +02:00
parent c774e71960
commit 4d03a2ab99
4 changed files with 118 additions and 0 deletions

@@ -27,6 +27,7 @@ from .alpha_beta import AlphaBetaConverter
from .fpub_epub import FpubEpubConverter
from .json_fpub import JsonFpubConverter
from .local_utc import LocalUtcConverter
from .web_local import WebLocalConverter


__all__ = (
@@ -35,4 +36,5 @@ __all__ = (
    'FpubEpubConverter',
    'JsonFpubConverter',
    'LocalUtcConverter',
    'WebLocalConverter',
)

@@ -0,0 +1,114 @@
"""
Web to local resource converter.
"""
#
# Fimfarchive, preserves stories from Fimfiction.
# Copyright (C) 2021 Joakim Soderlund
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import imghdr
from hashlib import sha1
from io import BytesIO
from typing import Dict, Iterator, Tuple
from urllib.parse import unquote
from zipfile import ZipFile

import requests
from bs4 import BeautifulSoup

from fimfarchive.exceptions import StorySourceError
from fimfarchive.stories import Story

from .base import Converter


class ImageLoader:
    """
    Fetches remote images and maps their URLs to local archive paths.
    """
    cache: Dict[str, str]
    images: Dict[str, bytes]
    source: ZipFile

    def __init__(self, source: ZipFile) -> None:
        self.cache = dict()
        self.images = dict()
        self.source = source

    def fetch(self, url: str) -> str:
        """
        Downloads an image and returns its local archive path.
        """
        response = requests.get(url, timeout=60)

        if not response.ok:
            raise StorySourceError("Could not fetch image")

        data = response.content
        digest = sha1(data).hexdigest()
        extension = imghdr.what(BytesIO(data))

        if not extension:
            raise StorySourceError("Could not parse image")

        # Name the file after its content hash so repeated images deduplicate.
        name = "images/{}.{}".format(digest, extension)
        self.cache[url] = name
        self.images[name] = data

        return name

    def parse(self, data: bytes) -> bytes:
        """
        Rewrites remote image sources in a chapter to local paths.
        """
        dom = BeautifulSoup(data, features='html.parser')

        for tag in dom.find_all('img'):
            src = tag.attrs['src']
            # The original address is URL-encoded in the src's ?url= parameter.
            url = unquote(src.split('?url=')[1])

            if url in self.cache:
                tag.attrs['src'] = self.cache[url]
            else:
                tag.attrs['src'] = self.fetch(url)

        return dom.decode_contents().encode()

    def entries(self) -> Iterator[Tuple[str, bytes]]:
        """
        Yields each archive entry, parsing chapters and appending fetched images.
        """
        source = self.source

        for info in source.infolist():
            data = source.read(info)
            name = info.filename

            if name.endswith('.html'):
                yield name, self.parse(data)
            else:
                yield name, data

        yield from self.images.items()


class WebLocalConverter(Converter):
    """
    Converts web resources to local.
    """

    def __call__(self, story: Story) -> Story:
        source = ZipFile(BytesIO(story.data))
        loader = ImageLoader(source)
        repack = BytesIO()

        with ZipFile(repack, 'w') as target:
            for name, data in loader.entries():
                target.writestr(name, data)

        return story.merge(data=repack.getvalue())
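
A minimal usage sketch (not part of this commit), assuming story is an existing fimfarchive Story whose data attribute holds EPUB bytes containing remote image tags; the converter returns a new Story with those images fetched into the archive:

    # Hypothetical example; story comes from an existing fetcher or archive.
    from fimfarchive.converters import WebLocalConverter

    converter = WebLocalConverter()
    local_story = converter(story)  # img sources now point at images/<sha1>.<ext>

    with open('local.epub', 'wb') as fobj:
        fobj.write(local_story.data)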

@@ -1,5 +1,6 @@
arrow
bbcode
beautifulsoup4
blinker
flake8
importlib_resources

@@ -87,6 +87,7 @@ setup(
    install_requires=(
        'arrow',
        'bbcode',
        'beautifulsoup4',
        'blinker',
        'importlib_resources',
        'jinja2',