TEMPORARY COMMIT - DO NOT MERGE

2025-02-18 19:14:21 +01:00 · 2021-08-20 23:37:40 +02:00 · 2021-08-20 23:37:40 +02:00 · 4d03a2ab99
commit 4d03a2ab99
parent c774e71960
4 changed files with 118 additions and 0 deletions
--- a/fimfarchive/converters/__init__.py
+++ b/fimfarchive/converters/__init__.py
@ -27,6 +27,7 @@ from .alpha_beta import AlphaBetaConverter
 from .fpub_epub import FpubEpubConverter
 from .json_fpub import JsonFpubConverter
 from .local_utc import LocalUtcConverter
 from .web_local import WebLocalConverter
 __all__ = (
@ -35,4 +36,5 @@ __all__ = (
    'FpubEpubConverter',
    'JsonFpubConverter',
    'LocalUtcConverter',
    'WebLocalConverter',
 )
--- a/fimfarchive/converters/web_local.py
+++ b/fimfarchive/converters/web_local.py
@ -0,0 +1,114 @@
 """
 Web to local resource converter.
 """
 #
 # Fimfarchive, preserves stories from Fimfiction.
 # Copyright (C) 2021  Joakim Soderlund
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 import imghdr
 from hashlib import sha1
 from io import BytesIO
 from typing import Dict, Iterator, Tuple
 from urllib.parse import unquote
 from zipfile import ZipFile
 import requests
 from bs4 import BeautifulSoup
 from fimfarchive.exceptions import StorySourceError
 from fimfarchive.stories import Story
 from .base import Converter
 class ImageLoader:
    """
    Replaces proxied web images in an archive with local copies.

    HTML entries of the source archive are scanned for image tags
    whose source carries the original address in a `?url=` query.
    Each such image is downloaded once and stored under a name
    derived from the SHA-1 digest of its content.
    """

    # Maps an image URL to the archive name it was stored under.
    cache: Dict[str, str]
    # Maps an archive name to the downloaded image content.
    images: Dict[str, bytes]
    # Archive whose entries are read and rewritten.
    source: ZipFile

    def __init__(self, source: ZipFile) -> None:
        """
        Constructor.

        Args:
            source: Archive to read entries from.
        """
        self.cache = {}
        self.images = {}
        self.source = source

    def fetch(self, url: str) -> str:
        """
        Downloads an image and registers it as a local resource.

        Args:
            url: Address of the image to download.

        Returns:
            The archive name assigned to the image.

        Raises:
            StorySourceError: If the response is unsuccessful,
                or if the content is not a recognized image type.
        """
        response = requests.get(url, timeout=60)

        if not response.ok:
            raise StorySourceError("Could not fetch image")

        data = response.content
        digest = sha1(data).hexdigest()

        # NOTE(review): imghdr is deprecated since Python 3.11 and was
        # removed in 3.13; this needs a replacement before upgrading.
        extension = imghdr.what(BytesIO(data))

        if not extension:
            raise StorySourceError("Could not parse image")

        # Content addressing makes identical images share one entry.
        name = "images/{}.{}".format(digest, extension)

        self.cache[url] = name
        self.images[name] = data

        return name

    def parse(self, data: bytes) -> bytes:
        """
        Rewrites proxied image sources in an HTML document.

        Image tags without a source, or whose source lacks the
        `?url=` marker, are left exactly as they were.

        Args:
            data: The HTML document to rewrite.

        Returns:
            The re-serialized document, encoded as bytes.

        Raises:
            StorySourceError: If an image could not be fetched.
        """
        marker = '?url='
        dom = BeautifulSoup(data, features='html.parser')

        for tag in dom.find_all('img'):
            src = tag.attrs.get('src')

            # Nothing to unwrap for missing, local, or direct sources;
            # indexing blindly here would abort the whole conversion.
            if not src or marker not in src:
                continue

            url = unquote(src.split(marker)[1])
            local = self.cache.get(url)

            if local is None:
                local = self.fetch(url)

            tag.attrs['src'] = local

        return dom.decode_contents().encode()

    def entries(self) -> Iterator[Tuple[str, bytes]]:
        """
        Yields the entries of the rewritten archive.

        Entries of the source archive come first, in their original
        order, with names ending in `.html` passed through `parse`.
        The collected images follow, since they are only known once
        the HTML entries have been processed.

        Returns:
            An iterator over pairs of entry name and entry content.
        """
        source = self.source

        for info in source.infolist():
            name = info.filename
            data = source.read(info)

            if name.endswith('.html'):
                data = self.parse(data)

            yield name, data

        yield from self.images.items()
 class WebLocalConverter(Converter):
    """
    Converts web resources to local.
    """

    def __call__(self, story: Story) -> Story:
        """
        Repacks the story archive with web images stored locally.

        Args:
            story: The story whose data is a ZIP archive.

        Returns:
            The story merged with the repacked archive data.
        """
        repack = BytesIO()

        # The loader reads entries lazily, so the source archive must
        # remain open until every entry has been written to the target.
        with ZipFile(BytesIO(story.data)) as source:
            loader = ImageLoader(source)

            with ZipFile(repack, 'w') as target:
                for name, data in loader.entries():
                    target.writestr(name, data)

        # The target must be closed before reading the buffer, as the
        # archive is only finalized when the target is closed.
        return story.merge(data=repack.getvalue())
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,6 @@
 arrow
 bbcode
 beautifulsoup4
 blinker
 flake8
 importlib_resources
--- a/setup.py
+++ b/setup.py
@ -87,6 +87,7 @@ setup(
    install_requires=(
        'arrow',
        'bbcode',
        'beautifulsoup4',
        'blinker',
        'importlib_resources',
        'jinja2',