diff --git a/fimfarchive/converters/__init__.py b/fimfarchive/converters/__init__.py index 85abf57..719a49c 100644 --- a/fimfarchive/converters/__init__.py +++ b/fimfarchive/converters/__init__.py @@ -27,6 +27,7 @@ from .alpha_beta import AlphaBetaConverter from .fpub_epub import FpubEpubConverter from .json_fpub import JsonFpubConverter from .local_utc import LocalUtcConverter +from .web_local import WebLocalConverter __all__ = ( @@ -35,4 +36,5 @@ __all__ = ( 'FpubEpubConverter', 'JsonFpubConverter', 'LocalUtcConverter', + 'WebLocalConverter', ) diff --git a/fimfarchive/converters/web_local.py b/fimfarchive/converters/web_local.py new file mode 100644 index 0000000..6f25a76 --- /dev/null +++ b/fimfarchive/converters/web_local.py @@ -0,0 +1,114 @@ +""" +Web to local resource converter. +""" + + +# +# Fimfarchive, preserves stories from Fimfiction. +# Copyright (C) 2021 Joakim Soderlund +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
class ImageLoader:
    """
    Rewrites proxied web images in a story archive to bundled files.

    Iterating over `entries` yields every file of the source archive,
    with the image tags of HTML files pointed at local `images/` paths,
    followed by the image files that were downloaded along the way.
    """

    cache: Dict[str, str]
    images: Dict[str, bytes]
    source: ZipFile

    def __init__(self, source: ZipFile) -> None:
        """
        Constructor.

        Args:
            source: Archive to read story files from.
        """
        self.cache = dict()   # Image URL -> path within the archive.
        self.images = dict()  # Path within the archive -> image data.
        self.source = source

    @staticmethod
    def _proxied_url(src: str) -> str:
        """
        Extracts the target URL from a proxied image source.

        Args:
            src: Value of the `src` attribute of an image tag.

        Returns:
            The unquoted value following `?url=`, or an empty
            string if the source carries no such parameter.
        """
        parts = src.split('?url=')

        if len(parts) < 2:
            return ''

        return unquote(parts[1])

    def fetch(self, url: str) -> str:
        """
        Downloads an image and registers it for bundling.

        Args:
            url: Location of the image to download.

        Returns:
            The path of the image within the archive.

        Raises:
            StorySourceError: If the image could not be
                downloaded, or is not a recognized image.
        """
        try:
            response = requests.get(url, timeout=60)
        except requests.RequestException as e:
            # Timeouts and connection failures are reported the
            # same way as any other image that cannot be fetched.
            raise StorySourceError("Could not fetch image") from e

        if not response.ok:
            raise StorySourceError("Could not fetch image")

        data = response.content
        digest = sha1(data).hexdigest()

        # NOTE: imghdr is deprecated since Python 3.11 and removed in
        # Python 3.13 (PEP 594); replacing it requires a new dependency.
        extension = imghdr.what(BytesIO(data))

        if not extension:
            raise StorySourceError("Could not parse image")

        # Content addressed, so identical images share a single file.
        name = "images/{}.{}".format(digest, extension)

        self.cache[url] = name
        self.images[name] = data

        return name

    def parse(self, data: bytes) -> bytes:
        """
        Points the proxied images of an HTML document to local files.

        Image tags without a `src` attribute, or whose source has no
        `?url=` parameter, are left untouched. This also makes it safe
        to parse documents whose images have already been localized.

        Args:
            data: The HTML document to rewrite.

        Returns:
            The rewritten document, as serialized by the parser.

        Raises:
            StorySourceError: If an image could not be fetched.
        """
        dom = BeautifulSoup(data, features='html.parser')

        for tag in dom.find_all('img'):
            url = self._proxied_url(tag.attrs.get('src', ''))

            if not url:
                continue

            name = self.cache.get(url)

            if name is None:
                name = self.fetch(url)

            tag.attrs['src'] = name

        return dom.decode_contents().encode()

    def entries(self) -> Iterator[Tuple[str, bytes]]:
        """
        Yields the files of the converted archive.

        Files from the source come first, in their original order,
        followed by all images fetched while rewriting HTML files.

        Returns:
            An iterator over pairs of file name and file data.

        Raises:
            StorySourceError: If an image could not be fetched.
        """
        source = self.source

        for info in source.infolist():
            data = source.read(info)
            name = info.filename

            if name.endswith('.html'):
                yield name, self.parse(data)
            else:
                yield name, data

        # Must come last, since parsing is what populates the images.
        yield from self.images.items()
def __call__(self, story: Story) -> Story:
    """
    Bundles the web images of a story into its archive.

    Args:
        story: Story whose data is a ZIP archive.

    Returns:
        The story merged with the repacked archive as its data.
    """
    repack = BytesIO()

    # Entries are read lazily from the source, so it has to stay
    # open until the last one has been written to the target.
    with ZipFile(BytesIO(story.data)) as source:
        loader = ImageLoader(source)

        with ZipFile(repack, 'w') as target:
            for name, data in loader.entries():
                target.writestr(name, data)

    # The target must be closed before its value is complete.
    return story.merge(data=repack.getvalue())