TEMPORARY COMMIT - DO NOT MERGE

This commit is contained in:
Joakim Soderlund 2021-08-20 23:37:40 +02:00
parent a9e71a898f
commit 3be4c635b5
4 changed files with 118 additions and 0 deletions

View file

@ -27,6 +27,7 @@ from .alpha_beta import AlphaBetaConverter
from .fpub_epub import FpubEpubConverter
from .json_fpub import JsonFpubConverter
from .local_utc import LocalUtcConverter
from .web_local import WebLocalConverter
__all__ = (
@ -35,4 +36,5 @@ __all__ = (
'FpubEpubConverter',
'JsonFpubConverter',
'LocalUtcConverter',
'WebLocalConverter',
)

View file

@ -0,0 +1,114 @@
"""
Web to local resource converter.
"""
#
# Fimfarchive, preserves stories from Fimfiction.
# Copyright (C) 2021 Joakim Soderlund
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import imghdr
from hashlib import sha1
from io import BytesIO
from typing import Dict, Iterator, Tuple
from urllib.parse import unquote
from zipfile import ZipFile
import requests
from bs4 import BeautifulSoup
from fimfarchive.exceptions import StorySourceError
from fimfarchive.stories import Story
from .base import Converter
class ImageLoader:
    """
    Rewrites proxied images in the HTML files of an archive to local files.

    Every `<img>` whose `src` carries its real address in a `?url=` query
    (presumably an image proxy; confirm against the story sources) is
    downloaded and stored as `images/<sha1 hex digest>.<extension>`.
    """

    # Marker separating the proxy prefix from the quoted image address.
    marker = '?url='

    cache: Dict[str, str]
    images: Dict[str, bytes]
    source: ZipFile
    timeout: float

    def __init__(self, source: ZipFile, timeout: float = 60) -> None:
        """
        Constructor.

        Args:
            source: Archive whose entries should be processed.
            timeout: Seconds to wait for each image request.
        """
        # Maps fetched image address to its name inside the archive.
        self.cache = dict()
        # Maps archive name to the raw bytes of the fetched image.
        self.images = dict()
        self.source = source
        self.timeout = timeout

    @staticmethod
    def _proxied_url(src) -> str:
        """
        Extracts the real image address from a proxied `src` value.

        Args:
            src: Value of the `src` attribute, or a false value if absent.

        Returns:
            The unquoted address following the first `?url=` marker, or
            an empty string if `src` is missing or contains no marker.
        """
        if not src:
            return ''

        _, found, quoted = src.partition(ImageLoader.marker)

        return unquote(quoted) if found else ''

    def fetch(self, url: str) -> str:
        """
        Downloads an image and registers it under a content-derived name.

        Args:
            url: Address of the image to download.

        Returns:
            Archive name of the image, `images/<sha1>.<extension>`.

        Raises:
            StorySourceError: If the request fails, the response is not
                OK, or the payload is not a recognized image format.
        """
        try:
            response = requests.get(url, timeout=self.timeout)
        except requests.RequestException as e:
            # Transport failures must surface like any other fetch failure.
            raise StorySourceError("Could not fetch image") from e

        if not response.ok:
            raise StorySourceError("Could not fetch image")

        data = response.content
        digest = sha1(data).hexdigest()

        # NOTE(review): imghdr is deprecated by PEP 594 and removed in
        # Python 3.13; a replacement is needed before upgrading.
        extension = imghdr.what(BytesIO(data))

        if not extension:
            raise StorySourceError("Could not parse image")

        name = "images/{}.{}".format(digest, extension)

        self.cache[url] = name
        self.images[name] = data

        return name

    def parse(self, data: bytes) -> bytes:
        """
        Points the proxied images of an HTML document at local files.

        Image tags without a `src`, or whose `src` has no `?url=` marker
        (for instance already localized ones), are left untouched.

        Args:
            data: Raw HTML document.

        Returns:
            The re-serialized document, encoded with the default codec.

        Raises:
            StorySourceError: If an image could not be fetched or parsed.
        """
        dom = BeautifulSoup(data, features='html.parser')

        for tag in dom.find_all('img'):
            url = self._proxied_url(tag.attrs.get('src'))

            if not url:
                continue

            # Each distinct address is downloaded at most once per loader.
            name = self.cache.get(url)

            if name is None:
                name = self.fetch(url)

            tag.attrs['src'] = name

        return dom.decode_contents().encode()

    def entries(self) -> Iterator[Tuple[str, bytes]]:
        """
        Yields the name and content of each entry for a repacked archive.

        Entries of the source archive come first, in their original order,
        with names ending in `.html` passed through `parse`. The images
        collected so far are yielded last.

        Returns:
            An iterator over `(name, data)` pairs.
        """
        source = self.source

        for info in source.infolist():
            data = source.read(info)
            name = info.filename

            if name.endswith('.html'):
                data = self.parse(data)

            yield name, data

        # Must come last; images are only known once all HTML is parsed.
        yield from self.images.items()
class WebLocalConverter(Converter):
    """
    Converts web resources to local.
    """

    def __call__(self, story: Story) -> Story:
        """
        Repacks the story archive with images stored inside it.

        Args:
            story: Story whose `data` is a ZIP archive.

        Returns:
            The result of merging the repacked archive into the story.
        """
        repack = BytesIO()

        # Context managers make sure both archives are closed, and that
        # the target is finalized before its bytes are read back.
        with ZipFile(BytesIO(story.data)) as source:
            loader = ImageLoader(source)

            with ZipFile(repack, 'w') as target:
                for name, data in loader.entries():
                    target.writestr(name, data)

        return story.merge(data=repack.getvalue())

View file

@ -1,5 +1,6 @@
arrow
bbcode
beautifulsoup4
blinker
flake8
importlib_resources

View file

@ -87,6 +87,7 @@ setup(
install_requires=(
'arrow',
'bbcode',
'beautifulsoup4',
'blinker',
'importlib_resources',
'jinja2',