Update FimfarchiveFetcher for beta format

This commit is contained in:
Joakim Soderlund 2017-12-05 23:27:52 +01:00
parent 0b8b8182a9
commit f3b8942c65
4 changed files with 134 additions and 63 deletions

View file

@ -22,15 +22,16 @@ Fimfarchive fetcher.
# #
import codecs
import gc
import json import json
from copy import deepcopy from typing import cast, Any, Dict, IO, Iterable, Optional, Tuple, Union
from io import BytesIO
from zipfile import ZipFile, BadZipFile from zipfile import ZipFile, BadZipFile
from boltons.cacheutils import LRU
from jmespath import compile as jmes
from fimfarchive.exceptions import InvalidStoryError, StorySourceError from fimfarchive.exceptions import InvalidStoryError, StorySourceError
from fimfarchive.flavors import StorySource, DataFormat, MetaPurity from fimfarchive.flavors import StorySource, DataFormat, MetaPurity
from fimfarchive.utils import Empty
from .base import Fetcher from .base import Fetcher
@ -40,7 +41,7 @@ __all__ = (
) )
StreamReader = codecs.getreader('utf-8') PATH = jmes('archive.path || path')
class FimfarchiveFetcher(Fetcher): class FimfarchiveFetcher(Fetcher):
@ -48,7 +49,7 @@ class FimfarchiveFetcher(Fetcher):
Fetcher for Fimfarchive. Fetcher for Fimfarchive.
""" """
prefetch_meta = True prefetch_meta = True
prefetch_data = True prefetch_data = False
flavors = frozenset(( flavors = frozenset((
StorySource.FIMFARCHIVE, StorySource.FIMFARCHIVE,
@ -56,106 +57,175 @@ class FimfarchiveFetcher(Fetcher):
MetaPurity.CLEAN, MetaPurity.CLEAN,
)) ))
def __init__(self, file): def __init__(self, source: Union[str, IO[bytes]]) -> None:
""" """
Initializes a `FimfarchiveFetcher` instance. Constructor.
Args: Args:
file: Path or file-like object for a Fimfarchive release. source: Path or file-like object for a Fimfarchive release.
Raises: Raises:
StorySourceError: If no valid Fimfarchive release can be loaded. StorySourceError: If no valid Fimfarchive release can be loaded.
""" """
self.is_open = False self.archive: ZipFile
self.archive = None self.index: Dict[int, str]
self.index = None self.paths: Dict[int, str]
self.is_open: bool = False
try: try:
self._init(file) self.initialize(source)
except Exception: except Exception:
self.close() self.close()
raise raise
else:
self.is_open = True
def _init(self, file): def initialize(self, source: Union[str, IO[bytes]]) -> None:
""" """
Internal initialization method. Internal initialization method.
Args:
source: Path or file-like object for a Fimfarchive release.
Raises:
StorySourceError: If no valid Fimfarchive release can be loaded.
""" """
try: try:
self.archive = ZipFile(file) self.archive = ZipFile(source)
except IOError as e: except IOError as e:
raise StorySourceError("Could not read from file.") from e raise StorySourceError("Could not read from source.") from e
except BadZipFile as e: except BadZipFile as e:
raise StorySourceError("Archive is not a valid ZIP-file.") from e raise StorySourceError("Source is not a valid ZIP-file.") from e
try: try:
with self.archive.open('index.json') as fobj: with self.archive.open('index.json') as fobj:
self.index = json.load(StreamReader(fobj)) self.index = dict(self.load_index(fobj))
except KeyError as e: except KeyError as e:
raise StorySourceError("Archive is missing the index.") from e raise StorySourceError("Archive is missing the index.") from e
except ValueError as e:
raise StorySourceError("Index is not valid JSON.") from e
except UnicodeDecodeError as e:
raise StorySourceError("Index is incorrectly encoded.") from e
except BadZipFile as e: except BadZipFile as e:
raise StorySourceError("Archive is corrupt.") from e raise StorySourceError("Archive is corrupt.") from e
gc.collect() self.paths = LRU()
self.is_open = True
def close(self): def load_index(self, source: IO[bytes]) -> Iterable[Tuple[int, str]]:
self.is_open = False
self.index = None
if self.archive is not None:
self.archive.close()
self.archive = None
gc.collect()
def lookup(self, key):
""" """
Finds meta for a story in the index. Yields unparsed index items from a byte stream.
Args:
source: The stream to read from.
Returns:
An iterable over index items.
Raises:
StorySourceError: If an item is malformed.
"""
for part in source:
if len(part) < 3:
continue
try:
line = part.decode().strip()
except UnicodeDecodeError as e:
raise StorySourceError("Incorrectly encoded index.") from e
key, meta = line.split(':', 1)
key = key.strip(' "')
meta = meta.strip(' ,')
if meta[0] != '{' or meta[-1] != '}':
raise StorySourceError(f"Malformed index meta: {meta}")
try:
yield int(key), meta
except ValueError as e:
raise StorySourceError(f"Malformed index key: {key}") from e
def validate(self, key: int) -> int:
"""
Ensures that the key matches a valid story
Args: Args:
key: Primary key of the story. key: Primary key of the story.
Returns: Returns:
dict: A reference to the story's meta. The key as cast to an int.
Raises: Raises:
InvalidStoryError: If story does not exist. InvalidStoryError: If a valid story is not found.
StorySourceError: If archive is closed. StorySourceError: If the fetcher is closed.
""" """
key = int(key)
if not self.is_open: if not self.is_open:
raise StorySourceError("Fetcher is closed.") raise StorySourceError("Fetcher is closed.")
key = str(key)
if key not in self.index: if key not in self.index:
raise InvalidStoryError("Story does not exist.") raise InvalidStoryError(f"No such story: {key}")
return self.index[key] return key
def fetch_data(self, key): def fetch_path(self, key: int) -> Optional[str]:
meta = self.lookup(key) """
Fetches the archive path of a story.
if 'path' not in meta: Args:
raise StorySourceError("Index is missing a path value.") key: Primary key of the story.
Returns:
A path to the story, or None.
Raises:
InvalidStoryError: If a valid story is not found.
StorySourceError: If the fetcher is closed.
"""
key = self.validate(key)
path = self.paths.get(key, Empty)
if path is not Empty:
return cast(Optional[str], path)
meta = self.fetch_meta(key)
return PATH.search(meta)
def close(self) -> None:
self.is_open = False
self.index = None
self.paths = None
if self.archive is not None:
self.archive.close()
self.archive = None
def fetch_meta(self, key: int) -> Dict[str, Any]:
key = self.validate(key)
raw = self.index[key]
try: try:
data = self.archive.read(meta['path']) meta = json.loads(raw)
except ValueError as e: except ValueError as e:
raise StorySourceError("Archive is missing a file.") from e raise StorySourceError("Malformed meta for {key}: {raw}") from e
except BadZipFile as e:
raise StorySourceError("Archive is corrupt.") from e
with ZipFile(BytesIO(data)) as story: actual = meta.get('id')
if story.testzip() is not None:
raise StorySourceError("Story is corrupt.") if key != actual:
raise StorySourceError("Invalid ID for {key}: {actual}")
self.paths[key] = PATH.search(meta)
return meta
def fetch_data(self, key: int) -> bytes:
key = self.validate(key)
path = self.fetch_path(key)
if not path:
raise StorySourceError("Missing path attribute for {key}.")
try:
data = self.archive.read(path)
except ValueError as e:
raise StorySourceError("Missing file for {key}: {path}") from e
except BadZipFile as e:
raise StorySourceError("Corrupt file for {key}: {path}") from e
return data return data
def fetch_meta(self, key):
meta = self.lookup(key)
return deepcopy(meta)

View file

@ -1,6 +1,7 @@
arrow arrow
bbcode bbcode
blinker blinker
boltons
flake8 flake8
jmespath jmespath
mypy mypy

View file

@ -88,6 +88,7 @@ setup(
'arrow', 'arrow',
'bbcode', 'bbcode',
'blinker', 'blinker',
'boltons',
'jmespath', 'jmespath',
'requests', 'requests',
), ),

View file

@ -31,7 +31,7 @@ from fimfarchive.fetchers import FimfarchiveFetcher
VALID_STORY_KEY = 9 VALID_STORY_KEY = 9
INVALID_STORY_KEY = 7 INVALID_STORY_KEY = 7
FIMFARCHIVE_PATH = 'fimfarchive-20170601.zip' FIMFARCHIVE_PATH = 'fimfarchive-20171203.zip'
class TestFimfarchiveFetcher: class TestFimfarchiveFetcher:
@ -52,10 +52,10 @@ class TestFimfarchiveFetcher:
Tests `StorySourceError` is raised when fetcher is closed. Tests `StorySourceError` is raised when fetcher is closed.
""" """
with FimfarchiveFetcher(FIMFARCHIVE_PATH) as fetcher: with FimfarchiveFetcher(FIMFARCHIVE_PATH) as fetcher:
fetcher.lookup(VALID_STORY_KEY) fetcher.fetch_meta(VALID_STORY_KEY)
with pytest.raises(StorySourceError): with pytest.raises(StorySourceError):
fetcher.lookup(VALID_STORY_KEY) fetcher.fetch_meta(VALID_STORY_KEY)
def test_fetch_meta_for_valid_story(self, fetcher): def test_fetch_meta_for_valid_story(self, fetcher):
""" """
@ -63,7 +63,6 @@ class TestFimfarchiveFetcher:
""" """
meta = fetcher.fetch_meta(VALID_STORY_KEY) meta = fetcher.fetch_meta(VALID_STORY_KEY)
assert meta['id'] == VALID_STORY_KEY assert meta['id'] == VALID_STORY_KEY
assert meta['words'] != 0
def test_fetch_meta_for_invalid_story(self, fetcher): def test_fetch_meta_for_invalid_story(self, fetcher):
""" """