mirror of
https://github.com/JockeTF/fimfarchive.git
synced 2024-11-22 05:17:59 +01:00
Update FimfarchiveFetcher for beta format
This commit is contained in:
parent
0b8b8182a9
commit
f3b8942c65
4 changed files with 134 additions and 63 deletions
|
@ -22,15 +22,16 @@ Fimfarchive fetcher.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
import codecs
|
|
||||||
import gc
|
|
||||||
import json
|
import json
|
||||||
from copy import deepcopy
|
from typing import cast, Any, Dict, IO, Iterable, Optional, Tuple, Union
|
||||||
from io import BytesIO
|
|
||||||
from zipfile import ZipFile, BadZipFile
|
from zipfile import ZipFile, BadZipFile
|
||||||
|
|
||||||
|
from boltons.cacheutils import LRU
|
||||||
|
from jmespath import compile as jmes
|
||||||
|
|
||||||
from fimfarchive.exceptions import InvalidStoryError, StorySourceError
|
from fimfarchive.exceptions import InvalidStoryError, StorySourceError
|
||||||
from fimfarchive.flavors import StorySource, DataFormat, MetaPurity
|
from fimfarchive.flavors import StorySource, DataFormat, MetaPurity
|
||||||
|
from fimfarchive.utils import Empty
|
||||||
|
|
||||||
from .base import Fetcher
|
from .base import Fetcher
|
||||||
|
|
||||||
|
@ -40,7 +41,7 @@ __all__ = (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
StreamReader = codecs.getreader('utf-8')
|
PATH = jmes('archive.path || path')
|
||||||
|
|
||||||
|
|
||||||
class FimfarchiveFetcher(Fetcher):
|
class FimfarchiveFetcher(Fetcher):
|
||||||
|
@ -48,7 +49,7 @@ class FimfarchiveFetcher(Fetcher):
|
||||||
Fetcher for Fimfarchive.
|
Fetcher for Fimfarchive.
|
||||||
"""
|
"""
|
||||||
prefetch_meta = True
|
prefetch_meta = True
|
||||||
prefetch_data = True
|
prefetch_data = False
|
||||||
|
|
||||||
flavors = frozenset((
|
flavors = frozenset((
|
||||||
StorySource.FIMFARCHIVE,
|
StorySource.FIMFARCHIVE,
|
||||||
|
@ -56,106 +57,175 @@ class FimfarchiveFetcher(Fetcher):
|
||||||
MetaPurity.CLEAN,
|
MetaPurity.CLEAN,
|
||||||
))
|
))
|
||||||
|
|
||||||
def __init__(self, file):
|
def __init__(self, source: Union[str, IO[bytes]]) -> None:
|
||||||
"""
|
"""
|
||||||
Initializes a `FimfarchiveFetcher` instance.
|
Constructor.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file: Path or file-like object for a Fimfarchive release.
|
source: Path or file-like object for a Fimfarchive release.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
StorySourceError: If no valid Fimfarchive release can be loaded.
|
StorySourceError: If no valid Fimfarchive release can be loaded.
|
||||||
"""
|
"""
|
||||||
self.is_open = False
|
self.archive: ZipFile
|
||||||
self.archive = None
|
self.index: Dict[int, str]
|
||||||
self.index = None
|
self.paths: Dict[int, str]
|
||||||
|
self.is_open: bool = False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._init(file)
|
self.initialize(source)
|
||||||
except Exception:
|
except Exception:
|
||||||
self.close()
|
self.close()
|
||||||
raise
|
raise
|
||||||
else:
|
|
||||||
self.is_open = True
|
|
||||||
|
|
||||||
def _init(self, file):
|
def initialize(self, source: Union[str, IO[bytes]]) -> None:
|
||||||
"""
|
"""
|
||||||
Internal initialization method.
|
Internal initialization method.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
source: Path or file-like object for a Fimfarchive release.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
StorySourceError: If no valid Fimfarchive release can be loaded.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
self.archive = ZipFile(file)
|
self.archive = ZipFile(source)
|
||||||
except IOError as e:
|
except IOError as e:
|
||||||
raise StorySourceError("Could not read from file.") from e
|
raise StorySourceError("Could not read from source.") from e
|
||||||
except BadZipFile as e:
|
except BadZipFile as e:
|
||||||
raise StorySourceError("Archive is not a valid ZIP-file.") from e
|
raise StorySourceError("Source is not a valid ZIP-file.") from e
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with self.archive.open('index.json') as fobj:
|
with self.archive.open('index.json') as fobj:
|
||||||
self.index = json.load(StreamReader(fobj))
|
self.index = dict(self.load_index(fobj))
|
||||||
except KeyError as e:
|
except KeyError as e:
|
||||||
raise StorySourceError("Archive is missing the index.") from e
|
raise StorySourceError("Archive is missing the index.") from e
|
||||||
except ValueError as e:
|
|
||||||
raise StorySourceError("Index is not valid JSON.") from e
|
|
||||||
except UnicodeDecodeError as e:
|
|
||||||
raise StorySourceError("Index is incorrectly encoded.") from e
|
|
||||||
except BadZipFile as e:
|
except BadZipFile as e:
|
||||||
raise StorySourceError("Archive is corrupt.") from e
|
raise StorySourceError("Archive is corrupt.") from e
|
||||||
|
|
||||||
gc.collect()
|
self.paths = LRU()
|
||||||
|
self.is_open = True
|
||||||
|
|
||||||
def close(self):
|
def load_index(self, source: IO[bytes]) -> Iterable[Tuple[int, str]]:
|
||||||
self.is_open = False
|
|
||||||
self.index = None
|
|
||||||
|
|
||||||
if self.archive is not None:
|
|
||||||
self.archive.close()
|
|
||||||
self.archive = None
|
|
||||||
|
|
||||||
gc.collect()
|
|
||||||
|
|
||||||
def lookup(self, key):
|
|
||||||
"""
|
"""
|
||||||
Finds meta for a story in the index.
|
Yields unparsed index items from a byte stream.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
source: The stream to read from.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
An iterable over index items.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
StorySourceError: If an item is malformed.
|
||||||
|
"""
|
||||||
|
for part in source:
|
||||||
|
if len(part) < 3:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
line = part.decode().strip()
|
||||||
|
except UnicodeDecodeError as e:
|
||||||
|
raise StorySourceError("Incorrectly encoded index.") from e
|
||||||
|
|
||||||
|
key, meta = line.split(':', 1)
|
||||||
|
key = key.strip(' "')
|
||||||
|
meta = meta.strip(' ,')
|
||||||
|
|
||||||
|
if meta[0] != '{' or meta[-1] != '}':
|
||||||
|
raise StorySourceError(f"Malformed index meta: {meta}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
yield int(key), meta
|
||||||
|
except ValueError as e:
|
||||||
|
raise StorySourceError(f"Malformed index key: {key}") from e
|
||||||
|
|
||||||
|
def validate(self, key: int) -> int:
|
||||||
|
"""
|
||||||
|
Ensures that the key matches a valid story
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
key: Primary key of the story.
|
key: Primary key of the story.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: A reference to the story's meta.
|
The key as cast to an int.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
InvalidStoryError: If story does not exist.
|
InvalidStoryError: If a valid story is not found.
|
||||||
StorySourceError: If archive is closed.
|
StorySourceError: If the fetcher is closed.
|
||||||
"""
|
"""
|
||||||
|
key = int(key)
|
||||||
|
|
||||||
if not self.is_open:
|
if not self.is_open:
|
||||||
raise StorySourceError("Fetcher is closed.")
|
raise StorySourceError("Fetcher is closed.")
|
||||||
|
|
||||||
key = str(key)
|
|
||||||
|
|
||||||
if key not in self.index:
|
if key not in self.index:
|
||||||
raise InvalidStoryError("Story does not exist.")
|
raise InvalidStoryError(f"No such story: {key}")
|
||||||
|
|
||||||
return self.index[key]
|
return key
|
||||||
|
|
||||||
def fetch_data(self, key):
|
def fetch_path(self, key: int) -> Optional[str]:
|
||||||
meta = self.lookup(key)
|
"""
|
||||||
|
Fetches the archive path of a story.
|
||||||
|
|
||||||
if 'path' not in meta:
|
Args:
|
||||||
raise StorySourceError("Index is missing a path value.")
|
key: Primary key of the story.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A path to the story, or None.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
InvalidStoryError: If a valid story is not found.
|
||||||
|
StorySourceError: If the fetcher is closed.
|
||||||
|
"""
|
||||||
|
key = self.validate(key)
|
||||||
|
path = self.paths.get(key, Empty)
|
||||||
|
|
||||||
|
if path is not Empty:
|
||||||
|
return cast(Optional[str], path)
|
||||||
|
|
||||||
|
meta = self.fetch_meta(key)
|
||||||
|
return PATH.search(meta)
|
||||||
|
|
||||||
|
def close(self) -> None:
|
||||||
|
self.is_open = False
|
||||||
|
self.index = None
|
||||||
|
self.paths = None
|
||||||
|
|
||||||
|
if self.archive is not None:
|
||||||
|
self.archive.close()
|
||||||
|
self.archive = None
|
||||||
|
|
||||||
|
def fetch_meta(self, key: int) -> Dict[str, Any]:
|
||||||
|
key = self.validate(key)
|
||||||
|
raw = self.index[key]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = self.archive.read(meta['path'])
|
meta = json.loads(raw)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
raise StorySourceError("Archive is missing a file.") from e
|
raise StorySourceError("Malformed meta for {key}: {raw}") from e
|
||||||
except BadZipFile as e:
|
|
||||||
raise StorySourceError("Archive is corrupt.") from e
|
|
||||||
|
|
||||||
with ZipFile(BytesIO(data)) as story:
|
actual = meta.get('id')
|
||||||
if story.testzip() is not None:
|
|
||||||
raise StorySourceError("Story is corrupt.")
|
if key != actual:
|
||||||
|
raise StorySourceError("Invalid ID for {key}: {actual}")
|
||||||
|
|
||||||
|
self.paths[key] = PATH.search(meta)
|
||||||
|
|
||||||
|
return meta
|
||||||
|
|
||||||
|
def fetch_data(self, key: int) -> bytes:
|
||||||
|
key = self.validate(key)
|
||||||
|
path = self.fetch_path(key)
|
||||||
|
|
||||||
|
if not path:
|
||||||
|
raise StorySourceError("Missing path attribute for {key}.")
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = self.archive.read(path)
|
||||||
|
except ValueError as e:
|
||||||
|
raise StorySourceError("Missing file for {key}: {path}") from e
|
||||||
|
except BadZipFile as e:
|
||||||
|
raise StorySourceError("Corrupt file for {key}: {path}") from e
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def fetch_meta(self, key):
|
|
||||||
meta = self.lookup(key)
|
|
||||||
return deepcopy(meta)
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
arrow
|
arrow
|
||||||
bbcode
|
bbcode
|
||||||
blinker
|
blinker
|
||||||
|
boltons
|
||||||
flake8
|
flake8
|
||||||
jmespath
|
jmespath
|
||||||
mypy
|
mypy
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -88,6 +88,7 @@ setup(
|
||||||
'arrow',
|
'arrow',
|
||||||
'bbcode',
|
'bbcode',
|
||||||
'blinker',
|
'blinker',
|
||||||
|
'boltons',
|
||||||
'jmespath',
|
'jmespath',
|
||||||
'requests',
|
'requests',
|
||||||
),
|
),
|
||||||
|
|
|
@ -31,7 +31,7 @@ from fimfarchive.fetchers import FimfarchiveFetcher
|
||||||
VALID_STORY_KEY = 9
|
VALID_STORY_KEY = 9
|
||||||
INVALID_STORY_KEY = 7
|
INVALID_STORY_KEY = 7
|
||||||
|
|
||||||
FIMFARCHIVE_PATH = 'fimfarchive-20170601.zip'
|
FIMFARCHIVE_PATH = 'fimfarchive-20171203.zip'
|
||||||
|
|
||||||
|
|
||||||
class TestFimfarchiveFetcher:
|
class TestFimfarchiveFetcher:
|
||||||
|
@ -52,10 +52,10 @@ class TestFimfarchiveFetcher:
|
||||||
Tests `StorySourceError` is raised when fetcher is closed.
|
Tests `StorySourceError` is raised when fetcher is closed.
|
||||||
"""
|
"""
|
||||||
with FimfarchiveFetcher(FIMFARCHIVE_PATH) as fetcher:
|
with FimfarchiveFetcher(FIMFARCHIVE_PATH) as fetcher:
|
||||||
fetcher.lookup(VALID_STORY_KEY)
|
fetcher.fetch_meta(VALID_STORY_KEY)
|
||||||
|
|
||||||
with pytest.raises(StorySourceError):
|
with pytest.raises(StorySourceError):
|
||||||
fetcher.lookup(VALID_STORY_KEY)
|
fetcher.fetch_meta(VALID_STORY_KEY)
|
||||||
|
|
||||||
def test_fetch_meta_for_valid_story(self, fetcher):
|
def test_fetch_meta_for_valid_story(self, fetcher):
|
||||||
"""
|
"""
|
||||||
|
@ -63,7 +63,6 @@ class TestFimfarchiveFetcher:
|
||||||
"""
|
"""
|
||||||
meta = fetcher.fetch_meta(VALID_STORY_KEY)
|
meta = fetcher.fetch_meta(VALID_STORY_KEY)
|
||||||
assert meta['id'] == VALID_STORY_KEY
|
assert meta['id'] == VALID_STORY_KEY
|
||||||
assert meta['words'] != 0
|
|
||||||
|
|
||||||
def test_fetch_meta_for_invalid_story(self, fetcher):
|
def test_fetch_meta_for_invalid_story(self, fetcher):
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in a new issue