From aaa309d3a83c49074db8557fe3a7dfc27daa7006 Mon Sep 17 00:00:00 2001 From: Joakim Soderlund Date: Sun, 15 Jul 2018 20:01:45 +0200 Subject: [PATCH] Add fetcher for Fimfiction APIv2 --- fimfarchive/fetchers/__init__.py | 2 + fimfarchive/fetchers/fimfiction2.py | 738 ++++++++++++++++++++++++++++ requirements.txt | 1 + setup.py | 1 + tests/fetchers/test_fimfiction2.py | 158 ++++++ tox.ini | 1 + 6 files changed, 901 insertions(+) create mode 100644 fimfarchive/fetchers/fimfiction2.py create mode 100644 tests/fetchers/test_fimfiction2.py diff --git a/fimfarchive/fetchers/__init__.py b/fimfarchive/fetchers/__init__.py index ae49643..3f68467 100644 --- a/fimfarchive/fetchers/__init__.py +++ b/fimfarchive/fetchers/__init__.py @@ -26,6 +26,7 @@ from .base import Fetcher from .directory import DirectoryFetcher from .fimfarchive import FimfarchiveFetcher from .fimfiction import FimfictionFetcher +from .fimfiction2 import Fimfiction2Fetcher __all__ = ( @@ -33,4 +34,5 @@ __all__ = ( 'DirectoryFetcher', 'FimfarchiveFetcher', 'FimfictionFetcher', + 'Fimfiction2Fetcher', ) diff --git a/fimfarchive/fetchers/fimfiction2.py b/fimfarchive/fetchers/fimfiction2.py new file mode 100644 index 0000000..5c4b11c --- /dev/null +++ b/fimfarchive/fetchers/fimfiction2.py @@ -0,0 +1,738 @@ +""" +Fimfiction APIv2 fetcher. +""" + + +# +# Fimfarchive, preserves stories from Fimfiction. +# Copyright (C) 2015 Joakim Soderlund +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + + +import json +from abc import ABC, abstractmethod +from collections import OrderedDict, defaultdict +from copy import deepcopy +from typing import Any, Dict, Iterator, Optional, Set +from urllib.parse import urlencode + +from jsonapi_client import Filter, Session +from jsonapi_client.document import Document +from jsonapi_client.exceptions import DocumentError +from jsonapi_client.resourceobject import ResourceObject + +from fimfarchive import __version__ as VERSION +from fimfarchive.flavors import DataFormat, MetaFormat, MetaPurity, StorySource + +from fimfarchive.exceptions import ( + FimfarchiveError, + InvalidStoryError, + StorySourceError, +) + +from .base import Fetcher + + +__all__ = ( + 'BetaFormatVerifier', + 'Fimfiction2Fetcher', +) + + +QueryParams = Dict[str, Set[str]] + + +ROOT = 'root' +AUTHOR = 'author' +CHAPTERS = 'chapters' +PREQUEL = 'prequel' +TAGS = 'tags' + + +DATA_PARAMS: QueryParams = { + 'include': { + 'chapters', + }, + 'fields[chapter]': { + 'authors_note_html', + 'authors_note_position', + 'chapter_number', + 'content_html', + 'title', + }, + 'fields[story]': { + 'chapters', + }, +} + + +META_PARAMS: QueryParams = { + 'include': { + 'author', + 'chapters', + 'tags', + }, + 'fields[chapter]': { + 'chapter_number', + 'date_modified', + 'date_published', + 'num_views', + 'num_words', + 'published', + 'title', + }, + 'fields[story]': { + 'author', + 'chapters', + 'color', + 'completion_status', + 'content_rating', + 'cover_image', + 'date_modified', + 'date_published', + 'date_updated', + 'description_html', + 'num_chapters', + 'num_comments', + 'num_dislikes', + 'num_likes', + 'num_views', + 'num_words', + 'prequel', + 'published', + 'rating', + 'short_description', + 'status', + 'submitted', + 'tags', + 'title', + 'total_num_views', + }, + 'fields[story_tag]': { + 'name', + 'type', + }, + 'fields[user]': { + 'avatar', + 'bio_html', + 'date_joined', + 'name', + 'num_blog_posts', + 'num_followers', + 'num_stories', + }, +} + + +class ApiClient: + """ + Performs API requests. + """ + + def __init__(self, token: str) -> None: + """ + Constructor. + + Args: + token: Fimfiction authorization bearer. + """ + self.session = self.create_session(token) + + def create_session(self, token: str) -> Session: + """ + Creates a jsonapi session with authorization. + + Args: + token: Fimfiction authorization bearer. + + Returns: + A jsonapi session containing the token. + """ + headers = { + 'Accept-Encoding': 'gzip, deflate', + 'Authorization': f'Bearer {token}', + 'User-Agent': f'fimfarchive/{VERSION}', + } + + return Session( + server_url='https://www.fimfiction.net/api/v2/', + request_kwargs={'headers': headers}, + ) + + def create_filter(self, params: QueryParams) -> Filter: + """ + Creates a jsonapi filter from query parameters. + + Args: + params: Parameters to create a filter for. + + Returns: + A jsonapi filter matching the parameters. + """ + joined: Dict[str, str] = OrderedDict() + + for key, value in sorted(params.items()): + joined[key] = ','.join(sorted(value)) + + return Filter(urlencode(joined)) + + def get(self, path: str, params: QueryParams = dict()) -> Document: + """ + Performs a jsonapi request. + + Args: + resource: Path to the resource. + params: Parameters for the request. + + Returns: + A jsonapi response document. + """ + query = self.create_filter(params) + + return self.session.get(path, query) + + +class Requester(ABC): + """ + Performs Fimfiction APIv2 requests. + """ + + @abstractmethod + def get_meta(self, key: int) -> ResourceObject: + """ + Performs an API request for story meta. + + Args: + key: Primary key of the story. + + Returns: + A resource object containing story meta. + + Raises: + InvalidStoryError: If a valid story is not found. + StorySourceError: If source does not return valid data. + """ + + @abstractmethod + def get_data(self, key: int) -> Iterator[ResourceObject]: + """ + Performs an API request for story data. + + Args: + key: Primary key of the story. + + Returns: + Resource objects containing story chapters. + + Raises: + InvalidStoryError: If a valid story is not found. + StorySourceError: If source does not return valid data. + """ + + +class SingleRequester(Requester): + """ + Requests stories one by one. + """ + + def __init__(self, client: ApiClient) -> None: + """ + Constructor. + + Args: + client: Client to use for queries. + """ + self.client = client + + def error(self, key: int, status: int) -> FimfarchiveError: + """ + Creates an exception for the status. + + Args: + key: Primary key of the story. + status: Status code of the response. + + Returns: + A fimfarchive exception instance. + """ + if status == 403: + return InvalidStoryError(f"Private story: {key}") + elif status == 404: + return InvalidStoryError(f"Missing story: {key}") + else: + return StorySourceError(f"Bad HTTP status for {key}: {status}") + + def get(self, key: int, path: str, params: QueryParams) -> Document: + """ + Performs a Fimfiction APIv2 request. + + Args: + key: Primary key of the story. + path: Resource to query. + params: Query parameters. + + Raises: + InvalidStoryError: If a valid story is not found. + StorySourceError: If source does not return valid data. + """ + try: + return self.client.get(path, params) + except DocumentError as e: + raise self.error(key, e.response.status_code) from e + except Exception as e: + raise StorySourceError("Unknown error for {key}.") from e + + def get_meta(self, key: int) -> ResourceObject: + path = f'stories/{key}' + response = self.get(key, path, META_PARAMS) + return response.resource + + def get_data(self, key: int) -> Iterator[ResourceObject]: + path = f'stories/{key}/chapters' + response = self.get(key, path, DATA_PARAMS) + return response.resources + + +class BulkRequester(Requester): + """ + Requests stories in bulk. + """ + response: Optional[Document] + resources: Dict[int, Optional[ResourceObject]] + + def __init__( + self, + client: ApiClient, + bulk_meta: bool = True, + bulk_data: bool = True, + bulk_size: int = 16, + ) -> None: + """ + Constructor. + + Args: + client: Client to use for queries. + bulk_meta: Toggles bulk fetching of meta. + bulk_data: Toggles bulk fetching of data. + bulk_size: Number of items to request per batch. + """ + self.client = client + self.bulk_meta = bulk_meta + self.bulk_data = bulk_data + self.bulk_size = bulk_size + + def __setattr__(self, name: str, value: Any) -> None: + """ + Resets the requester when necessary. + """ + try: + super().__setattr__(name, value) + finally: + if name in ('bulk_meta', 'bulk_data'): + self.reset() + + def reset(self) -> None: + """ + Drops the currently cached story batch. + """ + self.response = None + self.resources = dict() + + def create_params(self) -> QueryParams: + """ + Creates general query parameters for a request. + """ + params: QueryParams = defaultdict(set) + + if self.bulk_meta: + for key, value in META_PARAMS.items(): + params[key].update(value) + + if self.bulk_data: + for key, value in DATA_PARAMS.items(): + params[key].update(value) + + return dict(params) + + def cache(self, key: int) -> None: + """ + Caches a story batch from Fimfiction. + + Args: + key: Primary key of the story. + """ + count = int(self.bulk_size) + lower = key - (key % count) + upper = lower + count + + keys = range(lower, upper) + params = self.create_params() + params['page[size]'] = {str(len(keys) + 4)} + params['filter[ids]'] = {str(i) for i in keys} + + self.response = self.client.get('stories', params) + self.resources = {key: None for key in keys} + + for resource in self.response.resources: + self.resources[int(resource.id)] = resource + + def fetch(self, key: int) -> ResourceObject: + """ + Fetches a resource from either cache or Fimfiction. + + Args: + key: Primary key of the story. + + Returns: + A resource object containing the story. + + Raises: + InvalidStoryError: If a valid story is not found. + StorySourceError: If source does not return valid data. + """ + if key not in self.resources: + try: + self.cache(key) + except Exception as e: + self.reset() + raise StorySourceError("Unable to fetch.") from e + + resource = self.resources[key] + + if resource is None: + raise InvalidStoryError("Invalid story ID.") + + return resource + + def get_meta(self, key: int) -> ResourceObject: + if not self.bulk_meta: + raise StorySourceError("Bulk meta not enabled.") + + return self.fetch(key) + + def get_data(self, key: int) -> Iterator[ResourceObject]: + if not self.bulk_data: + raise StorySourceError("Bulk data not enabled.") + + return self.fetch(key).chapters + + +class RoutedRequester(Requester): + """ + Routes between single and bulk requesters. + """ + + def __init__( + self, + client: ApiClient, + bulk_meta: bool, + bulk_data: bool, + ) -> None: + """ + Constructor. + + Args: + client: Client to use for queries. + bulk_meta: Toggles bulk fetching of meta. + bulk_data: Toggles bulk fetching of data. + """ + self.single = SingleRequester(client) + self.bulk = BulkRequester(client, bulk_meta, bulk_data) + + def get_meta(self, key: int) -> ResourceObject: + if self.bulk.bulk_meta: + return self.bulk.get_meta(key) + else: + return self.single.get_meta(key) + + def get_data(self, key: int) -> Iterator[ResourceObject]: + if self.bulk.bulk_data: + return self.bulk.get_data(key) + else: + return self.single.get_data(key) + + +class Documentifier: + """ + Converts a resource into a dictionary. + """ + + def merge(self, target: Dict, source: Dict) -> None: + """ + Copies items from source into target. + + Args: + target: Dictionary to copy to. + source: Dictionary to copy from. + """ + for key, value in deepcopy(source).items(): + assert key not in target + target[key] = value + + def flatten(self, resource: ResourceObject) -> Dict[str, Any]: + """ + Flattens the resource into a dictionary. + + Args: + resource: Resource to flatten. + + Returns: + A dictionary representation. + """ + document: Dict[str, Any] = { + 'id': int(resource.id), + } + + self.merge(document, resource.json['attributes']) + self.merge(document, resource.meta.meta) + + return document + + def __call__(self, resource: ResourceObject) -> Dict[str, Any]: + """ + Applies the documentifier. + + Args: + resource: Resource to documentify. + + Returns: + A dictionary representation. + """ + return self.flatten(resource) + + +class MetaDocumentifier(Documentifier): + """ + Converts a resource into a story meta dictionary. + """ + fill = ( + 'cover_image', + 'date_published', + ) + + remove = ( + 'content_html', + 'authors_note_html', + 'authors_note_position', + ) + + def fill_keys(self, meta: Dict[str, Any]) -> None: + """ + Fills keys that may be left out by Fimfiction. + + Args: + meta: Dictionary to fill. + """ + for key in self.fill: + if key not in meta: + meta[key] = None + + def remove_data(self, meta: Dict[str, Any]) -> None: + """ + Removes keys that may be left in by the bulk fetcher. + + Args: + meta: Dictionary to clean. + """ + for chapter in meta['chapters']: + for key in self.remove: + if key in chapter: + del chapter[key] + + def __call__(self, resource: ResourceObject) -> Dict[str, Any]: + meta = self.flatten(resource) + + assert AUTHOR not in meta + meta[AUTHOR] = self.flatten(resource.author) + + assert CHAPTERS not in meta + chapters = [self.flatten(chapter) for chapter in resource.chapters] + chapters.sort(key=lambda chapter: chapter['chapter_number']) + meta[CHAPTERS] = chapters + + assert PREQUEL not in meta + prequel = getattr(resource.relationships, PREQUEL, None) + + if prequel: + value = prequel._resource_identifier.id + meta[PREQUEL] = int(value) + else: + meta[PREQUEL] = None + + assert TAGS not in meta + tags = [self.flatten(tag) for tag in resource.tags] + tags.sort(key=lambda tag: (tag['type'], tag['name'])) + meta[TAGS] = tags + + self.fill_keys(meta) + self.remove_data(meta) + + return meta + + +class BetaFormatVerifier: + """ + Verifies that required keys are present in a dictionary. + """ + + def __init__(self, requirements: Dict[str, Set[str]]) -> None: + """ + Constructor. + + Args: + requirements: Specifies the required keys. + """ + self.requirements: Dict[str, Set[str]] = requirements + + @classmethod + def from_params( + cls, + params: QueryParams, + mapping: Dict[str, str], + ) -> 'BetaFormatVerifier': + """ + Constructor, using query parameters. + + Args: + params: Query parameters to base the requirements on. + mapping: Mapping from document keys to resource types. + """ + requirements = dict() + + for key, resource in mapping.items(): + param = f'fields[{resource}]' + fields = deepcopy(params[param]) + fields.update(('id', 'url')) + requirements[key] = fields + + return cls(requirements) + + @classmethod + def from_meta_params(cls) -> 'BetaFormatVerifier': + """ + Constructor, for creating a meta verifier. + """ + return cls.from_params(META_PARAMS, { + ROOT: 'story', + AUTHOR: 'user', + CHAPTERS: 'chapter', + TAGS: 'story_tag', + }) + + @classmethod + def from_data_params(cls) -> 'BetaFormatVerifier': + """ + Constructor, for creating a chapter verifier. + """ + return cls.from_params(DATA_PARAMS, { + ROOT: 'chapter', + }) + + def check(self, key: str, required: Set[str], data: Any) -> None: + """ + Checks dictionaries against a set of required keys. + + Args: + key: Document key being checked. + required: Set of required keys. + data: Dictionaries to check. + + Raises: + StorySourceError: If a dictionary is invalid. + """ + if isinstance(data, dict): + data = (data,) + + for obj in data: + if obj.keys() < required: + missing = ", ".join(required - obj.keys()) + message = f"Missing from {key}: {missing}" + raise StorySourceError(message) + + def __call__(self, data: Dict[str, Any]) -> None: + """ + Applies the verifier to a dictionary. + + Args: + data: Dictionary to check. + + Raises: + StorySourceError: If a dictionary is invalid. + """ + for key, required in self.requirements.items(): + if key == ROOT: + self.check(key, required, data) + else: + self.check(key, required, data[key]) + + +class Fimfiction2Fetcher(Fetcher): + """ + Fetcher for Fimfiction APIv2. + """ + prefetch_meta = True + prefetch_data = False + + flavors = frozenset(( + StorySource.FIMFICTION, + DataFormat.JSON, + MetaFormat.BETA, + MetaPurity.DIRTY, + )) + + def __init__(self, token: str, bulk_meta=False, bulk_data=False) -> None: + """ + Constructor. + + Args: + token: Authentication token for Fimfiction. + bulk_meta: Toggles bulk fetching of story meta. + bulk_data: Toggles bulk fetching of story data. + """ + client = ApiClient(token) + self.extract_meta = MetaDocumentifier() + self.extract_chapter = Documentifier() + self.verify_meta = BetaFormatVerifier.from_meta_params() + self.verify_chapter = BetaFormatVerifier.from_data_params() + self.requester = RoutedRequester(client, bulk_meta, bulk_data) + + def fetch_meta(self, key: int) -> Dict[str, Any]: + resource = self.requester.get_meta(int(key)) + meta = self.extract_meta(resource) + self.verify_meta(meta) + + return meta + + def fetch_data(self, key: int) -> bytes: + resource = self.requester.get_data(int(key)) + chapters = [self.extract_chapter(chapter) for chapter in resource] + + if not chapters: + raise InvalidStoryError("Missing chapters.") + + for chapter in chapters: + self.verify_chapter(chapter) + + chapters.sort(key=lambda chapter: chapter['chapter_number']) + + data = json.dumps( + chapters, + indent=4, + ensure_ascii=False, + sort_keys=True + ) + + return data.encode() diff --git a/requirements.txt b/requirements.txt index 0c3a253..f4d3d6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ bbcode blinker boltons flake8 +git+https://github.com/qvantel/jsonapi-client.git jmespath mypy pytest diff --git a/setup.py b/setup.py index b0a30b9..2dbd1e6 100755 --- a/setup.py +++ b/setup.py @@ -90,6 +90,7 @@ setup( 'blinker', 'boltons', 'jmespath', + 'jsonapi-client', 'requests', 'tqdm', ), diff --git a/tests/fetchers/test_fimfiction2.py b/tests/fetchers/test_fimfiction2.py new file mode 100644 index 0000000..22e7991 --- /dev/null +++ b/tests/fetchers/test_fimfiction2.py @@ -0,0 +1,158 @@ +""" +Fimfiction APIv2 fetcher tests. +""" + + +# +# Fimfarchive, preserves stories from Fimfiction. +# Copyright (C) 2015 Joakim Soderlund +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + + +import json +import os + +import pytest + +from fimfarchive.exceptions import InvalidStoryError +from fimfarchive.fetchers import Fimfiction2Fetcher + + +VALID_STORY_KEY = 9 +AVATAR_STORY_KEY = 5764 +COVER_STORY_KEY = 444 +PUBLISHED_STORY_KEY = 25739 + +INVALID_STORY_KEY = 7 +EMPTY_STORY_KEY = 199462 +HIDDEN_STORY_KEY = 8 +PROTECTED_STORY_KEY = 208799 + +AVATAR_PLACEHOLDER = { + '32': 'https://static.fimfiction.net/images/none_32.png', + '64': 'https://static.fimfiction.net/images/none_64.png', +} + +BULK_COMBINATIONS = [ + (False, False), + (False, True), + (True, False), + (True, True), +] + + +class TestFimfiction2Fetcher: + """ + Fimfarchive2Fetcher tests. + """ + + @pytest.fixture(params=BULK_COMBINATIONS) + def fetcher(self, request): + """ + Returns a Fimfarchive2Fetcher instance. + """ + bulk_meta, bulk_data = request.param + token = os.environ['FIMFICTION_ACCESS_TOKEN'] + fetcher = Fimfiction2Fetcher(token, bulk_meta, bulk_data) + + fetcher.prefetch_meta = False + fetcher.prefetch_data = False + + yield fetcher + + def fetch_valid(self, fetcher, key): + """ + Fetches a valid story. + """ + story = fetcher.fetch(key) + + assert story.meta['id'] == key + assert json.loads(story.data.decode()) + + return story + + def fetch_invalid(self, fetcher, key): + """ + Fetches an invalid story. + """ + story = fetcher.fetch(key) + + with pytest.raises(InvalidStoryError): + story.meta + + with pytest.raises(InvalidStoryError): + story.data + + return story + + def test_valid(self, fetcher): + """ + Tests fetching a valid story. + """ + self.fetch_valid(fetcher, VALID_STORY_KEY) + + def test_valid_missing_cover(self, fetcher): + """ + Tests fetching a valid story without cover. + """ + story = self.fetch_valid(fetcher, COVER_STORY_KEY) + assert story.meta['cover_image'] is None + + def test_valid_missing_avatar(self, fetcher): + """ + Tests fetching a valid story without avatar. + """ + story = self.fetch_valid(fetcher, AVATAR_STORY_KEY) + avatar = story.meta['author']['avatar'] + + assert AVATAR_PLACEHOLDER['32'] == avatar['32'] + assert AVATAR_PLACEHOLDER['64'] == avatar['64'] + assert AVATAR_PLACEHOLDER['64'] == avatar['96'] + + def test_valid_missing_published_date(self, fetcher): + """ + Tests fetching a valid story without published date. + """ + story = self.fetch_valid(fetcher, PUBLISHED_STORY_KEY) + assert story.meta['date_published'] is None + + def test_empty_story(self, fetcher): + """ + Test fetching a story without chapters. + """ + story = fetcher.fetch(EMPTY_STORY_KEY) + assert story.meta['id'] == EMPTY_STORY_KEY + + with pytest.raises(InvalidStoryError): + story.data + + def test_invalid_story(self, fetcher): + """ + Tests fetching an invalid story. + """ + self.fetch_invalid(fetcher, INVALID_STORY_KEY) + + def test_hidden_story(self, fetcher): + """ + Tests fetching a hidden story. + """ + self.fetch_invalid(fetcher, HIDDEN_STORY_KEY) + + def test_protected_story(self, fetcher): + """ + Tests fetching a password-protected story. + """ + self.fetch_invalid(fetcher, PROTECTED_STORY_KEY) diff --git a/tox.ini b/tox.ini index d71eac2..917b6a7 100644 --- a/tox.ini +++ b/tox.ini @@ -17,6 +17,7 @@ commands = addopts = --ignore tests/fetchers/test_fimfarchive.py --ignore tests/fetchers/test_fimfiction.py + --ignore tests/fetchers/test_fimfiction2.py tests [flake8]