fimfarchive/fimfarchive/fetchers/fimfiction2.py
2025-02-22 17:49:52 +01:00

750 lines
19 KiB
Python

"""
Fimfiction APIv2 fetcher.
"""
#
# Fimfarchive, preserves stories from Fimfiction.
# Copyright (C) 2015 Joakim Soderlund
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import json
from abc import ABC, abstractmethod
from collections import OrderedDict, defaultdict
from copy import deepcopy
from time import sleep
from typing import Any, Dict, Iterator, Optional, Set
from urllib.parse import urlencode
from jsonapi_client import Filter, Session
from jsonapi_client.document import Document
from jsonapi_client.exceptions import DocumentError
from jsonapi_client.resourceobject import ResourceObject
from fimfarchive import __version__ as version
from fimfarchive.flavors import DataFormat, MetaFormat, MetaPurity, StorySource
from fimfarchive.utils import tqdm
from fimfarchive.exceptions import (
FimfarchiveError,
InvalidStoryError,
StorySourceError,
)
from .base import Fetcher
# Names exported by a star-import of this module.
__all__ = (
'BetaFormatVerifier',
'Fimfiction2Fetcher',
)
# Query parameters: maps a parameter name to the set of values that
# are joined with commas when the request filter is built.
QueryParams = Dict[str, Set[str]]

# Document keys used when assembling and verifying story meta.
ROOT = 'root'
AUTHOR = 'author'
CHAPTERS = 'chapters'
PREQUEL = 'prequel'
TAGS = 'tags'

# Parameters used when requesting story data (chapter contents).
DATA_PARAMS: QueryParams = {
    'include': {
        'chapters',
    },
    'fields[chapter]': {
        'authors_note_html',
        'authors_note_position',
        'chapter_number',
        'content_html',
        'title',
    },
    'fields[story]': {
        'chapters',
    },
}

# Parameters used when requesting story meta.
META_PARAMS: QueryParams = {
    'include': {
        'author',
        'chapters',
        'tags',
    },
    'fields[chapter]': {
        'chapter_number',
        'date_modified',
        'date_published',
        'num_views',
        'num_words',
        'published',
        'title',
    },
    'fields[story]': {
        'author',
        'chapters',
        'color',
        'completion_status',
        'content_rating',
        'cover_image',
        'date_modified',
        'date_published',
        'date_updated',
        'description_html',
        'num_chapters',
        'num_comments',
        'num_dislikes',
        'num_likes',
        'num_views',
        'num_words',
        'prequel',
        'published',
        'rating',
        'short_description',
        'status',
        'submitted',
        'tags',
        'title',
        'total_num_views',
    },
    'fields[story_tag]': {
        'name',
        'type',
    },
    'fields[user]': {
        'avatar',
        'bio_html',
        'date_joined',
        'name',
        'num_blog_posts',
        'num_followers',
        'num_stories',
    },
}
class ApiClient:
    """
    Performs API requests.

    A new jsonapi session is created for every call to `get`, using
    the bearer token supplied to the constructor.
    """

    def __init__(self, token: str) -> None:
        """
        Constructor.

        Args:
            token: Fimfiction authorization bearer.
        """
        self.token = token

    def create_session(self, token: str) -> Session:
        """
        Creates a jsonapi session with authorization.

        Args:
            token: Fimfiction authorization bearer.

        Returns:
            A jsonapi session containing the token.
        """
        headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Authorization': f'Bearer {token}',
            'User-Agent': f'fimfarchive/{version}',
        }

        return Session(
            server_url='https://www.fimfiction.net/api/v2/',
            request_kwargs={'headers': headers},
        )

    def create_filter(self, params: QueryParams) -> Filter:
        """
        Creates a jsonapi filter from query parameters.

        Both the parameter names and the values of each parameter are
        sorted, so equal parameters always give the same query string.

        Args:
            params: Parameters to create a filter for.

        Returns:
            A jsonapi filter matching the parameters.
        """
        joined: Dict[str, str] = OrderedDict()

        for key, value in sorted(params.items()):
            joined[key] = ','.join(sorted(value))

        return Filter(urlencode(joined))

    def get(
        self,
        path: str,
        params: Optional[QueryParams] = None,
    ) -> Document:
        """
        Performs a jsonapi request.

        Args:
            path: Path to the resource.
            params: Parameters for the request, or None for no parameters.

        Returns:
            A jsonapi response document.
        """
        # A None default avoids sharing one mutable dict between calls.
        if params is None:
            params = {}

        query = self.create_filter(params)
        session = self.create_session(self.token)

        return session.get(path, query)
class Requester(ABC):
    """
    Interface for objects performing Fimfiction APIv2 requests.
    """

    @abstractmethod
    def get_meta(self, key: int) -> ResourceObject:
        """
        Requests the meta of a story from the API.

        Args:
            key: Primary key of the story.

        Returns:
            A resource object containing story meta.

        Raises:
            InvalidStoryError: If a valid story is not found.
            StorySourceError: If source does not return valid data.
        """

    @abstractmethod
    def get_data(self, key: int) -> Iterator[ResourceObject]:
        """
        Requests the data of a story from the API.

        Args:
            key: Primary key of the story.

        Returns:
            Resource objects containing story chapters.

        Raises:
            InvalidStoryError: If a valid story is not found.
            StorySourceError: If source does not return valid data.
        """
class SingleRequester(Requester):
    """
    Requests stories one by one.
    """

    def __init__(self, client: ApiClient) -> None:
        """
        Constructor.

        Args:
            client: Client to use for queries.
        """
        self.client = client

    def error(self, key: int, status: int) -> FimfarchiveError:
        """
        Creates an exception for the status.

        Args:
            key: Primary key of the story.
            status: Status code of the response.

        Returns:
            A fimfarchive exception instance.
        """
        if status == 403:
            return InvalidStoryError(f"Private story: {key}")
        elif status == 404:
            return InvalidStoryError(f"Missing story: {key}")
        else:
            return StorySourceError(f"Bad HTTP status for {key}: {status}")

    def get(self, key: int, path: str, params: QueryParams) -> Document:
        """
        Performs a Fimfiction APIv2 request.

        Args:
            key: Primary key of the story.
            path: Resource to query.
            params: Query parameters.

        Returns:
            The response document from the client.

        Raises:
            InvalidStoryError: If a valid story is not found.
            StorySourceError: If source does not return valid data.
        """
        try:
            return self.client.get(path, params)
        except DocumentError as e:
            # Translate the HTTP status into a fimfarchive exception.
            raise self.error(key, e.response.status_code) from e
        except Exception as e:
            # The f-prefix is required for the key to be interpolated.
            raise StorySourceError(f"Unknown error for {key}.") from e

    def get_meta(self, key: int) -> ResourceObject:
        path = f'stories/{key}'
        response = self.get(key, path, META_PARAMS)

        return response.resource

    def get_data(self, key: int) -> Iterator[ResourceObject]:
        # First list the chapters without their contents; the copy keeps
        # the module-level DATA_PARAMS untouched by the removal below.
        list_path = f'stories/{key}/chapters'
        list_params = deepcopy(DATA_PARAMS)
        list_params['fields[chapter]'].remove('content_html')
        list_response = self.get(key, list_path, list_params)

        # Then request each chapter on its own, with full contents.
        for resource in tqdm(list_response.resources):
            path = f"chapters/{resource.id}"
            response = self.get(key, path, DATA_PARAMS)

            yield response.resource

            # Runs when the consumer asks for the next chapter;
            # presumably throttles requests towards the API.
            sleep(5)
class BulkRequester(Requester):
    """
    Requests stories in batches and serves them from a cache.
    """

    # Most recent batch response, or None when nothing is cached.
    response: Optional[Document]
    # Cached stories by primary key; None marks a key without a story.
    resources: Dict[int, Optional[ResourceObject]]

    def __init__(
            self,
            client: ApiClient,
            bulk_meta: bool = True,
            bulk_data: bool = True,
            bulk_size: int = 16,
    ) -> None:
        """
        Constructor.

        Args:
            client: Client to use for queries.
            bulk_meta: Toggles bulk fetching of meta.
            bulk_data: Toggles bulk fetching of data.
            bulk_size: Number of items to request per batch.
        """
        self.client = client
        self.bulk_meta = bulk_meta
        self.bulk_data = bulk_data
        self.bulk_size = bulk_size

    def __setattr__(self, name: str, value: Any) -> None:
        """
        Sets an attribute, dropping the cache if a bulk toggle is set.
        """
        try:
            super().__setattr__(name, value)
        finally:
            # The cached batch was requested with the previous toggles.
            if name in ('bulk_data', 'bulk_meta'):
                self.reset()

    def reset(self) -> None:
        """
        Forgets the cached response and its resources.
        """
        self.response = None
        self.resources = {}

    def create_params(self) -> QueryParams:
        """
        Merges the query parameters of all enabled bulk modes.
        """
        enabled = []

        if self.bulk_meta:
            enabled.append(META_PARAMS)

        if self.bulk_data:
            enabled.append(DATA_PARAMS)

        merged: QueryParams = defaultdict(set)

        for source in enabled:
            for name, values in source.items():
                merged[name].update(values)

        return dict(merged)

    def cache(self, key: int) -> None:
        """
        Requests and stores the batch of stories that contains the key.

        Args:
            key: Primary key of the story.
        """
        size = int(self.bulk_size)
        # Batches are aligned to multiples of the batch size.
        start = key - (key % size)
        batch = range(start, start + size)

        params = self.create_params()
        # NOTE(review): the reason for the four extra items per page
        # is not evident from this code.
        params['page[size]'] = {str(len(batch) + 4)}
        params['filter[ids]'] = set(map(str, batch))

        self.response = self.client.get('stories', params)
        # Every key in the batch starts out as missing.
        self.resources = dict.fromkeys(batch)

        for story in self.response.resources:
            self.resources[int(story.id)] = story

    def fetch(self, key: int) -> ResourceObject:
        """
        Returns a story from the cache, requesting its batch if needed.

        Args:
            key: Primary key of the story.

        Returns:
            A resource object containing the story.

        Raises:
            InvalidStoryError: If a valid story is not found.
            StorySourceError: If source does not return valid data.
        """
        if key not in self.resources:
            try:
                self.cache(key)
            except Exception as e:
                # Never leave a partially populated cache behind.
                self.reset()
                raise StorySourceError("Unable to fetch.") from e

        story = self.resources[key]

        if story is not None:
            return story

        raise InvalidStoryError("Invalid story ID.")

    def get_meta(self, key: int) -> ResourceObject:
        if self.bulk_meta:
            return self.fetch(key)

        raise StorySourceError("Bulk meta not enabled.")

    def get_data(self, key: int) -> Iterator[ResourceObject]:
        if self.bulk_data:
            return self.fetch(key).chapters

        raise StorySourceError("Bulk data not enabled.")
class RoutedRequester(Requester):
    """
    Delegates to a bulk or a single requester, per request kind.
    """

    def __init__(
            self,
            client: ApiClient,
            bulk_meta: bool,
            bulk_data: bool,
    ) -> None:
        """
        Constructor.

        Args:
            client: Client to use for queries.
            bulk_meta: Toggles bulk fetching of meta.
            bulk_data: Toggles bulk fetching of data.
        """
        self.single = SingleRequester(client)
        self.bulk = BulkRequester(client, bulk_meta, bulk_data)

    def get_meta(self, key: int) -> ResourceObject:
        # The toggle lives on the bulk requester.
        requester = self.bulk if self.bulk.bulk_meta else self.single

        return requester.get_meta(key)

    def get_data(self, key: int) -> Iterator[ResourceObject]:
        # The toggle lives on the bulk requester.
        requester = self.bulk if self.bulk.bulk_data else self.single

        return requester.get_data(key)
class Documentifier:
    """
    Turns a resource object into a plain dictionary.
    """

    def merge(self, target: Dict, source: Dict) -> None:
        """
        Adds deep copies of the source items to the target.

        Asserts that no key of the source is already in the target.

        Args:
            target: Dictionary to copy to.
            source: Dictionary to copy from.
        """
        for name, item in deepcopy(source).items():
            assert name not in target
            target[name] = item

    def flatten(self, resource: ResourceObject) -> Dict[str, Any]:
        """
        Collects the ID, attributes, and meta of a resource.

        Args:
            resource: Resource to flatten.

        Returns:
            A dictionary representation.
        """
        flat: Dict[str, Any] = dict(id=int(resource.id))

        # Attributes first, then meta; keys must not collide.
        self.merge(flat, resource.json['attributes'])
        self.merge(flat, resource.meta.meta)

        return flat

    def __call__(self, resource: ResourceObject) -> Dict[str, Any]:
        """
        Converts the resource by flattening it.

        Args:
            resource: Resource to documentify.

        Returns:
            A dictionary representation.
        """
        return self.flatten(resource)
class MetaDocumentifier(Documentifier):
    """
    Turns a story resource into a story meta dictionary.
    """

    # Story keys set to None when absent from the flattened resource.
    fill = (
        'cover_image',
        'date_published',
    )

    # Chapter keys stripped from the meta.
    remove = (
        'content_html',
        'authors_note_html',
        'authors_note_position',
    )

    def fill_keys(self, meta: Dict[str, Any]) -> None:
        """
        Adds a None value for each fill key that is missing.

        Args:
            meta: Dictionary to fill.
        """
        for name in self.fill:
            meta.setdefault(name, None)

    def remove_data(self, meta: Dict[str, Any]) -> None:
        """
        Strips the data keys from every chapter of the meta.

        Args:
            meta: Dictionary to clean.
        """
        for chapter in meta['chapters']:
            for name in self.remove:
                chapter.pop(name, None)

    def __call__(self, resource: ResourceObject) -> Dict[str, Any]:
        meta = self.flatten(resource)

        # Author, as a nested dictionary.
        assert AUTHOR not in meta
        meta[AUTHOR] = self.flatten(resource.author)

        # Chapters, ordered by chapter number.
        assert CHAPTERS not in meta
        meta[CHAPTERS] = sorted(
            (self.flatten(chapter) for chapter in resource.chapters),
            key=lambda chapter: chapter['chapter_number'],
        )

        # Prequel, as the ID of the related story or None.
        assert PREQUEL not in meta
        prequel = getattr(resource.relationships, PREQUEL, None)
        meta[PREQUEL] = (
            int(prequel._resource_identifier.id) if prequel else None
        )

        # Tags, ordered by type and then by name.
        assert TAGS not in meta
        meta[TAGS] = sorted(
            (self.flatten(tag) for tag in resource.tags),
            key=lambda tag: (tag['type'], tag['name']),
        )

        self.fill_keys(meta)
        self.remove_data(meta)

        return meta
class BetaFormatVerifier:
    """
    Verifies that required keys are present in a dictionary.
    """

    def __init__(self, requirements: Dict[str, Set[str]]) -> None:
        """
        Constructor.

        Args:
            requirements: Specifies the required keys.
        """
        self.requirements: Dict[str, Set[str]] = requirements

    @classmethod
    def from_params(
            cls,
            params: QueryParams,
            mapping: Dict[str, str],
    ) -> 'BetaFormatVerifier':
        """
        Constructor, using query parameters.

        Args:
            params: Query parameters to base the requirements on.
            mapping: Mapping from document keys to resource types.
        """
        requirements = dict()

        for key, resource in mapping.items():
            param = f'fields[{resource}]'
            # Copy so that the shared query parameters stay unchanged.
            fields = deepcopy(params[param])
            # Required in addition to the requested fields.
            fields.update(('id', 'url'))
            requirements[key] = fields

        return cls(requirements)

    @classmethod
    def from_meta_params(cls) -> 'BetaFormatVerifier':
        """
        Constructor, for creating a meta verifier.
        """
        return cls.from_params(META_PARAMS, {
            ROOT: 'story',
            AUTHOR: 'user',
            CHAPTERS: 'chapter',
            TAGS: 'story_tag',
        })

    @classmethod
    def from_data_params(cls) -> 'BetaFormatVerifier':
        """
        Constructor, for creating a chapter verifier.
        """
        return cls.from_params(DATA_PARAMS, {
            ROOT: 'chapter',
        })

    def check(self, key: str, required: Set[str], data: Any) -> None:
        """
        Checks dictionaries against a set of required keys.

        Keys beyond the required ones are allowed.

        Args:
            key: Document key being checked.
            required: Set of required keys.
            data: Dictionary, or iterable of dictionaries, to check.

        Raises:
            StorySourceError: If a dictionary is invalid.
        """
        if isinstance(data, dict):
            data = (data,)

        for obj in data:
            # A proper-subset test would let a dictionary through as
            # soon as it had any extra key, so compute the difference.
            absent = required - obj.keys()

            if absent:
                missing = ", ".join(sorted(absent))
                message = f"Missing from {key}: {missing}"
                raise StorySourceError(message)

    def __call__(self, data: Dict[str, Any]) -> None:
        """
        Applies the verifier to a dictionary.

        Args:
            data: Dictionary to check.

        Raises:
            StorySourceError: If a dictionary is invalid.
        """
        for key, required in self.requirements.items():
            if key == ROOT:
                # Root requirements apply to the dictionary itself.
                self.check(key, required, data)
            else:
                self.check(key, required, data[key])
class Fimfiction2Fetcher(Fetcher):
    """
    Fetches stories through Fimfiction APIv2.
    """

    prefetch_meta = True
    prefetch_data = True

    flavors = frozenset((
        StorySource.FIMFICTION,
        DataFormat.JSON,
        MetaFormat.BETA,
        MetaPurity.DIRTY,
    ))

    def __init__(self, token: str, bulk_meta=False, bulk_data=False) -> None:
        """
        Constructor.

        Args:
            token: Authentication token for Fimfiction.
            bulk_meta: Toggles bulk fetching of story meta.
            bulk_data: Toggles bulk fetching of story data.
        """
        # Only single requests are used; bulk toggles must be off.
        assert not bulk_meta
        assert not bulk_data

        self.requester = SingleRequester(ApiClient(token))

        self.extract_meta = MetaDocumentifier()
        self.verify_meta = BetaFormatVerifier.from_meta_params()

        self.extract_chapter = Documentifier()
        self.verify_chapter = BetaFormatVerifier.from_data_params()

    def fetch_meta(self, key: int) -> Dict[str, Any]:
        meta = self.extract_meta(self.requester.get_meta(int(key)))
        self.verify_meta(meta)

        return meta

    def fetch_data(self, key: int) -> bytes:
        resources = self.requester.get_data(int(key))
        documents = [self.extract_chapter(entry) for entry in resources]

        if len(documents) == 0:
            raise InvalidStoryError("Missing chapters.")

        for document in documents:
            self.verify_chapter(document)

        # Chapters are serialized in chapter order.
        documents.sort(key=lambda document: document['chapter_number'])

        serialized = json.dumps(
            documents,
            indent=4,
            ensure_ascii=False,
            sort_keys=True,
        )

        return serialized.encode()