Merge pull request #364 from philomena-dev/data-export

Add framework for zip data export
This commit is contained in:
liamwhite 2024-11-09 10:46:12 -05:00 committed by GitHub
commit ba66c93b45
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 509 additions and 8 deletions

View file

@ -0,0 +1,171 @@
defmodule Philomena.DataExports.Aggregator do
@moduledoc """
Data generation module for data export logic.
"""
import Ecto.Query
alias PhilomenaQuery.Batch
# Direct PII
alias Philomena.Donations.Donation
alias Philomena.UserFingerprints.UserFingerprint
alias Philomena.UserIps.UserIp
alias Philomena.UserNameChanges.UserNameChange
alias Philomena.Users.User
# UGC for export
alias Philomena.ArtistLinks.ArtistLink
alias Philomena.Badges.Award
alias Philomena.Comments.Comment
alias Philomena.Commissions.Commission
alias Philomena.DnpEntries.DnpEntry
alias Philomena.DuplicateReports.DuplicateReport
alias Philomena.Filters.Filter
alias Philomena.ImageFaves.ImageFave
alias Philomena.ImageHides.ImageHide
alias Philomena.ImageVotes.ImageVote
alias Philomena.Images.Image
alias Philomena.PollVotes.PollVote
alias Philomena.Posts.Post
alias Philomena.Reports.Report
alias Philomena.SourceChanges.SourceChange
alias Philomena.TagChanges.TagChange
alias Philomena.Topics.Topic
alias Philomena.Bans.User, as: UserBan
# Direct UGC from form submission
@user_columns [
:created_at,
:name,
:email,
:description,
:current_filter_id,
:spoiler_type,
:theme,
:images_per_page,
:show_large_thumbnails,
:show_sidebar_and_watched_images,
:fancy_tag_field_on_upload,
:fancy_tag_field_on_edit,
:fancy_tag_field_in_settings,
:autorefresh_by_default,
:anonymous_by_default,
:comments_newest_first,
:comments_always_jump_to_last,
:comments_per_page,
:watch_on_reply,
:watch_on_new_topic,
:watch_on_upload,
:messages_newest_first,
:serve_webm,
:no_spoilered_in_watched,
:watched_images_query_str,
:watched_images_exclude_str,
:use_centered_layout,
:personal_title,
:hide_vote_counts,
:scale_large_images
]
# All these also have created_at and are selected by user_id
@indirect_columns [
{Donation, [:email, :amount, :fee, :note]},
{UserFingerprint, [:fingerprint, :uses, :updated_at]},
{UserIp, [:ip, :uses, :updated_at]},
{UserNameChange, [:name]},
{ArtistLink, [:aasm_state, :uri, :public, :tag_id]},
{Award, [:label, :badge_name, :badge_id]},
{Comment,
[
:ip,
:fingerprint,
:user_agent,
:referrer,
:anonymous,
:image_id,
:edited_at,
:edit_reason,
:body
]},
{Commission,
[:open, :sheet_image_id, :categories, :information, :contact, :will_create, :will_not_create]},
{DnpEntry, [:tag_id, :aasm_state, :dnp_type, :hide_reason, :feedback, :reason, :instructions],
:requesting_user_id},
{DuplicateReport, [:reason, :image_id, :duplicate_of_image_id]},
{Filter,
[
:name,
:description,
:public,
:hidden_complex_str,
:spoilered_complex_str,
:hidden_tag_ids,
:spoilered_tag_ids
]},
{ImageFave, [:image_id], :user_id, :image_id},
{ImageHide, [:image_id], :user_id, :image_id},
{ImageVote, [:image_id, :up], :user_id, :image_id},
{Image, [:ip, :fingerprint, :user_agent, :referrer, :anonymous, :description]},
{PollVote, [:rank, :poll_option_id]},
{Post,
[:ip, :fingerprint, :user_agent, :referrer, :anonymous, :edited_at, :edit_reason, :body]},
{Report,
[:ip, :fingerprint, :user_agent, :referrer, :reason, :reportable_id, :reportable_type]},
{SourceChange, [:ip, :fingerprint, :user_agent, :referrer, :image_id, :added, :value]},
{TagChange,
[:ip, :fingerprint, :user_agent, :referrer, :image_id, :added, :tag_id, :tag_name_cache]},
{Topic, [:title, :anonymous, :forum_id]},
{UserBan, [:reason, :generated_ban_id]}
]
@doc """
Get all of the export data for the given user.
"""
def get_for_user(user_id) do
[select_user(user_id)] ++ select_indirect(user_id)
end
defp select_user(user_id) do
select_schema_by_key(user_id, User, @user_columns, :id)
end
defp select_indirect(user_id) do
Enum.map(@indirect_columns, fn
{schema_name, columns} ->
select_schema_by_key(user_id, schema_name, columns)
{schema_name, columns, key_column} ->
select_schema_by_key(user_id, schema_name, columns, key_column)
{schema_name, columns, key_column, id_field} ->
select_schema_by_key(user_id, schema_name, columns, key_column, id_field)
end)
end
defp select_schema_by_key(
user_id,
schema_name,
columns,
key_column \\ :user_id,
id_field \\ :id
) do
table_name = schema_name.__schema__(:source)
columns = [:created_at] ++ columns
{"#{table_name}.jsonl",
schema_name
|> where([s], field(s, ^key_column) == ^user_id)
|> select([s], map(s, ^columns))
|> Batch.records(id_field: id_field)
|> results_as_json_lines()}
end
defp results_as_json_lines(list_of_maps) do
Stream.map(list_of_maps, fn map ->
map
|> Map.new(fn {k, v} -> {k, to_string(v)} end)
|> Jason.encode!()
|> Kernel.<>("\n")
end)
end
end

View file

@ -0,0 +1,56 @@
defmodule Philomena.DataExports.ZipGenerator do
@moduledoc """
ZIP file generator for an export.
"""
alias Philomena.Native
@doc """
Write the ZIP file for the given aggregate data.
Expects a list of 2-tuples, with the first element being the name of the
file to generate, and the second element being a stream which generates the
binary contents of the file.
"""
@spec generate(Path.t(), Enumerable.t()) :: :ok | atom()
def generate(filename, aggregate) do
case Native.zip_open_writer(filename) do
{:ok, zip} ->
stream_aggregate(zip, aggregate)
error ->
error
end
end
@spec stream_aggregate(reference(), Enumerable.t()) :: {:ok, reference()} | :error
defp stream_aggregate(zip, aggregate) do
aggregate
|> Enum.reduce_while(:ok, fn {name, content_stream}, _ ->
with :ok <- Native.zip_start_file(zip, name),
:ok <- stream_file_data(zip, content_stream) do
{:cont, :ok}
else
error ->
{:halt, error}
end
end)
|> case do
:ok ->
Native.zip_finish(zip)
error ->
error
end
end
@spec stream_file_data(reference(), Enumerable.t(iodata())) :: :ok | :error
defp stream_file_data(zip, content_stream) do
Enum.reduce_while(content_stream, :ok, fn iodata, _ ->
case Native.zip_write(zip, IO.iodata_to_binary(iodata)) do
:ok -> {:cont, :ok}
error -> {:halt, error}
end
end)
end
end

View file

@ -11,4 +11,16 @@ defmodule Philomena.Native do
@spec camo_image_url(String.t()) :: String.t() @spec camo_image_url(String.t()) :: String.t()
def camo_image_url(_uri), do: :erlang.nif_error(:nif_not_loaded) def camo_image_url(_uri), do: :erlang.nif_error(:nif_not_loaded)
@spec zip_open_writer(Path.t()) :: {:ok, reference()} | {:error, atom()}
def zip_open_writer(_path), do: :erlang.nif_error(:nif_not_loaded)
@spec zip_start_file(reference(), String.t()) :: :ok | :error
def zip_start_file(_zip, _name), do: :erlang.nif_error(:nif_not_loaded)
@spec zip_write(reference(), binary()) :: :ok | :error
def zip_write(_zip, _data), do: :erlang.nif_error(:nif_not_loaded)
@spec zip_finish(reference()) :: :ok | :error
def zip_finish(_zip), do: :erlang.nif_error(:nif_not_loaded)
end end

View file

@ -125,8 +125,9 @@ defmodule PhilomenaQuery.Batch do
batch_size = Keyword.get(opts, :batch_size, 1000) batch_size = Keyword.get(opts, :batch_size, 1000)
queryable queryable
|> exclude(:preload)
|> exclude(:order_by) |> exclude(:order_by)
|> exclude(:preload)
|> exclude(:select)
|> order_by(asc: ^id_field) |> order_by(asc: ^id_field)
|> where([m], field(m, ^id_field) > ^max_id) |> where([m], field(m, ^id_field) > ^max_id)
|> select([m], field(m, ^id_field)) |> select([m], field(m, ^id_field))

View file

@ -2,6 +2,12 @@
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3 version = 3
[[package]]
name = "adler2"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]] [[package]]
name = "aho-corasick" name = "aho-corasick"
version = "1.1.3" version = "1.1.3"
@ -11,6 +17,15 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "arbitrary"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223"
dependencies = [
"derive_arbitrary",
]
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.21.7" version = "0.21.7"
@ -57,6 +72,21 @@ dependencies = [
"unicode_categories", "unicode_categories",
] ]
[[package]]
name = "crc32fast"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
[[package]] [[package]]
name = "darling" name = "darling"
version = "0.20.9" version = "0.20.9"
@ -92,6 +122,17 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "derive_arbitrary"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "derive_builder" name = "derive_builder"
version = "0.20.0" version = "0.20.0"
@ -129,12 +170,39 @@ version = "1.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "322ef0094744e63628e6f0eb2295517f79276a5b342a4c2ff3042566ca181d4e" checksum = "322ef0094744e63628e6f0eb2295517f79276a5b342a4c2ff3042566ca181d4e"
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "entities" name = "entities"
version = "1.0.1" version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca" checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca"
[[package]]
name = "equivalent"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "flate2"
version = "1.0.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]] [[package]]
name = "fnv" name = "fnv"
version = "1.0.7" version = "1.0.7"
@ -150,6 +218,12 @@ dependencies = [
"percent-encoding", "percent-encoding",
] ]
[[package]]
name = "hashbrown"
version = "0.15.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3"
[[package]] [[package]]
name = "heck" name = "heck"
version = "0.4.1" version = "0.4.1"
@ -183,6 +257,16 @@ dependencies = [
"unicode-normalization", "unicode-normalization",
] ]
[[package]]
name = "indexmap"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
dependencies = [
"equivalent",
"hashbrown",
]
[[package]] [[package]]
name = "itoa" name = "itoa"
version = "1.0.11" version = "1.0.11"
@ -230,6 +314,12 @@ version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "lockfree-object-pool"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e"
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.21" version = "0.4.21"
@ -238,9 +328,18 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.7.2" version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "miniz_oxide"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1"
dependencies = [
"adler2",
]
[[package]] [[package]]
name = "once_cell" name = "once_cell"
@ -266,13 +365,14 @@ dependencies = [
"ring", "ring",
"rustler", "rustler",
"url", "url",
"zip",
] ]
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.80" version = "1.0.89"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a56dea16b0a29e94408b9aa5e2940a4eedbd128a1ba20e8f7ae60fd3d465af0e" checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
dependencies = [ dependencies = [
"unicode-ident", "unicode-ident",
] ]
@ -363,6 +463,12 @@ dependencies = [
"unreachable", "unreachable",
] ]
[[package]]
name = "simd-adler32"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
[[package]] [[package]]
name = "slug" name = "slug"
version = "0.1.5" version = "0.1.5"
@ -387,15 +493,35 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.59" version = "2.0.87"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a" checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "thiserror"
version = "1.0.68"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02dd99dc800bbb97186339685293e1cc5d9df1f8fae2d0aecd9ff1c77efea892"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.68"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7c61ec9a6f64d2793d8a45faba21efbe3ced62a886d44c36a009b2b519b4c7e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "tinyvec" name = "tinyvec"
version = "1.6.0" version = "1.6.0"
@ -561,3 +687,34 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0" version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "zip"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc5e4288ea4057ae23afc69a4472434a87a2495cafce6632fd1c4ec9f5cf3494"
dependencies = [
"arbitrary",
"crc32fast",
"crossbeam-utils",
"displaydoc",
"flate2",
"indexmap",
"memchr",
"thiserror",
"zopfli",
]
[[package]]
name = "zopfli"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5019f391bac5cf252e93bbcc53d039ffd62c7bfb7c150414d61369afe57e946"
dependencies = [
"bumpalo",
"crc32fast",
"lockfree-object-pool",
"log",
"once_cell",
"simd-adler32",
]

View file

@ -18,6 +18,7 @@ regex = "1"
ring = "0.16" ring = "0.16"
rustler = "0.28" rustler = "0.28"
url = "2.3" url = "2.3"
zip = { version = "2.2.0", features = ["deflate"], default-features = false }
[profile.release] [profile.release]
opt-level = 3 opt-level = 3

View file

@ -1,16 +1,28 @@
use jemallocator::Jemalloc; use jemallocator::Jemalloc;
use rustler::{Atom, Binary, Env, Term};
use std::collections::HashMap; use std::collections::HashMap;
mod camo; mod camo;
mod domains; mod domains;
mod markdown; mod markdown;
mod zip;
#[global_allocator] #[global_allocator]
static GLOBAL: Jemalloc = Jemalloc; static GLOBAL: Jemalloc = Jemalloc;
rustler::init! { rustler::init! {
"Elixir.Philomena.Native", "Elixir.Philomena.Native",
[markdown_to_html, markdown_to_html_unsafe, camo_image_url] [
markdown_to_html, markdown_to_html_unsafe, camo_image_url,
zip_open_writer, zip_start_file, zip_write, zip_finish
],
load = load
}
// Setup.
fn load(env: Env, arg: Term) -> bool {
zip::load(env, arg)
} }
// Markdown NIF wrappers. // Markdown NIF wrappers.
@ -31,3 +43,25 @@ fn markdown_to_html_unsafe(input: &str, reps: HashMap<String, String>) -> String
fn camo_image_url(input: &str) -> String { fn camo_image_url(input: &str) -> String {
camo::image_url_careful(input) camo::image_url_careful(input)
} }
// Zip NIF wrappers.
#[rustler::nif]
fn zip_open_writer(path: &str) -> Result<zip::WriterResourceArc, Atom> {
zip::open_writer(path)
}
#[rustler::nif]
fn zip_start_file(writer: zip::WriterResourceArc, name: &str) -> Atom {
zip::start_file(writer, name)
}
#[rustler::nif(schedule = "DirtyCpu")]
fn zip_write(writer: zip::WriterResourceArc, data: Binary) -> Atom {
zip::write(writer, data.as_slice())
}
#[rustler::nif(schedule = "DirtyCpu")]
fn zip_finish(writer: zip::WriterResourceArc) -> Atom {
zip::finish(writer)
}

View file

@ -0,0 +1,69 @@
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::sync::Mutex;
use rustler::{Atom, Env, ResourceArc, Term};
use zip::{write::SimpleFileOptions, CompressionMethod, ZipWriter};
mod atoms {
rustler::atoms! {
ok,
error,
}
}
pub struct WriterResource {
inner: Mutex<Option<ZipWriter<File>>>,
}
pub type WriterResourceArc = ResourceArc<WriterResource>;
pub fn load(env: Env, _: Term) -> bool {
rustler::resource!(WriterResource, env);
true
}
fn with_writer<F, T>(writer: WriterResourceArc, f: F) -> Atom
where
F: FnOnce(&mut Option<ZipWriter<File>>) -> Option<T>,
{
let mut guard = match (*writer).inner.lock() {
Ok(g) => g,
Err(_) => return atoms::error(),
};
match f(&mut guard) {
Some(_) => atoms::ok(),
None => atoms::error(),
}
}
pub fn open_writer(path: &str) -> Result<WriterResourceArc, Atom> {
match OpenOptions::new()
.create(true)
.truncate(true)
.write(true)
.open(path)
{
Ok(file) => Ok(ResourceArc::new(WriterResource {
inner: Mutex::new(Some(ZipWriter::new(file))),
})),
Err(_) => Err(atoms::error()),
}
}
pub fn start_file(writer: WriterResourceArc, name: &str) -> Atom {
let options = SimpleFileOptions::default().compression_method(CompressionMethod::Deflated);
with_writer(writer, move |writer| {
writer.as_mut()?.start_file(name, options).ok()
})
}
pub fn write(writer: WriterResourceArc, data: &[u8]) -> Atom {
with_writer(writer, move |writer| writer.as_mut()?.write(data).ok())
}
pub fn finish(writer: WriterResourceArc) -> Atom {
with_writer(writer, move |writer| writer.take().map(|w| w.finish().ok()))
}