From 1de4a7b82011ec73a38d939710e0e1750a424000 Mon Sep 17 00:00:00 2001 From: Liam Date: Sun, 3 Nov 2024 19:17:01 -0500 Subject: [PATCH] Add framework for zip data export --- lib/philomena/data_exports/aggregator.ex | 171 ++++++++++++++++++++ lib/philomena/data_exports/zip_generator.ex | 56 +++++++ lib/philomena/native.ex | 12 ++ lib/philomena_query/batch.ex | 3 +- native/philomena/Cargo.lock | 169 ++++++++++++++++++- native/philomena/Cargo.toml | 1 + native/philomena/src/lib.rs | 36 ++++- native/philomena/src/zip.rs | 69 ++++++++ 8 files changed, 509 insertions(+), 8 deletions(-) create mode 100644 lib/philomena/data_exports/aggregator.ex create mode 100644 lib/philomena/data_exports/zip_generator.ex create mode 100644 native/philomena/src/zip.rs diff --git a/lib/philomena/data_exports/aggregator.ex b/lib/philomena/data_exports/aggregator.ex new file mode 100644 index 00000000..5dcb82b3 --- /dev/null +++ b/lib/philomena/data_exports/aggregator.ex @@ -0,0 +1,171 @@ +defmodule Philomena.DataExports.Aggregator do + @moduledoc """ + Data generation module for data export logic. + """ + + import Ecto.Query + alias PhilomenaQuery.Batch + + # Direct PII + alias Philomena.Donations.Donation + alias Philomena.UserFingerprints.UserFingerprint + alias Philomena.UserIps.UserIp + alias Philomena.UserNameChanges.UserNameChange + alias Philomena.Users.User + + # UGC for export + alias Philomena.ArtistLinks.ArtistLink + alias Philomena.Badges.Award + alias Philomena.Comments.Comment + alias Philomena.Commissions.Commission + alias Philomena.DnpEntries.DnpEntry + alias Philomena.DuplicateReports.DuplicateReport + alias Philomena.Filters.Filter + alias Philomena.ImageFaves.ImageFave + alias Philomena.ImageHides.ImageHide + alias Philomena.ImageVotes.ImageVote + alias Philomena.Images.Image + alias Philomena.PollVotes.PollVote + alias Philomena.Posts.Post + alias Philomena.Reports.Report + alias Philomena.SourceChanges.SourceChange + alias Philomena.TagChanges.TagChange + alias Philomena.Topics.Topic + alias Philomena.Bans.User, as: UserBan + + # Direct UGC from form submission + @user_columns [ + :created_at, + :name, + :email, + :description, + :current_filter_id, + :spoiler_type, + :theme, + :images_per_page, + :show_large_thumbnails, + :show_sidebar_and_watched_images, + :fancy_tag_field_on_upload, + :fancy_tag_field_on_edit, + :fancy_tag_field_in_settings, + :autorefresh_by_default, + :anonymous_by_default, + :comments_newest_first, + :comments_always_jump_to_last, + :comments_per_page, + :watch_on_reply, + :watch_on_new_topic, + :watch_on_upload, + :messages_newest_first, + :serve_webm, + :no_spoilered_in_watched, + :watched_images_query_str, + :watched_images_exclude_str, + :use_centered_layout, + :personal_title, + :hide_vote_counts, + :scale_large_images + ] + + # All these also have created_at and are selected by user_id + @indirect_columns [ + {Donation, [:email, :amount, :fee, :note]}, + {UserFingerprint, [:fingerprint, :uses, :updated_at]}, + {UserIp, [:ip, :uses, :updated_at]}, + {UserNameChange, [:name]}, + {ArtistLink, [:aasm_state, :uri, :public, :tag_id]}, + {Award, [:label, :badge_name, :badge_id]}, + {Comment, + [ + :ip, + :fingerprint, + :user_agent, + :referrer, + :anonymous, + :image_id, + :edited_at, + :edit_reason, + :body + ]}, + {Commission, + [:open, :sheet_image_id, :categories, :information, :contact, :will_create, :will_not_create]}, + {DnpEntry, [:tag_id, :aasm_state, :dnp_type, :hide_reason, :feedback, :reason, :instructions], + :requesting_user_id}, + {DuplicateReport, [:reason, :image_id, :duplicate_of_image_id]}, + {Filter, + [ + :name, + :description, + :public, + :hidden_complex_str, + :spoilered_complex_str, + :hidden_tag_ids, + :spoilered_tag_ids + ]}, + {ImageFave, [:image_id], :user_id, :image_id}, + {ImageHide, [:image_id], :user_id, :image_id}, + {ImageVote, [:image_id, :up], :user_id, :image_id}, + {Image, [:ip, :fingerprint, :user_agent, :referrer, :anonymous, :description]}, + {PollVote, [:rank, :poll_option_id]}, + {Post, + [:ip, :fingerprint, :user_agent, :referrer, :anonymous, :edited_at, :edit_reason, :body]}, + {Report, + [:ip, :fingerprint, :user_agent, :referrer, :reason, :reportable_id, :reportable_type]}, + {SourceChange, [:ip, :fingerprint, :user_agent, :referrer, :image_id, :added, :value]}, + {TagChange, + [:ip, :fingerprint, :user_agent, :referrer, :image_id, :added, :tag_id, :tag_name_cache]}, + {Topic, [:title, :anonymous, :forum_id]}, + {UserBan, [:reason, :generated_ban_id]} + ] + + @doc """ + Get all of the export data for the given user. + """ + def get_for_user(user_id) do + [select_user(user_id)] ++ select_indirect(user_id) + end + + defp select_user(user_id) do + select_schema_by_key(user_id, User, @user_columns, :id) + end + + defp select_indirect(user_id) do + Enum.map(@indirect_columns, fn + {schema_name, columns} -> + select_schema_by_key(user_id, schema_name, columns) + + {schema_name, columns, key_column} -> + select_schema_by_key(user_id, schema_name, columns, key_column) + + {schema_name, columns, key_column, id_field} -> + select_schema_by_key(user_id, schema_name, columns, key_column, id_field) + end) + end + + defp select_schema_by_key( + user_id, + schema_name, + columns, + key_column \\ :user_id, + id_field \\ :id + ) do + table_name = schema_name.__schema__(:source) + columns = [:created_at] ++ columns + + {"#{table_name}.jsonl", + schema_name + |> where([s], field(s, ^key_column) == ^user_id) + |> select([s], map(s, ^columns)) + |> Batch.records(id_field: id_field) + |> results_as_json_lines()} + end + + defp results_as_json_lines(list_of_maps) do + Stream.map(list_of_maps, fn map -> + map + |> Map.new(fn {k, v} -> {k, to_string(v)} end) + |> Jason.encode!() + |> Kernel.<>("\n") + end) + end +end diff --git a/lib/philomena/data_exports/zip_generator.ex b/lib/philomena/data_exports/zip_generator.ex new file mode 100644 index 00000000..8c5aaf7e --- /dev/null +++ b/lib/philomena/data_exports/zip_generator.ex @@ -0,0 +1,56 @@ +defmodule Philomena.DataExports.ZipGenerator do + @moduledoc """ + ZIP file generator for an export. + """ + + alias Philomena.Native + + @doc """ + Write the ZIP file for the given aggregate data. + + Expects a list of 2-tuples, with the first element being the name of the + file to generate, and the second element being a stream which generates the + binary contents of the file. + """ + @spec generate(Path.t(), Enumerable.t()) :: :ok | atom() + def generate(filename, aggregate) do + case Native.zip_open_writer(filename) do + {:ok, zip} -> + stream_aggregate(zip, aggregate) + + error -> + error + end + end + + @spec stream_aggregate(reference(), Enumerable.t()) :: {:ok, reference()} | :error + defp stream_aggregate(zip, aggregate) do + aggregate + |> Enum.reduce_while(:ok, fn {name, content_stream}, _ -> + with :ok <- Native.zip_start_file(zip, name), + :ok <- stream_file_data(zip, content_stream) do + {:cont, :ok} + else + error -> + {:halt, error} + end + end) + |> case do + :ok -> + Native.zip_finish(zip) + + error -> + error + end + end + + @spec stream_file_data(reference(), Enumerable.t(iodata())) :: :ok | :error + defp stream_file_data(zip, content_stream) do + Enum.reduce_while(content_stream, :ok, fn iodata, _ -> + case Native.zip_write(zip, IO.iodata_to_binary(iodata)) do + :ok -> {:cont, :ok} + error -> {:halt, error} + end + end) + end +end diff --git a/lib/philomena/native.ex b/lib/philomena/native.ex index a67dc33f..14eeaa17 100644 --- a/lib/philomena/native.ex +++ b/lib/philomena/native.ex @@ -11,4 +11,16 @@ defmodule Philomena.Native do @spec camo_image_url(String.t()) :: String.t() def camo_image_url(_uri), do: :erlang.nif_error(:nif_not_loaded) + + @spec zip_open_writer(Path.t()) :: {:ok, reference()} | {:error, atom()} + def zip_open_writer(_path), do: :erlang.nif_error(:nif_not_loaded) + + @spec zip_start_file(reference(), String.t()) :: :ok | :error + def zip_start_file(_zip, _name), do: :erlang.nif_error(:nif_not_loaded) + + @spec zip_write(reference(), binary()) :: :ok | :error + def zip_write(_zip, _data), do: :erlang.nif_error(:nif_not_loaded) + + @spec zip_finish(reference()) :: :ok | :error + def zip_finish(_zip), do: :erlang.nif_error(:nif_not_loaded) end diff --git a/lib/philomena_query/batch.ex b/lib/philomena_query/batch.ex index 20bdc364..bb9c2c74 100644 --- a/lib/philomena_query/batch.ex +++ b/lib/philomena_query/batch.ex @@ -125,8 +125,9 @@ defmodule PhilomenaQuery.Batch do batch_size = Keyword.get(opts, :batch_size, 1000) queryable - |> exclude(:preload) |> exclude(:order_by) + |> exclude(:preload) + |> exclude(:select) |> order_by(asc: ^id_field) |> where([m], field(m, ^id_field) > ^max_id) |> select([m], field(m, ^id_field)) diff --git a/native/philomena/Cargo.lock b/native/philomena/Cargo.lock index 495a6815..623227a1 100644 --- a/native/philomena/Cargo.lock +++ b/native/philomena/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "aho-corasick" version = "1.1.3" @@ -11,6 +17,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "arbitrary" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "base64" version = "0.21.7" @@ -57,6 +72,21 @@ dependencies = [ "unicode_categories", ] +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + [[package]] name = "darling" version = "0.20.9" @@ -92,6 +122,17 @@ dependencies = [ "syn", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "derive_builder" version = "0.20.0" @@ -129,12 +170,39 @@ version = "1.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "322ef0094744e63628e6f0eb2295517f79276a5b342a4c2ff3042566ca181d4e" +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "entities" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca" +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "flate2" +version = "1.0.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -150,6 +218,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "hashbrown" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" + [[package]] name = "heck" version = "0.4.1" @@ -183,6 +257,16 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indexmap" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "itoa" version = "1.0.11" @@ -230,6 +314,12 @@ version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +[[package]] +name = "lockfree-object-pool" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e" + [[package]] name = "log" version = "0.4.21" @@ -238,9 +328,18 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] [[package]] name = "once_cell" @@ -266,13 +365,14 @@ dependencies = [ "ring", "rustler", "url", + "zip", ] [[package]] name = "proc-macro2" -version = "1.0.80" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56dea16b0a29e94408b9aa5e2940a4eedbd128a1ba20e8f7ae60fd3d465af0e" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" dependencies = [ "unicode-ident", ] @@ -363,6 +463,12 @@ dependencies = [ "unreachable", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "slug" version = "0.1.5" @@ -387,15 +493,35 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.59" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a6531ffc7b071655e4ce2e04bd464c4830bb585a61cabb96cf808f05172615a" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "thiserror" +version = "1.0.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02dd99dc800bbb97186339685293e1cc5d9df1f8fae2d0aecd9ff1c77efea892" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7c61ec9a6f64d2793d8a45faba21efbe3ced62a886d44c36a009b2b519b4c7e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -561,3 +687,34 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "zip" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc5e4288ea4057ae23afc69a4472434a87a2495cafce6632fd1c4ec9f5cf3494" +dependencies = [ + "arbitrary", + "crc32fast", + "crossbeam-utils", + "displaydoc", + "flate2", + "indexmap", + "memchr", + "thiserror", + "zopfli", +] + +[[package]] +name = "zopfli" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5019f391bac5cf252e93bbcc53d039ffd62c7bfb7c150414d61369afe57e946" +dependencies = [ + "bumpalo", + "crc32fast", + "lockfree-object-pool", + "log", + "once_cell", + "simd-adler32", +] diff --git a/native/philomena/Cargo.toml b/native/philomena/Cargo.toml index 20cb3b9b..9ef7caac 100644 --- a/native/philomena/Cargo.toml +++ b/native/philomena/Cargo.toml @@ -18,6 +18,7 @@ regex = "1" ring = "0.16" rustler = "0.28" url = "2.3" +zip = { version = "2.2.0", features = ["deflate"], default-features = false } [profile.release] opt-level = 3 diff --git a/native/philomena/src/lib.rs b/native/philomena/src/lib.rs index bcbadcb6..6c8f96f7 100644 --- a/native/philomena/src/lib.rs +++ b/native/philomena/src/lib.rs @@ -1,16 +1,28 @@ use jemallocator::Jemalloc; +use rustler::{Atom, Binary, Env, Term}; use std::collections::HashMap; mod camo; mod domains; mod markdown; +mod zip; #[global_allocator] static GLOBAL: Jemalloc = Jemalloc; rustler::init! { "Elixir.Philomena.Native", - [markdown_to_html, markdown_to_html_unsafe, camo_image_url] + [ + markdown_to_html, markdown_to_html_unsafe, camo_image_url, + zip_open_writer, zip_start_file, zip_write, zip_finish + ], + load = load +} + +// Setup. + +fn load(env: Env, arg: Term) -> bool { + zip::load(env, arg) } // Markdown NIF wrappers. @@ -31,3 +43,25 @@ fn markdown_to_html_unsafe(input: &str, reps: HashMap) -> String fn camo_image_url(input: &str) -> String { camo::image_url_careful(input) } + +// Zip NIF wrappers. + +#[rustler::nif] +fn zip_open_writer(path: &str) -> Result { + zip::open_writer(path) +} + +#[rustler::nif] +fn zip_start_file(writer: zip::WriterResourceArc, name: &str) -> Atom { + zip::start_file(writer, name) +} + +#[rustler::nif(schedule = "DirtyCpu")] +fn zip_write(writer: zip::WriterResourceArc, data: Binary) -> Atom { + zip::write(writer, data.as_slice()) +} + +#[rustler::nif(schedule = "DirtyCpu")] +fn zip_finish(writer: zip::WriterResourceArc) -> Atom { + zip::finish(writer) +} diff --git a/native/philomena/src/zip.rs b/native/philomena/src/zip.rs new file mode 100644 index 00000000..60ef15be --- /dev/null +++ b/native/philomena/src/zip.rs @@ -0,0 +1,69 @@ +use std::fs::{File, OpenOptions}; +use std::io::Write; +use std::sync::Mutex; + +use rustler::{Atom, Env, ResourceArc, Term}; +use zip::{write::SimpleFileOptions, CompressionMethod, ZipWriter}; + +mod atoms { + rustler::atoms! { + ok, + error, + } +} + +pub struct WriterResource { + inner: Mutex>>, +} + +pub type WriterResourceArc = ResourceArc; + +pub fn load(env: Env, _: Term) -> bool { + rustler::resource!(WriterResource, env); + true +} + +fn with_writer(writer: WriterResourceArc, f: F) -> Atom +where + F: FnOnce(&mut Option>) -> Option, +{ + let mut guard = match (*writer).inner.lock() { + Ok(g) => g, + Err(_) => return atoms::error(), + }; + + match f(&mut guard) { + Some(_) => atoms::ok(), + None => atoms::error(), + } +} + +pub fn open_writer(path: &str) -> Result { + match OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(path) + { + Ok(file) => Ok(ResourceArc::new(WriterResource { + inner: Mutex::new(Some(ZipWriter::new(file))), + })), + Err(_) => Err(atoms::error()), + } +} + +pub fn start_file(writer: WriterResourceArc, name: &str) -> Atom { + let options = SimpleFileOptions::default().compression_method(CompressionMethod::Deflated); + + with_writer(writer, move |writer| { + writer.as_mut()?.start_file(name, options).ok() + }) +} + +pub fn write(writer: WriterResourceArc, data: &[u8]) -> Atom { + with_writer(writer, move |writer| writer.as_mut()?.write(data).ok()) +} + +pub fn finish(writer: WriterResourceArc) -> Atom { + with_writer(writer, move |writer| writer.take().map(|w| w.finish().ok())) +}