From dadc2f15857e889b70c6f0863095eebf48296240 Mon Sep 17 00:00:00 2001
From: liamwhite
Date: Sun, 26 Dec 2021 19:16:21 -0500
Subject: [PATCH] Local autocomplete (#151)

---
Review notes (this area is ignored when the patch is applied):
 * getTagFromLocation now starts reading associations one byte later, past
   the association_length byte described in lib/philomena/autocomplete.ex.
 * topK starts its binary search at -1 so the first tag in the file can match.
 * Line breaks and the Elixir bitstring literals were restored; the blob ids
   on the "index" lines still describe the original commit's contents.

 assets/js/autocomplete.js                     |  28 ++++
 assets/js/utils/local-autocompleter.js        | 134 ++++++++++++++++
 docker/app/run-development                    |   1 +
 lib/philomena/autocomplete.ex                 | 144 ++++++++++++++++++
 lib/philomena/autocomplete/autocomplete.ex    |  17 +++
 lib/philomena/release.ex                      |   5 +
 .../autocomplete/compiled_controller.ex       |  23 +++
 lib/philomena_web/router.ex                   |   1 +
 .../20211219194836_create_autocomplete.exs    |  10 ++
 priv/repo/structure.sql                       |  11 ++
 10 files changed, 374 insertions(+)
 create mode 100644 assets/js/utils/local-autocompleter.js
 create mode 100644 lib/philomena/autocomplete.ex
 create mode 100644 lib/philomena/autocomplete/autocomplete.ex
 create mode 100644 lib/philomena_web/controllers/autocomplete/compiled_controller.ex
 create mode 100644 priv/repo/migrations/20211219194836_create_autocomplete.exs

diff --git a/assets/js/autocomplete.js b/assets/js/autocomplete.js
index ff68ead1..4bb3f4a9 100644
--- a/assets/js/autocomplete.js
+++ b/assets/js/autocomplete.js
@@ -2,6 +2,9 @@
  * Autocomplete.
  */
 
+import { LocalAutocompleter } from 'utils/local-autocompleter';
+import { handleError } from 'utils/requests';
+
 const cache = {};
 let inputField, originalTerm;
 
@@ -122,8 +125,23 @@ function getSuggestions(term) {
 function listenAutocomplete() {
   let timeout;
 
+  /** @type {LocalAutocompleter} */
+  let localAc = null;
+  let localFetched = false;
+
+  document.addEventListener('focusin', fetchLocalAutocomplete);
+
   document.addEventListener('input', event => {
     removeParent();
+    fetchLocalAutocomplete(event);
+
+    if (localAc !== null && 'ac' in event.target.dataset) {
+      inputField = event.target;
+      originalTerm = inputField.value;
+
+      const suggestions = localAc.topK(inputField.value, 5).map(({ name, imageCount }) => ({ label: `${name} (${imageCount})`, value: name }));
+      return showAutocomplete(suggestions, originalTerm, event.target);
+    }
 
     window.clearTimeout(timeout);
     // Use a timeout to delay requests until the user has stopped typing
@@ -150,6 +168,16 @@ function listenAutocomplete() {
   document.addEventListener('click', event => {
     if (event.target && event.target !== inputField) removeParent();
   });
+
+  function fetchLocalAutocomplete(event) {
+    if (!localFetched && event.target.dataset && 'ac' in event.target.dataset) {
+      localFetched = true;
+      fetch('/autocomplete/compiled', { credentials: 'omit', cache: 'force-cache' })
+        .then(handleError)
+        .then(resp => resp.arrayBuffer())
+        .then(buf => localAc = new LocalAutocompleter(buf));
+    }
+  }
 }
 
 export { listenAutocomplete };
diff --git a/assets/js/utils/local-autocompleter.js b/assets/js/utils/local-autocompleter.js
new file mode 100644
index 00000000..cbf290fb
--- /dev/null
+++ b/assets/js/utils/local-autocompleter.js
@@ -0,0 +1,134 @@
+//@ts-check
+/*
+ * Client-side tag completion.
+ */
+
+/**
+ * @typedef {object} Result
+ * @property {string} name
+ * @property {number} imageCount
+ * @property {number[]} associations
+ */
+
+/**
+ * See lib/philomena/autocomplete.ex for binary structure details.
+ *
+ * A binary blob is used to avoid the creation of large amounts of garbage on
+ * the JS heap and speed up the execution of the search.
+ */
+export class LocalAutocompleter {
+  /**
+   * Build a new local autocompleter.
+   *
+   * @param {ArrayBuffer} backingStore
+   */
+  constructor(backingStore) {
+    /** @type {Uint8Array} */
+    this.data = new Uint8Array(backingStore);
+    /** @type {DataView} */
+    this.view = new DataView(backingStore);
+    /** @type {TextDecoder} */
+    this.decoder = new TextDecoder();
+    /** @type {number} */
+    this.numTags = this.view.getUint32(backingStore.byteLength - 4, true);
+    /** @type {number} */
+    this.referenceStart = this.view.getUint32(backingStore.byteLength - 8, true);
+    /** @type {number} */
+    this.formatVersion = this.view.getUint32(backingStore.byteLength - 12, true);
+
+    if (this.formatVersion !== 1) {
+      throw new Error('Incompatible autocomplete format version');
+    }
+  }
+
+  /**
+   * Get a tag's name and its associations given a byte location inside the file.
+   *
+   * @param {number} location
+   * @returns {[string, number[]]}
+   */
+  getTagFromLocation(location) {
+    const nameLength = this.view.getUint8(location);
+    const assnLength = this.view.getUint8(location + 1 + nameLength);
+
+    /** @type {number[]} */
+    const associations = [];
+    const name = this.decoder.decode(this.data.slice(location + 1, location + nameLength + 1));
+    // Associations start after the association_length byte that follows the name
+    for (let i = 0; i < assnLength; i++) {
+      associations.push(this.view.getUint32(location + 1 + nameLength + 1 + i * 4, true));
+    }
+
+    return [ name, associations ];
+  }
+
+  /**
+   * Get a Result object as the ith tag inside the file.
+   *
+   * @param {number} i
+   * @returns {Result}
+   */
+  getResultAt(i) {
+    const nameLocation = this.view.getUint32(this.referenceStart + i * 8, true);
+    const imageCount = this.view.getUint32(this.referenceStart + i * 8 + 4, true);
+    const [ name, associations ] = this.getTagFromLocation(nameLocation);
+
+    return { name, imageCount, associations };
+  }
+
+  /**
+   * Find the top k results by image count which match the given string prefix.
+   *
+   * @param {string} prefix
+   * @param {number} k
+   * @returns {Result[]}
+   */
+  topK(prefix, k) {
+    /** @type {Result[]} */
+    const results = [];
+
+    /** @type {number[]} */
+    //@ts-expect-error No type for window.booru yet
+    const hiddenTags = window.booru.hiddenTagList;
+
+    if (prefix === '') {
+      return results;
+    }
+
+    // Binary search for the last tag ordered before prefix (-1 when there is none)
+    let l = -1;
+    let r = this.numTags;
+
+    while (l < r - 1) {
+      const m = (l + (r - l) / 2) | 0;
+      const { name } = this.getResultAt(m);
+
+      if (name.slice(0, prefix.length) >= prefix) {
+        // too large, go left
+        r = m;
+      }
+      else {
+        // too small, go right
+        l = m;
+      }
+    }
+
+    // Scan forward until no more matches occur
+    while (l < this.numTags - 1) {
+      const result = this.getResultAt(++l);
+      if (!result.name.startsWith(prefix)) {
+        break;
+      }
+
+      // Add if no associations are filtered
+      if (!hiddenTags.some(ht => result.associations.includes(ht))) {
+        results.push(result);
+      }
+    }
+
+    // Sort results by image count
+    results.sort((a, b) => b.imageCount - a.imageCount);
+
+    return results.slice(0, k);
+  }
+}
diff --git a/docker/app/run-development b/docker/app/run-development
index 8f1440cd..dc3e7009 100755
--- a/docker/app/run-development
+++ b/docker/app/run-development
@@ -10,6 +10,7 @@ background() {
     mix run -e 'Philomena.Release.verify_artist_links()'
     mix run -e 'Philomena.Release.update_stats()'
     mix run -e 'Philomena.Release.clean_moderation_logs()'
+    mix run -e 'Philomena.Release.generate_autocomplete()'
 
     sleep 300
   done
diff --git a/lib/philomena/autocomplete.ex b/lib/philomena/autocomplete.ex
new file mode 100644
index 00000000..dd6f26f4
--- /dev/null
+++ b/lib/philomena/autocomplete.ex
@@ -0,0 +1,144 @@
+defmodule Philomena.Autocomplete do
+  @moduledoc """
+  Pregenerated autocomplete files.
+  """
+
+  import Ecto.Query, warn: false
+  alias Philomena.Repo
+
+  alias Philomena.Tags.Tag
+  alias Philomena.Images.Tagging
+  alias Philomena.Autocomplete.Autocomplete
+
+  @type tags_list() :: [{String.t(), number(), number()}]
+  @type assoc_map() :: %{String.t() => [number()]}
+
+  @spec get_autocomplete() :: Autocomplete.t() | nil
+  def get_autocomplete do
+    Autocomplete
+    |> order_by(desc: :created_at)
+    |> limit(1)
+    |> Repo.one()
+  end
+
+  def generate_autocomplete! do
+    tags = get_tags()
+    associations = get_associations(tags)
+
+    # Tags are already sorted, so just add them to the file directly
+    #
+    # struct tag {
+    #   uint8_t key_length;
+    #   uint8_t key[];
+    #   uint8_t association_length;
+    #   uint32_t associations[];
+    # };
+    #
+    # struct tag_reference {
+    #   uint32_t tag_location;
+    #   uint32_t num_uses;
+    # };
+    #
+
+    {ac_file, references} =
+      Enum.reduce(tags, {<<>>, <<>>}, fn {name, images_count, _}, {file, references} ->
+        pos = byte_size(file)
+        assn = Map.get(associations, name, [])
+        assn_bin = for id <- assn, into: <<>>, do: <<id::32-little>>
+
+        {
+          <<file::binary, byte_size(name)::8, name::binary, length(assn)::8, assn_bin::binary>>,
+          <<references::binary, pos::32-little, images_count::32-little>>
+        }
+      end)
+
+    ac_file = int32_align(ac_file)
+    reference_start = byte_size(ac_file)
+
+    # Finally add the reference start and number of tags in the footer
+    #
+    # struct autocomplete_file {
+    #   struct tag tags[];
+    #   struct tag_reference references[];
+    #   uint32_t format_version;
+    #   uint32_t reference_start;
+    #   uint32_t num_tags;
+    # };
+    #
+
+    ac_file =
+      <<ac_file::binary, references::binary, 1::32-little, reference_start::32-little,
+        length(tags)::32-little>>
+
+    # Insert the autocomplete binary
+    new_ac =
+      %Autocomplete{}
+      |> Autocomplete.changeset(%{content: ac_file})
+      |> Repo.insert!()
+
+    # Remove anything older
+    Autocomplete
+    |> where([ac], ac.created_at < ^new_ac.created_at)
+    |> Repo.delete_all()
+  end
+
+  #
+  # Get the names of tags and their number of uses as a map.
+  # Sort is done in the application to avoid collation.
+  #
+  @spec get_tags() :: tags_list()
+  defp get_tags do
+    Tag
+    |> select([t], {t.name, t.images_count, t.id})
+    |> where([t], t.images_count > 0)
+    |> order_by(desc: :images_count)
+    |> limit(65_535)
+    |> Repo.all()
+    |> Enum.filter(fn {name, _, _} -> byte_size(name) < 255 end)
+    |> Enum.sort()
+  end
+
+  #
+  # Get up to eight associated tag ids for each returned tag.
+  #
+  @spec get_associations(tags_list()) :: assoc_map()
+  defp get_associations(tags) do
+    tags
+    |> Enum.map(fn {name, images_count, id} ->
+      # Randomly sample 100 images with this tag
+      image_sample =
+        Tagging
+        |> where(tag_id: ^id)
+        |> select([it], it.image_id)
+        |> order_by(asc: fragment("random()"))
+        |> limit(100)
+
+      # Select the tags from those images which have more uses than
+      # the current one being considered, and overlap more than 50%
+      assoc_ids =
+        Tagging
+        |> join(:inner, [it], _ in assoc(it, :tag))
+        |> where([_, t], t.images_count > ^images_count)
+        |> where([it, _], it.image_id in subquery(image_sample))
+        |> group_by([_, t], t.id)
+        |> order_by(desc: fragment("count(*)"))
+        |> having([_, t], fragment("(100 * count(*)::float / LEAST(?, 100)) > 50", ^images_count))
+        |> select([_, t], t.id)
+        |> limit(8)
+        |> Repo.all()
+
+      {name, assoc_ids}
+    end)
+    |> Map.new()
+  end
+
+  #
+  # Right-pad a binary to be a multiple of 4 bytes.
+  #
+  @spec int32_align(binary()) :: binary()
+  defp int32_align(bin) do
+    pad_bits = 8 * (4 - rem(byte_size(bin), 4))
+
+    <<bin::binary, 0::size(pad_bits)>>
+  end
+end
diff --git a/lib/philomena/autocomplete/autocomplete.ex b/lib/philomena/autocomplete/autocomplete.ex
new file mode 100644
index 00000000..ed7dc5fa
--- /dev/null
+++ b/lib/philomena/autocomplete/autocomplete.ex
@@ -0,0 +1,17 @@
+defmodule Philomena.Autocomplete.Autocomplete do
+  use Ecto.Schema
+  import Ecto.Changeset
+
+  @primary_key false
+  schema "autocomplete" do
+    field :content, :binary
+    timestamps(inserted_at: :created_at, updated_at: false, type: :utc_datetime)
+  end
+
+  @doc false
+  def changeset(autocomplete, attrs) do
+    autocomplete
+    |> cast(attrs, [:content])
+    |> validate_required([:content])
+  end
+end
diff --git a/lib/philomena/release.ex b/lib/philomena/release.ex
index 8963bff2..32e24263 100644
--- a/lib/philomena/release.ex
+++ b/lib/philomena/release.ex
@@ -34,6 +34,11 @@ defmodule Philomena.Release do
     Philomena.ModerationLogs.cleanup!()
   end
 
+  def generate_autocomplete do
+    start_app()
+    Philomena.Autocomplete.generate_autocomplete!()
+  end
+
   defp repos do
     Application.fetch_env!(@app, :ecto_repos)
   end
diff --git a/lib/philomena_web/controllers/autocomplete/compiled_controller.ex b/lib/philomena_web/controllers/autocomplete/compiled_controller.ex
new file mode 100644
index 00000000..7571e03e
--- /dev/null
+++ b/lib/philomena_web/controllers/autocomplete/compiled_controller.ex
@@ -0,0 +1,23 @@
+defmodule PhilomenaWeb.Autocomplete.CompiledController do
+  use PhilomenaWeb, :controller
+
+  alias Philomena.Autocomplete
+
+  def show(conn, _params) do
+    autocomplete = Autocomplete.get_autocomplete()
+
+    case autocomplete do
+      nil ->
+        conn
+        |> put_status(:not_found)
+        |> configure_session(drop: true)
+        |> text("")
+
+      %{content: content} ->
+        conn
+        |> put_resp_header("cache-control", "public, max-age=86400")
+        |> configure_session(drop: true)
+        |> resp(200, content)
+    end
+  end
+end
diff --git a/lib/philomena_web/router.ex b/lib/philomena_web/router.ex
index 36cf7efb..ad7f6a24 100644
--- a/lib/philomena_web/router.ex
+++ b/lib/philomena_web/router.ex
@@ -461,6 +461,7 @@ defmodule PhilomenaWeb.Router do
 
     scope "/autocomplete", Autocomplete, as: :autocomplete do
       resources "/tags", TagController, only: [:show], singleton: true
+      resources "/compiled", CompiledController, only: [:show], singleton: true
     end
 
     scope "/fetch", Fetch, as: :fetch do
diff --git a/priv/repo/migrations/20211219194836_create_autocomplete.exs b/priv/repo/migrations/20211219194836_create_autocomplete.exs
new file mode 100644
index 00000000..9268eb58
--- /dev/null
+++ b/priv/repo/migrations/20211219194836_create_autocomplete.exs
@@ -0,0 +1,10 @@
+defmodule Philomena.Repo.Migrations.CreateAutocomplete do
+  use Ecto.Migration
+
+  def change do
+    create table(:autocomplete, primary_key: false) do
+      add :content, :binary, null: false
+      timestamps(inserted_at: :created_at, updated_at: false, type: :utc_datetime)
+    end
+  end
+end
diff --git a/priv/repo/structure.sql b/priv/repo/structure.sql
index 724cee05..0a05672e 100644
--- a/priv/repo/structure.sql
+++ b/priv/repo/structure.sql
@@ -116,6 +116,16 @@ CREATE SEQUENCE public.artist_links_id_seq
 ALTER SEQUENCE public.artist_links_id_seq OWNED BY public.artist_links.id;
 
 
+--
+-- Name: autocomplete; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.autocomplete (
+    content bytea NOT NULL,
+    created_at timestamp(0) without time zone NOT NULL
+);
+
+
 --
 -- Name: badge_awards; Type: TABLE; Schema: public; Owner: -
 --
@@ -4959,3 +4969,4 @@ INSERT INTO public."schema_migrations" (version) VALUES (20210917190346);
 INSERT INTO public."schema_migrations" (version) VALUES (20210921025336);
 INSERT INTO public."schema_migrations" (version) VALUES (20210929181319);
 INSERT INTO public."schema_migrations" (version) VALUES (20211107130226);
+INSERT INTO public."schema_migrations" (version) VALUES (20211219194836);