From d6facc7809bef1e31996855d845bff9bed599198 Mon Sep 17 00:00:00 2001 From: Liam Date: Sat, 22 Jun 2024 21:30:49 -0400 Subject: [PATCH] Autocomplete logic extraction --- lib/philomena/autocomplete.ex | 212 +++-------------------- lib/philomena/autocomplete/generator.ex | 147 ++++++++++++++++ lib/philomena/tags/local_autocomplete.ex | 101 +++++++++++ 3 files changed, 270 insertions(+), 190 deletions(-) create mode 100644 lib/philomena/autocomplete/generator.ex create mode 100644 lib/philomena/tags/local_autocomplete.ex diff --git a/lib/philomena/autocomplete.ex b/lib/philomena/autocomplete.ex index ec5e04ec..f9cc9f44 100644 --- a/lib/philomena/autocomplete.ex +++ b/lib/philomena/autocomplete.ex @@ -1,19 +1,32 @@ defmodule Philomena.Autocomplete do @moduledoc """ Pregenerated autocomplete files. + + These are used to eliminate the latency of looking up search results on the server. + A script can parse the binary and generate results directly as the user types, without + incurring any roundtrip penalty. """ import Ecto.Query, warn: false alias Philomena.Repo - alias Philomena.Tags.Tag - alias Philomena.Images.Tagging alias Philomena.Autocomplete.Autocomplete + alias Philomena.Autocomplete.Generator - @type tags_list() :: [{String.t(), number(), number(), String.t() | nil}] - @type assoc_map() :: %{String.t() => [number()]} + @doc """ + Gets the current local autocompletion binary. - @spec get_autocomplete() :: Autocomplete.t() | nil + Returns nil if the binary is not currently generated. + + ## Examples + + iex> get_artist_link() + nil + + iex> get_autocomplete() + %Autocomplete{} + + """ def get_autocomplete do Autocomplete |> order_by(desc: :created_at) @@ -21,103 +34,11 @@ defmodule Philomena.Autocomplete do |> Repo.one() end + @doc """ + Creates a new local autocompletion binary, replacing any which currently exist. + """ def generate_autocomplete! do - tags = get_tags() - associations = get_associations(tags) - - # Tags are already sorted, so just add them to the file directly - # - # struct tag { - # uint8_t key_length; - # uint8_t key[]; - # uint8_t association_length; - # uint32_t associations[]; - # }; - # - - {ac_file, name_locations} = - Enum.reduce(tags, {<<>>, %{}}, fn {name, _, _, _}, {file, name_locations} -> - pos = byte_size(file) - assn = Map.get(associations, name, []) - assn_bin = for id <- assn, into: <<>>, do: <> - - { - <>, - Map.put(name_locations, name, pos) - } - end) - - # Link reference list; self-referential, so must be preprocessed to deal with aliases - # - # struct tag_reference { - # uint32_t tag_location; - # uint8_t is_aliased : 1; - # union { - # uint32_t num_uses : 31; - # uint32_t alias_index : 31; - # }; - # }; - # - - ac_file = int32_align(ac_file) - reference_start = byte_size(ac_file) - - reference_indexes = - tags - |> Enum.with_index() - |> Enum.map(fn {{name, _, _, _}, index} -> {name, index} end) - |> Map.new() - - references = - Enum.reduce(tags, <<>>, fn {name, images_count, _, alias_target}, references -> - pos = Map.fetch!(name_locations, name) - - if not is_nil(alias_target) do - target = Map.fetch!(reference_indexes, alias_target) - - <> - else - <> - end - end) - - # Reorder tags by name in their namespace to provide a secondary ordering - # - # struct secondary_reference { - # uint32_t primary_location; - # }; - # - - secondary_references = - tags - |> Enum.map(&{name_in_namespace(elem(&1, 0)), elem(&1, 0)}) - |> Enum.sort() - |> Enum.reduce(<<>>, fn {_k, v}, secondary_references -> - target = Map.fetch!(reference_indexes, v) - - <> - end) - - # Finally add the reference start and number of tags in the footer - # - # struct autocomplete_file { - # struct tag tags[]; - # struct tag_reference primary_references[]; - # struct secondary_reference secondary_references[]; - # uint32_t format_version; - # uint32_t reference_start; - # uint32_t num_tags; - # }; - # - - ac_file = << - ac_file::binary, - references::binary, - secondary_references::binary, - 2::32-little, - reference_start::32-little, - length(tags)::32-little - >> + ac_file = Generator.generate() # Insert the autocomplete binary new_ac = @@ -130,93 +51,4 @@ defmodule Philomena.Autocomplete do |> where([ac], ac.created_at < ^new_ac.created_at) |> Repo.delete_all() end - - # - # Get the names of tags and their number of uses as a map. - # Sort is done in the application to avoid collation. - # - @spec get_tags() :: tags_list() - defp get_tags do - top_tags = - Tag - |> select([t], {t.name, t.images_count, t.id, nil}) - |> where([t], t.images_count > 0) - |> order_by(desc: :images_count) - |> limit(50_000) - |> Repo.all() - - aliases_of_top_tags = - Tag - |> where([t], t.aliased_tag_id in ^Enum.map(top_tags, fn {_, _, id, _} -> id end)) - |> join(:inner, [t], _ in assoc(t, :aliased_tag)) - |> select([t, a], {t.name, 0, 0, a.name}) - |> Repo.all() - - (aliases_of_top_tags ++ top_tags) - |> Enum.filter(fn {name, _, _, _} -> byte_size(name) < 255 end) - |> Enum.sort() - end - - # - # Get up to eight associated tag ids for each returned tag. - # - @spec get_associations(tags_list()) :: assoc_map() - defp get_associations(tags) do - tags - |> Enum.filter(fn {_, _, _, aliased} -> is_nil(aliased) end) - |> Enum.map(fn {name, images_count, id, _} -> - # Randomly sample 100 images with this tag - image_sample = - Tagging - |> where(tag_id: ^id) - |> select([it], it.image_id) - |> order_by(asc: fragment("random()")) - |> limit(100) - - # Select the tags from those images which have more uses than - # the current one being considered, and overlap more than 50% - assoc_ids = - Tagging - |> join(:inner, [it], _ in assoc(it, :tag)) - |> where([_, t], t.images_count > ^images_count) - |> where([it, _], it.image_id in subquery(image_sample)) - |> group_by([_, t], t.id) - |> order_by(desc: fragment("count(*)")) - |> having([_, t], fragment("(100 * count(*)::float / LEAST(?, 100)) > 50", ^images_count)) - |> select([_, t], t.id) - |> limit(8) - |> Repo.all(timeout: 120_000) - - {name, assoc_ids} - end) - |> Map.new() - end - - # - # Right-pad a binary to be a multiple of 4 bytes. - # - @spec int32_align(binary()) :: binary() - defp int32_align(bin) do - pad_bits = 8 * (4 - rem(byte_size(bin), 4)) - - <> - end - - # - # Remove the artist:, oc: etc. prefix from a tag name, - # if one is present. - # - @spec name_in_namespace(String.t()) :: String.t() - defp name_in_namespace(s) do - case String.split(s, ":", parts: 2, trim: true) do - [_namespace, name] -> - name - - [name] -> - name - - _unknown -> - s - end - end end diff --git a/lib/philomena/autocomplete/generator.ex b/lib/philomena/autocomplete/generator.ex new file mode 100644 index 00000000..6493027d --- /dev/null +++ b/lib/philomena/autocomplete/generator.ex @@ -0,0 +1,147 @@ +defmodule Philomena.Autocomplete.Generator do + @moduledoc """ + Compiled autocomplete binary for frontend usage. + + See assets/js/utils/local-autocompleter.ts for how this should be used. + The file follows the following binary format: + + struct tag { + uint8_t key_length; + uint8_t key[]; + uint8_t association_length; + uint32_t associations[]; + }; + + struct tag_reference { + uint32_t tag_location; + union { + int32_t raw; + uint32_t num_uses; ///< when positive + uint32_t alias_index; ///< when negative, -alias_index - 1 + }; + }; + + struct secondary_reference { + uint32_t primary_location; + }; + + struct autocomplete_file { + struct tag tags[]; + struct tag_reference primary_references[]; + struct secondary_reference secondary_references[]; + uint32_t format_version; + uint32_t reference_start; + uint32_t num_tags; + }; + + """ + + alias Philomena.Tags.LocalAutocomplete + + @format_version 2 + @top_tags 50_000 + @max_associations 8 + + @doc """ + Create the compiled autocomplete binary. + + See module documentation for the format. This is not expected to be larger + than a few megabytes on average. + """ + @spec generate() :: binary() + def generate do + {tags, associations} = tags_and_associations() + + # Tags are already sorted, so just add them to the file directly + {tag_block, name_locations} = + Enum.reduce(tags, {<<>>, %{}}, fn %{name: name}, {data, name_locations} -> + pos = byte_size(data) + assn = Map.get(associations, name, []) + assn_bin = for id <- assn, into: <<>>, do: <> + + { + <>, + Map.put(name_locations, name, pos) + } + end) + + # Link reference list; self-referential, so must be preprocessed to deal with aliases + tag_block = int32_align(tag_block) + reference_start = byte_size(tag_block) + + reference_indexes = + tags + |> Enum.with_index() + |> Enum.map(fn {entry, index} -> {entry.name, index} end) + |> Map.new() + + references = + Enum.reduce(tags, <<>>, fn entry, references -> + pos = Map.fetch!(name_locations, entry.name) + + if not is_nil(entry.alias_name) do + target = Map.fetch!(reference_indexes, entry.alias_name) + + <> + else + <> + end + end) + + # Reorder tags by name in their namespace to provide a secondary ordering + secondary_references = + tags + |> Enum.map(&{name_in_namespace(&1.name), &1.name}) + |> Enum.sort() + |> Enum.reduce(<<>>, fn {_k, v}, secondary_references -> + target = Map.fetch!(reference_indexes, v) + + <> + end) + + # Finally add the reference start and number of tags in the footer + << + tag_block::binary, + references::binary, + secondary_references::binary, + @format_version::32-little, + reference_start::32-little, + length(tags)::32-little + >> + end + + defp tags_and_associations do + # Names longer than 255 bytes do not fit and will break parsing. + # Sort is done in the application to avoid collation. + tags = + LocalAutocomplete.get_tags(@top_tags) + |> Enum.filter(&(byte_size(&1.name) < 255)) + |> Enum.sort_by(& &1.name) + + associations = + LocalAutocomplete.get_associations(tags, @max_associations) + + {tags, associations} + end + + defp int32_align(bin) do + # Right-pad a binary to be a multiple of 4 bytes. + pad_bits = 8 * (4 - rem(byte_size(bin), 4)) + + <> + end + + defp name_in_namespace(s) do + # Remove the artist:, oc: etc. prefix from a tag name, if one is present. + case String.split(s, ":", parts: 2, trim: true) do + [_namespace, name] -> + name + + [name] -> + name + + _unknown -> + s + end + end +end diff --git a/lib/philomena/tags/local_autocomplete.ex b/lib/philomena/tags/local_autocomplete.ex new file mode 100644 index 00000000..6f1785ed --- /dev/null +++ b/lib/philomena/tags/local_autocomplete.ex @@ -0,0 +1,101 @@ +defmodule Philomena.Tags.LocalAutocomplete do + alias Philomena.Images.Tagging + alias Philomena.Tags.Tag + alias Philomena.Repo + import Ecto.Query + + defmodule Entry do + @moduledoc """ + An individual entry record for autocomplete generation. + """ + + @type t :: %__MODULE__{ + name: String.t(), + images_count: integer(), + id: integer(), + alias_name: String.t() | nil + } + + defstruct name: "", + images_count: 0, + id: 0, + alias_name: nil + end + + @type entry_list() :: [Entry.t()] + + @type tag_id :: integer() + @type assoc_map() :: %{optional(String.t()) => [tag_id()]} + + @doc """ + Get a flat list of entry records for all of the top `amount` tags, and all of their + aliases. + """ + @spec get_tags(integer()) :: entry_list() + def get_tags(amount) do + tags = top_tags(amount) + aliases = aliases_of_tags(tags) + aliases ++ tags + end + + @doc """ + Get a map of tag names to their most associated tag ids. + + For every tag entry, its associated tags satisfy the following properties: + - is not the same as the entry's tag id + - of a sample of 100 images, appear simultaneously more than 50% of the time + """ + @spec get_associations(entry_list(), integer()) :: assoc_map() + def get_associations(tags, amount) do + tags + |> Enum.filter(&is_nil(&1.alias_name)) + |> Map.new(&{&1.name, associated_tag_ids(&1, amount)}) + end + + defp top_tags(amount) do + query = + from t in Tag, + where: t.images_count > 0, + select: %Entry{name: t.name, images_count: t.images_count, id: t.id}, + order_by: [desc: :images_count], + limit: ^amount + + Repo.all(query) + end + + defp aliases_of_tags(tags) do + ids = Enum.map(tags, & &1.id) + + query = + from t in Tag, + where: t.aliased_tag_id in ^ids, + inner_join: a in assoc(t, :aliased_tag), + select: %Entry{name: t.name, images_count: 0, id: 0, alias_name: a.name} + + Repo.all(query) + end + + defp associated_tag_ids(entry, amount) do + image_sample_query = + from it in Tagging, + where: it.tag_id == ^entry.id, + select: it.image_id, + order_by: [asc: fragment("random()")], + limit: 100 + + # Select the tags from those images which have more uses than + # the current one being considered, and overlap more than 50% + assoc_query = + from it in Tagging, + inner_join: t in assoc(it, :tag), + where: t.images_count > ^entry.images_count, + where: it.image_id in subquery(image_sample_query), + group_by: t.id, + order_by: [desc: fragment("count(*)")], + having: fragment("(100 * count(*)::float / LEAST(?, 100)) > 50", ^entry.images_count), + select: t.id, + limit: ^amount + + Repo.all(assoc_query, timeout: 120_000) + end +end