mirror of
https://github.com/philomena-dev/philomena.git
synced 2025-01-19 22:27:59 +01:00
Merge pull request #303 from philomena-dev/autocomplete-extraction
Autocomplete logic cleanup
This commit is contained in:
commit
bd4dfd9016
3 changed files with 270 additions and 190 deletions
|
@ -1,19 +1,32 @@
|
||||||
defmodule Philomena.Autocomplete do
|
defmodule Philomena.Autocomplete do
|
||||||
@moduledoc """
|
@moduledoc """
|
||||||
Pregenerated autocomplete files.
|
Pregenerated autocomplete files.
|
||||||
|
|
||||||
|
These are used to eliminate the latency of looking up search results on the server.
|
||||||
|
A script can parse the binary and generate results directly as the user types, without
|
||||||
|
incurring any roundtrip penalty.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import Ecto.Query, warn: false
|
import Ecto.Query, warn: false
|
||||||
alias Philomena.Repo
|
alias Philomena.Repo
|
||||||
|
|
||||||
alias Philomena.Tags.Tag
|
|
||||||
alias Philomena.Images.Tagging
|
|
||||||
alias Philomena.Autocomplete.Autocomplete
|
alias Philomena.Autocomplete.Autocomplete
|
||||||
|
alias Philomena.Autocomplete.Generator
|
||||||
|
|
||||||
@type tags_list() :: [{String.t(), number(), number(), String.t() | nil}]
|
@doc """
|
||||||
@type assoc_map() :: %{String.t() => [number()]}
|
Gets the current local autocompletion binary.
|
||||||
|
|
||||||
@spec get_autocomplete() :: Autocomplete.t() | nil
|
Returns nil if the binary is not currently generated.
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
iex> get_autocomplete()
|
||||||
|
nil
|
||||||
|
|
||||||
|
iex> get_autocomplete()
|
||||||
|
%Autocomplete{}
|
||||||
|
|
||||||
|
"""
|
||||||
def get_autocomplete do
|
def get_autocomplete do
|
||||||
Autocomplete
|
Autocomplete
|
||||||
|> order_by(desc: :created_at)
|
|> order_by(desc: :created_at)
|
||||||
|
@ -21,103 +34,11 @@ defmodule Philomena.Autocomplete do
|
||||||
|> Repo.one()
|
|> Repo.one()
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Creates a new local autocompletion binary, replacing any which currently exist.
|
||||||
|
"""
|
||||||
def generate_autocomplete! do
|
def generate_autocomplete! do
|
||||||
tags = get_tags()
|
ac_file = Generator.generate()
|
||||||
associations = get_associations(tags)
|
|
||||||
|
|
||||||
# Tags are already sorted, so just add them to the file directly
|
|
||||||
#
|
|
||||||
# struct tag {
|
|
||||||
# uint8_t key_length;
|
|
||||||
# uint8_t key[];
|
|
||||||
# uint8_t association_length;
|
|
||||||
# uint32_t associations[];
|
|
||||||
# };
|
|
||||||
#
|
|
||||||
|
|
||||||
{ac_file, name_locations} =
|
|
||||||
Enum.reduce(tags, {<<>>, %{}}, fn {name, _, _, _}, {file, name_locations} ->
|
|
||||||
pos = byte_size(file)
|
|
||||||
assn = Map.get(associations, name, [])
|
|
||||||
assn_bin = for id <- assn, into: <<>>, do: <<id::32-little>>
|
|
||||||
|
|
||||||
{
|
|
||||||
<<file::binary, byte_size(name)::8, name::binary, length(assn)::8, assn_bin::binary>>,
|
|
||||||
Map.put(name_locations, name, pos)
|
|
||||||
}
|
|
||||||
end)
|
|
||||||
|
|
||||||
# Link reference list; self-referential, so must be preprocessed to deal with aliases
|
|
||||||
#
|
|
||||||
# struct tag_reference {
|
|
||||||
# uint32_t tag_location;
|
|
||||||
# uint8_t is_aliased : 1;
|
|
||||||
# union {
|
|
||||||
# uint32_t num_uses : 31;
|
|
||||||
# uint32_t alias_index : 31;
|
|
||||||
# };
|
|
||||||
# };
|
|
||||||
#
|
|
||||||
|
|
||||||
ac_file = int32_align(ac_file)
|
|
||||||
reference_start = byte_size(ac_file)
|
|
||||||
|
|
||||||
reference_indexes =
|
|
||||||
tags
|
|
||||||
|> Enum.with_index()
|
|
||||||
|> Enum.map(fn {{name, _, _, _}, index} -> {name, index} end)
|
|
||||||
|> Map.new()
|
|
||||||
|
|
||||||
references =
|
|
||||||
Enum.reduce(tags, <<>>, fn {name, images_count, _, alias_target}, references ->
|
|
||||||
pos = Map.fetch!(name_locations, name)
|
|
||||||
|
|
||||||
if not is_nil(alias_target) do
|
|
||||||
target = Map.fetch!(reference_indexes, alias_target)
|
|
||||||
|
|
||||||
<<references::binary, pos::32-little, -(target + 1)::32-little>>
|
|
||||||
else
|
|
||||||
<<references::binary, pos::32-little, images_count::32-little>>
|
|
||||||
end
|
|
||||||
end)
|
|
||||||
|
|
||||||
# Reorder tags by name in their namespace to provide a secondary ordering
|
|
||||||
#
|
|
||||||
# struct secondary_reference {
|
|
||||||
# uint32_t primary_location;
|
|
||||||
# };
|
|
||||||
#
|
|
||||||
|
|
||||||
secondary_references =
|
|
||||||
tags
|
|
||||||
|> Enum.map(&{name_in_namespace(elem(&1, 0)), elem(&1, 0)})
|
|
||||||
|> Enum.sort()
|
|
||||||
|> Enum.reduce(<<>>, fn {_k, v}, secondary_references ->
|
|
||||||
target = Map.fetch!(reference_indexes, v)
|
|
||||||
|
|
||||||
<<secondary_references::binary, target::32-little>>
|
|
||||||
end)
|
|
||||||
|
|
||||||
# Finally add the reference start and number of tags in the footer
|
|
||||||
#
|
|
||||||
# struct autocomplete_file {
|
|
||||||
# struct tag tags[];
|
|
||||||
# struct tag_reference primary_references[];
|
|
||||||
# struct secondary_reference secondary_references[];
|
|
||||||
# uint32_t format_version;
|
|
||||||
# uint32_t reference_start;
|
|
||||||
# uint32_t num_tags;
|
|
||||||
# };
|
|
||||||
#
|
|
||||||
|
|
||||||
ac_file = <<
|
|
||||||
ac_file::binary,
|
|
||||||
references::binary,
|
|
||||||
secondary_references::binary,
|
|
||||||
2::32-little,
|
|
||||||
reference_start::32-little,
|
|
||||||
length(tags)::32-little
|
|
||||||
>>
|
|
||||||
|
|
||||||
# Insert the autocomplete binary
|
# Insert the autocomplete binary
|
||||||
new_ac =
|
new_ac =
|
||||||
|
@ -130,93 +51,4 @@ defmodule Philomena.Autocomplete do
|
||||||
|> where([ac], ac.created_at < ^new_ac.created_at)
|
|> where([ac], ac.created_at < ^new_ac.created_at)
|
||||||
|> Repo.delete_all()
|
|> Repo.delete_all()
|
||||||
end
|
end
|
||||||
|
|
||||||
#
|
|
||||||
# Get the names of tags and their number of uses as a map.
|
|
||||||
# Sort is done in the application to avoid collation.
|
|
||||||
#
|
|
||||||
@spec get_tags() :: tags_list()
|
|
||||||
defp get_tags do
|
|
||||||
top_tags =
|
|
||||||
Tag
|
|
||||||
|> select([t], {t.name, t.images_count, t.id, nil})
|
|
||||||
|> where([t], t.images_count > 0)
|
|
||||||
|> order_by(desc: :images_count)
|
|
||||||
|> limit(50_000)
|
|
||||||
|> Repo.all()
|
|
||||||
|
|
||||||
aliases_of_top_tags =
|
|
||||||
Tag
|
|
||||||
|> where([t], t.aliased_tag_id in ^Enum.map(top_tags, fn {_, _, id, _} -> id end))
|
|
||||||
|> join(:inner, [t], _ in assoc(t, :aliased_tag))
|
|
||||||
|> select([t, a], {t.name, 0, 0, a.name})
|
|
||||||
|> Repo.all()
|
|
||||||
|
|
||||||
(aliases_of_top_tags ++ top_tags)
|
|
||||||
|> Enum.filter(fn {name, _, _, _} -> byte_size(name) < 255 end)
|
|
||||||
|> Enum.sort()
|
|
||||||
end
|
|
||||||
|
|
||||||
#
|
|
||||||
# Get up to eight associated tag ids for each returned tag.
|
|
||||||
#
|
|
||||||
@spec get_associations(tags_list()) :: assoc_map()
|
|
||||||
defp get_associations(tags) do
|
|
||||||
tags
|
|
||||||
|> Enum.filter(fn {_, _, _, aliased} -> is_nil(aliased) end)
|
|
||||||
|> Enum.map(fn {name, images_count, id, _} ->
|
|
||||||
# Randomly sample 100 images with this tag
|
|
||||||
image_sample =
|
|
||||||
Tagging
|
|
||||||
|> where(tag_id: ^id)
|
|
||||||
|> select([it], it.image_id)
|
|
||||||
|> order_by(asc: fragment("random()"))
|
|
||||||
|> limit(100)
|
|
||||||
|
|
||||||
# Select the tags from those images which have more uses than
|
|
||||||
# the current one being considered, and overlap more than 50%
|
|
||||||
assoc_ids =
|
|
||||||
Tagging
|
|
||||||
|> join(:inner, [it], _ in assoc(it, :tag))
|
|
||||||
|> where([_, t], t.images_count > ^images_count)
|
|
||||||
|> where([it, _], it.image_id in subquery(image_sample))
|
|
||||||
|> group_by([_, t], t.id)
|
|
||||||
|> order_by(desc: fragment("count(*)"))
|
|
||||||
|> having([_, t], fragment("(100 * count(*)::float / LEAST(?, 100)) > 50", ^images_count))
|
|
||||||
|> select([_, t], t.id)
|
|
||||||
|> limit(8)
|
|
||||||
|> Repo.all(timeout: 120_000)
|
|
||||||
|
|
||||||
{name, assoc_ids}
|
|
||||||
end)
|
|
||||||
|> Map.new()
|
|
||||||
end
|
|
||||||
|
|
||||||
#
|
|
||||||
# Right-pad a binary to be a multiple of 4 bytes.
|
|
||||||
#
|
|
||||||
@spec int32_align(binary()) :: binary()
|
|
||||||
defp int32_align(bin) do
|
|
||||||
pad_bits = 8 * (4 - rem(byte_size(bin), 4))
|
|
||||||
|
|
||||||
<<bin::binary, 0::size(pad_bits)>>
|
|
||||||
end
|
|
||||||
|
|
||||||
#
|
|
||||||
# Remove the artist:, oc: etc. prefix from a tag name,
|
|
||||||
# if one is present.
|
|
||||||
#
|
|
||||||
@spec name_in_namespace(String.t()) :: String.t()
|
|
||||||
defp name_in_namespace(s) do
|
|
||||||
case String.split(s, ":", parts: 2, trim: true) do
|
|
||||||
[_namespace, name] ->
|
|
||||||
name
|
|
||||||
|
|
||||||
[name] ->
|
|
||||||
name
|
|
||||||
|
|
||||||
_unknown ->
|
|
||||||
s
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
147
lib/philomena/autocomplete/generator.ex
Normal file
147
lib/philomena/autocomplete/generator.ex
Normal file
|
@ -0,0 +1,147 @@
|
||||||
|
defmodule Philomena.Autocomplete.Generator do
|
||||||
|
@moduledoc """
|
||||||
|
Compiled autocomplete binary for frontend usage.
|
||||||
|
|
||||||
|
See assets/js/utils/local-autocompleter.ts for how this should be used.
|
||||||
|
The file follows the following binary format:
|
||||||
|
|
||||||
|
struct tag {
|
||||||
|
uint8_t key_length;
|
||||||
|
uint8_t key[];
|
||||||
|
uint8_t association_length;
|
||||||
|
uint32_t associations[];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct tag_reference {
|
||||||
|
uint32_t tag_location;
|
||||||
|
union {
|
||||||
|
int32_t raw;
|
||||||
|
uint32_t num_uses; ///< when positive
|
||||||
|
uint32_t alias_index; ///< when negative, -alias_index - 1
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
struct secondary_reference {
|
||||||
|
uint32_t primary_location;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct autocomplete_file {
|
||||||
|
struct tag tags[];
|
||||||
|
struct tag_reference primary_references[];
|
||||||
|
struct secondary_reference secondary_references[];
|
||||||
|
uint32_t format_version;
|
||||||
|
uint32_t reference_start;
|
||||||
|
uint32_t num_tags;
|
||||||
|
};
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
alias Philomena.Tags.LocalAutocomplete
|
||||||
|
|
||||||
|
@format_version 2
|
||||||
|
@top_tags 50_000
|
||||||
|
@max_associations 8
|
||||||
|
|
||||||
|
@doc """
Build the compiled autocomplete binary.

The layout is described in the module documentation: the tag block
(padded to a 4-byte boundary), the primary references, the secondary
references, and finally a three-word footer holding the format version,
the offset at which the references start, and the number of tags.
"""
@spec generate() :: binary()
def generate do
  {tags, associations} = tags_and_associations()

  # Tags arrive sorted by name, so they are appended in order. While doing so,
  # remember the byte offset at which each tag record starts.
  {tag_block, name_locations} =
    for entry <- tags, reduce: {<<>>, %{}} do
      {block, locations} ->
        %{name: name} = entry
        related_ids = Map.get(associations, name, [])

        related_bin =
          related_ids
          |> Enum.map(&<<&1::little-32>>)
          |> IO.iodata_to_binary()

        record = <<byte_size(name)::8, name::binary, length(related_ids)::8, related_bin::binary>>

        {<<block::binary, record::binary>>, Map.put(locations, name, byte_size(block))}
    end

  # The reference list starts on a 4-byte boundary right after the tag block.
  tag_block = int32_align(tag_block)
  reference_start = byte_size(tag_block)

  # Position of every tag name within the (sorted) tag list. Aliases point at
  # their target through this index, so it has to exist before the references
  # are written.
  reference_indexes =
    tags
    |> Enum.with_index()
    |> Map.new(fn {entry, index} -> {entry.name, index} end)

  # One {location, value} pair per tag. For an alias the value is the negated
  # (index + 1) of its target; otherwise it is the tag's use count.
  references =
    for entry <- tags, into: <<>> do
      location = Map.fetch!(name_locations, entry.name)

      value =
        case entry.alias_name do
          nil -> entry.images_count
          target_name -> -(Map.fetch!(reference_indexes, target_name) + 1)
        end

      <<location::little-32, value::little-32>>
    end

  # Secondary ordering: tags sorted by their name with the namespace removed
  # (ties broken by the full name), each stored as its primary index.
  by_bare_name =
    tags
    |> Enum.map(fn entry -> {name_in_namespace(entry.name), entry.name} end)
    |> Enum.sort()

  secondary_references =
    for {_bare_name, full_name} <- by_bare_name, into: <<>> do
      index = Map.fetch!(reference_indexes, full_name)
      <<index::little-32>>
    end

  footer = <<@format_version::little-32, reference_start::little-32, length(tags)::little-32>>

  IO.iodata_to_binary([tag_block, references, secondary_references, footer])
end
|
||||||
|
|
||||||
|
defp tags_and_associations do
  # Fetch the entries to encode, together with the association map for them.
  #
  # The key length of a tag record is a single byte (see the module
  # documentation), so entries whose name is 255 bytes or longer are dropped;
  # keeping them would break parsing.
  candidates =
    @top_tags
    |> LocalAutocomplete.get_tags()
    |> Enum.filter(&(byte_size(&1.name) < 255))

  # Dropping a long-named tag can leave its aliases pointing at a name that is
  # no longer present. generate/0 resolves alias targets with Map.fetch!/2, so
  # such dangling aliases must be removed here or generation would raise.
  known_names = MapSet.new(candidates, & &1.name)

  # Sort is done in the application to avoid collation.
  tags =
    candidates
    |> Enum.filter(fn entry ->
      is_nil(entry.alias_name) or MapSet.member?(known_names, entry.alias_name)
    end)
    |> Enum.sort_by(& &1.name)

  associations = LocalAutocomplete.get_associations(tags, @max_associations)

  {tags, associations}
end
|
||||||
|
|
||||||
|
defp int32_align(bin) do
  # Right-pad a binary with zero bytes so that its size is a multiple of 4.
  #
  # The outer rem/2 maps the already-aligned case to zero bytes of padding;
  # without it an aligned binary would gain four superfluous zero bytes.
  pad_bytes = rem(4 - rem(byte_size(bin), 4), 4)

  <<bin::binary, 0::size(pad_bytes)-unit(8)>>
end
|
||||||
|
|
||||||
|
defp name_in_namespace(s) do
  # Strip a leading "namespace:" prefix (artist:, oc:, ...) from a tag name.
  # Splitting into at most two parts means the last part is always the bare
  # name; when trimming leaves nothing, the input is returned unchanged.
  case String.split(s, ":", parts: 2, trim: true) do
    [] -> s
    segments -> List.last(segments)
  end
end
|
||||||
|
end
|
101
lib/philomena/tags/local_autocomplete.ex
Normal file
101
lib/philomena/tags/local_autocomplete.ex
Normal file
|
@ -0,0 +1,101 @@
|
||||||
|
defmodule Philomena.Tags.LocalAutocomplete do
|
||||||
|
alias Philomena.Images.Tagging
|
||||||
|
alias Philomena.Tags.Tag
|
||||||
|
alias Philomena.Repo
|
||||||
|
import Ecto.Query
|
||||||
|
|
||||||
|
defmodule Entry do
  @moduledoc """
  A single tag record used during autocomplete generation.

  `alias_name` defaults to `nil`; entries built for aliases carry the name of
  the tag they point to in that field.
  """

  defstruct name: "", images_count: 0, id: 0, alias_name: nil

  @type t :: %__MODULE__{
          name: String.t(),
          images_count: integer(),
          id: integer(),
          alias_name: String.t() | nil
        }
end
|
||||||
|
|
||||||
|
@type entry_list() :: [Entry.t()]
|
||||||
|
|
||||||
|
@type tag_id :: integer()
|
||||||
|
@type assoc_map() :: %{optional(String.t()) => [tag_id()]}
|
||||||
|
|
||||||
|
@doc """
Returns a flat list of entries: the aliases of the top `amount` tags,
followed by those top tags themselves.

No ordering by name is applied here.
"""
@spec get_tags(integer()) :: entry_list()
def get_tags(amount) do
  popular = top_tags(amount)

  aliases_of_tags(popular) ++ popular
end
|
||||||
|
|
||||||
|
@doc """
Builds a map from tag name to the ids of its most associated tags.

Alias entries are skipped. For each remaining entry at most `amount` ids are
returned; every associated tag

  - has more uses than the entry's tag (and so is never the entry itself)
  - appears on more than 50% of a random sample of up to 100 images
    carrying the entry's tag
"""
@spec get_associations(entry_list(), integer()) :: assoc_map()
def get_associations(tags, amount) do
  for entry <- tags, is_nil(entry.alias_name), into: %{} do
    {entry.name, associated_tag_ids(entry, amount)}
  end
end
|
||||||
|
|
||||||
|
defp top_tags(amount) do
  # The `amount` most used tags that have at least one image, as entries
  # without an alias name.
  Tag
  |> where([t], t.images_count > 0)
  |> order_by(desc: :images_count)
  |> limit(^amount)
  |> select([t], %Entry{name: t.name, images_count: t.images_count, id: t.id})
  |> Repo.all()
end
|
||||||
|
|
||||||
|
defp aliases_of_tags(tags) do
  # Every tag aliased to one of the given entries. Alias entries carry no use
  # count or id of their own, only the name of the tag they point to.
  target_ids = for tag <- tags, do: tag.id

  Tag
  |> join(:inner, [t], a in assoc(t, :aliased_tag))
  |> where([t, _], t.aliased_tag_id in ^target_ids)
  |> select([t, a], %Entry{name: t.name, images_count: 0, id: 0, alias_name: a.name})
  |> Repo.all()
end
|
||||||
|
|
||||||
|
defp associated_tag_ids(entry, amount) do
  %{id: tag_id, images_count: uses} = entry

  # Random sample of up to 100 images carrying this tag.
  sampled_images =
    Tagging
    |> where([it], it.tag_id == ^tag_id)
    |> order_by(asc: fragment("random()"))
    |> limit(100)
    |> select([it], it.image_id)

  # Among the tags on the sampled images, keep those with more uses than the
  # current tag that appear on more than 50% of the sample, most frequent
  # first, capped at `amount` ids.
  Tagging
  |> join(:inner, [it], t in assoc(it, :tag))
  |> where([_, t], t.images_count > ^uses)
  |> where([it, _], it.image_id in subquery(sampled_images))
  |> group_by([_, t], t.id)
  |> having([_, _], fragment("(100 * count(*)::float / LEAST(?, 100)) > 50", ^uses))
  |> order_by(desc: fragment("count(*)"))
  |> select([_, t], t.id)
  |> limit(^amount)
  |> Repo.all(timeout: 120_000)
end
|
||||||
|
end
|
Loading…
Reference in a new issue