philomena/lib/philomena/autocomplete.ex

144 lines
4 KiB
Elixir

defmodule Philomena.Autocomplete do
@moduledoc """
Pregenerated autocomplete files.
"""
import Ecto.Query, warn: false
alias Philomena.Repo
alias Philomena.Tags.Tag
alias Philomena.Images.Tagging
alias Philomena.Autocomplete.Autocomplete
@type tags_list() :: [{String.t(), number(), number()}]
@type assoc_map() :: %{String.t() => [number()]}
@spec get_autocomplete() :: Autocomplete.t() | nil
def get_autocomplete do
Autocomplete
|> order_by(desc: :created_at)
|> limit(1)
|> Repo.one()
end
def generate_autocomplete! do
tags = get_tags()
associations = get_associations(tags)
# Tags are already sorted, so just add them to the file directly
#
# struct tag {
# uint8_t key_length;
# uint8_t key[];
# uint8_t association_length;
# uint32_t associations[];
# };
#
# struct tag_reference {
# uint32_t tag_location;
# uint32_t num_uses;
# };
#
{ac_file, references} =
Enum.reduce(tags, {<<>>, <<>>}, fn {name, images_count, _}, {file, references} ->
pos = byte_size(file)
assn = Map.get(associations, name, [])
assn_bin = for id <- assn, into: <<>>, do: <<id::32-little>>
{
<<file::binary, byte_size(name)::8, name::binary, length(assn)::8, assn_bin::binary>>,
<<references::binary, pos::32-little, images_count::32-little>>
}
end)
ac_file = int32_align(ac_file)
reference_start = byte_size(ac_file)
# Finally add the reference start and number of tags in the footer
#
# struct autocomplete_file {
# struct tag tags[];
# struct tag_reference references[];
# uint32_t format_version;
# uint32_t reference_start;
# uint32_t num_tags;
# };
#
ac_file =
<<ac_file::binary, references::binary, 1::32-little, reference_start::32-little,
length(tags)::32-little>>
# Insert the autocomplete binary
new_ac =
%Autocomplete{}
|> Autocomplete.changeset(%{content: ac_file})
|> Repo.insert!()
# Remove anything older
Autocomplete
|> where([ac], ac.created_at < ^new_ac.created_at)
|> Repo.delete_all()
end
#
# Get the names of tags and their number of uses as a map.
# Sort is done in the application to avoid collation.
#
@spec get_tags() :: tags_list()
defp get_tags do
Tag
|> select([t], {t.name, t.images_count, t.id})
|> where([t], t.images_count > 0)
|> order_by(desc: :images_count)
|> limit(65_535)
|> Repo.all()
|> Enum.filter(fn {name, _, _} -> byte_size(name) < 255 end)
|> Enum.sort()
end
#
# Get up to eight associated tag ids for each returned tag.
#
@spec get_associations(tags_list()) :: assoc_map()
defp get_associations(tags) do
tags
|> Enum.map(fn {name, images_count, id} ->
# Randomly sample 100 images with this tag
image_sample =
Tagging
|> where(tag_id: ^id)
|> select([it], it.image_id)
|> order_by(asc: fragment("random()"))
|> limit(100)
# Select the tags from those images which have more uses than
# the current one being considered, and overlap more than 50%
assoc_ids =
Tagging
|> join(:inner, [it], _ in assoc(it, :tag))
|> where([_, t], t.images_count > ^images_count)
|> where([it, _], it.image_id in subquery(image_sample))
|> group_by([_, t], t.id)
|> order_by(desc: fragment("count(*)"))
|> having([_, t], fragment("(100 * count(*)::float / LEAST(?, 100)) > 50", ^images_count))
|> select([_, t], t.id)
|> limit(8)
|> Repo.all(timeout: 120_000)
{name, assoc_ids}
end)
|> Map.new()
end
#
# Right-pad a binary to be a multiple of 4 bytes.
#
@spec int32_align(binary()) :: binary()
defp int32_align(bin) do
pad_bits = 8 * (4 - rem(byte_size(bin), 4))
<<bin::binary, 0::size(pad_bits)>>
end
end