philomena/lib/philomena/autocomplete.ex

defmodule Philomena.Autocomplete do
  @moduledoc """
  Pregenerated autocomplete files.
  """

  import Ecto.Query, warn: false
  alias Philomena.Repo

  alias Philomena.Tags.Tag
  alias Philomena.Images.Tagging
  alias Philomena.Autocomplete.Autocomplete

  @type tags_list() :: [{String.t(), number(), number()}]
  @type assoc_map() :: %{String.t() => [number()]}

  @spec get_autocomplete() :: Autocomplete.t() | nil
  def get_autocomplete do
    Autocomplete
    |> order_by(desc: :created_at)
    |> limit(1)
    |> Repo.one()
  end

  def generate_autocomplete! do
    tags = get_tags()
    associations = get_associations(tags)

    # Tags are already sorted, so just add them to the file directly
    #
    #     struct tag {
    #       uint8_t key_length;
    #       uint8_t key[];
    #       uint8_t association_length;
    #       uint32_t associations[];
    #     };
    #
    #     struct tag_reference {
    #       uint32_t tag_location;
    #       uint32_t num_uses;
    #     };
    #

    {ac_file, references} =
      Enum.reduce(tags, {<<>>, <<>>}, fn {name, images_count, _}, {file, references} ->
        pos = byte_size(file)
        assn = Map.get(associations, name, [])
        assn_bin = for id <- assn, into: <<>>, do: <<id::32-little>>

        {
          <<file::binary, byte_size(name)::8, name::binary, length(assn)::8, assn_bin::binary>>,
          <<references::binary, pos::32-little, images_count::32-little>>
        }
      end)

    ac_file = int32_align(ac_file)
    reference_start = byte_size(ac_file)

    # Finally add the reference start and number of tags in the footer
    #
    #     struct autocomplete_file {
    #       struct tag tags[];
    #       struct tag_reference references[];
    #       uint32_t format_version;
    #       uint32_t reference_start;
    #       uint32_t num_tags;
    #     };
    #

    ac_file =
      <<ac_file::binary, references::binary, 1::32-little, reference_start::32-little,
        length(tags)::32-little>>

    # Insert the autocomplete binary
    new_ac =
      %Autocomplete{}
      |> Autocomplete.changeset(%{content: ac_file})
      |> Repo.insert!()

    # Remove anything older
    Autocomplete
    |> where([ac], ac.created_at < ^new_ac.created_at)
    |> Repo.delete_all()
  end

  #
  # Get the names of tags and their number of uses as a map.
  # Sort is done in the application to avoid collation.
  #
  @spec get_tags() :: tags_list()
  defp get_tags do
    Tag
    |> select([t], {t.name, t.images_count, t.id})
    |> where([t], t.images_count > 0)
    |> order_by(desc: :images_count)
    |> limit(65_535)
    |> Repo.all()
    |> Enum.filter(fn {name, _, _} -> byte_size(name) < 255 end)
    |> Enum.sort()
  end

  #
  # Get up to eight associated tag ids for each returned tag.
  #
  @spec get_associations(tags_list()) :: assoc_map()
  defp get_associations(tags) do
    tags
    |> Enum.map(fn {name, images_count, id} ->
      # Randomly sample 100 images with this tag
      image_sample =
        Tagging
        |> where(tag_id: ^id)
        |> select([it], it.image_id)
        |> order_by(asc: fragment("random()"))
        |> limit(100)

      # Select the tags from those images which have more uses than
      # the current one being considered, and overlap more than 50%
      assoc_ids =
        Tagging
        |> join(:inner, [it], _ in assoc(it, :tag))
        |> where([_, t], t.images_count > ^images_count)
        |> where([it, _], it.image_id in subquery(image_sample))
        |> group_by([_, t], t.id)
        |> order_by(desc: fragment("count(*)"))
        |> having([_, t], fragment("(100 * count(*)::float / LEAST(?, 100)) > 50", ^images_count))
        |> select([_, t], t.id)
        |> limit(8)
        |> Repo.all(timeout: 120_000)

      {name, assoc_ids}
    end)
    |> Map.new()
  end

  #
  # Right-pad a binary to be a multiple of 4 bytes.
  #
  @spec int32_align(binary()) :: binary()
  defp int32_align(bin) do
    pad_bits = 8 * (4 - rem(byte_size(bin), 4))

    <<bin::binary, 0::size(pad_bits)>>
  end
end