Local autocomplete (#151)

This commit is contained in:
liamwhite 2021-12-26 19:16:21 -05:00 committed by GitHub
parent 715506352c
commit dadc2f1585
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 374 additions and 0 deletions

View file

@ -2,6 +2,9 @@
* Autocomplete. * Autocomplete.
*/ */
import { LocalAutocompleter } from 'utils/local-autocompleter';
import { handleError } from 'utils/requests';
const cache = {}; const cache = {};
let inputField, originalTerm; let inputField, originalTerm;
@ -122,8 +125,23 @@ function getSuggestions(term) {
function listenAutocomplete() { function listenAutocomplete() {
let timeout; let timeout;
/** @type {LocalAutocompleter} */
let localAc = null;
let localFetched = false;
document.addEventListener('focusin', fetchLocalAutocomplete);
document.addEventListener('input', event => { document.addEventListener('input', event => {
removeParent(); removeParent();
fetchLocalAutocomplete(event);
if (localAc !== null && 'ac' in event.target.dataset) {
inputField = event.target;
originalTerm = inputField.value;
const suggestions = localAc.topK(inputField.value, 5).map(({ name, imageCount }) => ({ label: `${name} (${imageCount})`, value: name }));
return showAutocomplete(suggestions, originalTerm, event.target);
}
window.clearTimeout(timeout); window.clearTimeout(timeout);
// Use a timeout to delay requests until the user has stopped typing // Use a timeout to delay requests until the user has stopped typing
@ -150,6 +168,16 @@ function listenAutocomplete() {
document.addEventListener('click', event => { document.addEventListener('click', event => {
if (event.target && event.target !== inputField) removeParent(); if (event.target && event.target !== inputField) removeParent();
}); });
/**
 * Start the one-time download of the compiled autocomplete file the first
 * time an element carrying a `data-ac` attribute receives focus or input.
 *
 * On success the enclosing `localAc` variable is set to a LocalAutocompleter
 * built from the response body. `localFetched` is flipped before the request
 * is issued, so at most one request is ever started.
 *
 * @param {Event} event
 */
function fetchLocalAutocomplete(event) {
  if (!localFetched && event.target.dataset && 'ac' in event.target.dataset) {
    // 'force-cache' returns a cached response even when it is stale, so the
    // URL carries a key that changes once per UTC day; without it a browser
    // could keep serving the first file it ever downloaded.
    const cacheKey = new Date().toISOString().slice(0, 10);

    localFetched = true;
    fetch(`/autocomplete/compiled?key=${cacheKey}`, { credentials: 'omit', cache: 'force-cache' })
      .then(handleError)
      .then(resp => resp.arrayBuffer())
      .then(buf => localAc = new LocalAutocompleter(buf));
  }
}
} }
export { listenAutocomplete }; export { listenAutocomplete };

View file

@ -0,0 +1,134 @@
//@ts-check
/*
* Client-side tag completion.
*/
/**
* @typedef {object} Result
* @property {string} name
* @property {number} imageCount
* @property {number[]} associations
*/
/**
* See lib/philomena/autocomplete.ex for binary structure details.
*
* A binary blob is used to avoid the creation of large amounts of garbage on
* the JS heap and speed up the execution of the search.
*/
export class LocalAutocompleter {
  /**
   * Build a new local autocompleter.
   *
   * The last 12 bytes of the buffer are read as three little-endian uint32
   * values: format version, byte offset of the reference table, and number
   * of tags.
   *
   * @param {ArrayBuffer} backingStore
   * @throws {Error} if the format version stored in the footer is not 1
   */
  constructor(backingStore) {
    /** @type {Uint8Array} */
    this.data = new Uint8Array(backingStore);
    /** @type {DataView} */
    this.view = new DataView(backingStore);
    /** @type {TextDecoder} */
    this.decoder = new TextDecoder();
    /** @type {number} */
    this.numTags = this.view.getUint32(backingStore.byteLength - 4, true);
    /** @type {number} */
    this.referenceStart = this.view.getUint32(backingStore.byteLength - 8, true);
    /** @type {number} */
    this.formatVersion = this.view.getUint32(backingStore.byteLength - 12, true);

    if (this.formatVersion !== 1) {
      throw new Error('Incompatible autocomplete format version');
    }
  }

  /**
   * Get a tag's name and its associations given a byte location inside the file.
   *
   * Layout at `location`: one length byte, the name bytes, one association
   * count byte, then that many little-endian uint32 association ids.
   *
   * @param {number} location
   * @returns {[string, number[]]}
   */
  getTagFromLocation(location) {
    const nameLength = this.view.getUint8(location);
    const nameStart = location + 1;
    const assnLengthAt = nameStart + nameLength;
    const assnLength = this.view.getUint8(assnLengthAt);
    // The ids begin after the count byte, not on top of it.
    const assnStart = assnLengthAt + 1;

    // subarray() decodes straight from the backing store without copying it.
    const name = this.decoder.decode(this.data.subarray(nameStart, nameStart + nameLength));

    /** @type {number[]} */
    const associations = [];

    for (let i = 0; i < assnLength; i++) {
      associations.push(this.view.getUint32(assnStart + i * 4, true));
    }

    return [ name, associations ];
  }

  /**
   * Get a Result object as the ith tag inside the file.
   *
   * Each reference table entry is 8 bytes: the tag's byte location followed
   * by its image count, both little-endian uint32.
   *
   * @param {number} i
   * @returns {Result}
   */
  getResultAt(i) {
    const entry = this.referenceStart + i * 8;
    const nameLocation = this.view.getUint32(entry, true);
    const imageCount = this.view.getUint32(entry + 4, true);
    const [ name, associations ] = this.getTagFromLocation(nameLocation);

    return { name, imageCount, associations };
  }

  /**
   * Find the top k results by image count which match the given string prefix.
   *
   * Tags with any association id present in `window.booru.hiddenTagList`
   * are left out. An empty prefix yields no results.
   *
   * @param {string} prefix
   * @param {number} k
   * @returns {Result[]}
   */
  topK(prefix, k) {
    /** @type {Result[]} */
    const results = [];

    if (prefix === '') {
      return results;
    }

    /** @type {Set<number>} */
    //@ts-expect-error No type for window.booru yet
    const hiddenTags = new Set(window.booru.hiddenTagList);

    // Binary search for the first tag whose leading characters are not
    // smaller than the prefix. A lower bound is used so that the tag at
    // index 0 is a candidate like any other.
    let lo = 0;
    let hi = this.numTags;

    while (lo < hi) {
      const mid = (lo + hi) >>> 1;
      const { name } = this.getResultAt(mid);

      if (name.slice(0, prefix.length) < prefix) {
        // too small, go right
        lo = mid + 1;
      }
      else {
        // large enough, go left
        hi = mid;
      }
    }

    // Scan forward until no more matches occur
    for (let i = lo; i < this.numTags; i++) {
      const result = this.getResultAt(i);

      if (!result.name.startsWith(prefix)) {
        break;
      }

      // Add if no associations are filtered
      if (!result.associations.some(id => hiddenTags.has(id))) {
        results.push(result);
      }
    }

    // Sort results by image count
    results.sort((a, b) => b.imageCount - a.imageCount);

    return results.slice(0, k);
  }
}

View file

@ -10,6 +10,7 @@ background() {
mix run -e 'Philomena.Release.verify_artist_links()' mix run -e 'Philomena.Release.verify_artist_links()'
mix run -e 'Philomena.Release.update_stats()' mix run -e 'Philomena.Release.update_stats()'
mix run -e 'Philomena.Release.clean_moderation_logs()' mix run -e 'Philomena.Release.clean_moderation_logs()'
mix run -e 'Philomena.Release.generate_autocomplete()'
sleep 300 sleep 300
done done

View file

@ -0,0 +1,144 @@
defmodule Philomena.Autocomplete do
  @moduledoc """
  Pregenerated autocomplete files.
  """

  import Ecto.Query, warn: false
  alias Philomena.Repo

  alias Philomena.Tags.Tag
  alias Philomena.Images.Tagging
  alias Philomena.Autocomplete.Autocomplete

  @type tags_list() :: [{String.t(), number(), number()}]
  @type assoc_map() :: %{String.t() => [number()]}

  @doc """
  Returns the most recently created autocomplete record, or `nil` when the
  table is empty.
  """
  @spec get_autocomplete() :: Autocomplete.t() | nil
  def get_autocomplete do
    Autocomplete
    |> order_by(desc: :created_at)
    |> limit(1)
    |> Repo.one()
  end

  @doc """
  Builds the autocomplete binary from the current tags, inserts it as a new
  record, and deletes every record with an earlier `created_at`.

  Raises if the insert fails.
  """
  def generate_autocomplete! do
    tags = get_tags()
    associations = get_associations(tags)

    # Tags are already sorted, so just add them to the file directly
    #
    #     struct tag {
    #       uint8_t key_length;
    #       uint8_t key[];
    #       uint8_t association_length;
    #       uint32_t associations[];
    #     };
    #
    #     struct tag_reference {
    #       uint32_t tag_location;
    #       uint32_t num_uses;
    #     };
    #
    {ac_file, references} =
      Enum.reduce(tags, {<<>>, <<>>}, fn {name, images_count, _}, {file, references} ->
        pos = byte_size(file)
        assn = Map.get(associations, name, [])
        assn_bin = for id <- assn, into: <<>>, do: <<id::32-little>>

        {
          <<file::binary, byte_size(name)::8, name::binary, length(assn)::8, assn_bin::binary>>,
          <<references::binary, pos::32-little, images_count::32-little>>
        }
      end)

    ac_file = int32_align(ac_file)
    reference_start = byte_size(ac_file)

    # Finally add the reference start and number of tags in the footer
    #
    #     struct autocomplete_file {
    #       struct tag tags[];
    #       struct tag_reference references[];
    #       uint32_t format_version;
    #       uint32_t reference_start;
    #       uint32_t num_tags;
    #     };
    #
    ac_file =
      <<ac_file::binary, references::binary, 1::32-little, reference_start::32-little,
        length(tags)::32-little>>

    # Insert the autocomplete binary
    new_ac =
      %Autocomplete{}
      |> Autocomplete.changeset(%{content: ac_file})
      |> Repo.insert!()

    # Remove anything older
    Autocomplete
    |> where([ac], ac.created_at < ^new_ac.created_at)
    |> Repo.delete_all()
  end

  #
  # Get the name, number of uses and id of each tag as a list of tuples,
  # sorted by name. Only the 65,535 most used tags with at least one image
  # are fetched, and names of 255 bytes or more are dropped so the length
  # fits in a single byte.
  # Sort is done in the application to avoid collation.
  #
  @spec get_tags() :: tags_list()
  defp get_tags do
    Tag
    |> select([t], {t.name, t.images_count, t.id})
    |> where([t], t.images_count > 0)
    |> order_by(desc: :images_count)
    |> limit(65_535)
    |> Repo.all()
    |> Enum.filter(fn {name, _, _} -> byte_size(name) < 255 end)
    |> Enum.sort()
  end

  #
  # Get up to eight associated tag ids for each returned tag, as a map from
  # tag name to id list.
  #
  @spec get_associations(tags_list()) :: assoc_map()
  defp get_associations(tags) do
    tags
    |> Enum.map(fn {name, images_count, id} ->
      # Randomly sample 100 images with this tag
      image_sample =
        Tagging
        |> where(tag_id: ^id)
        |> select([it], it.image_id)
        |> order_by(asc: fragment("random()"))
        |> limit(100)

      # Select the tags from those images which have more uses than
      # the current one being considered, and overlap more than 50%
      assoc_ids =
        Tagging
        |> join(:inner, [it], _ in assoc(it, :tag))
        |> where([_, t], t.images_count > ^images_count)
        |> where([it, _], it.image_id in subquery(image_sample))
        |> group_by([_, t], t.id)
        |> order_by(desc: fragment("count(*)"))
        |> having([_, t], fragment("(100 * count(*)::float / LEAST(?, 100)) > 50", ^images_count))
        |> select([_, t], t.id)
        |> limit(8)
        |> Repo.all()

      {name, assoc_ids}
    end)
    |> Map.new()
  end

  #
  # Right-pad a binary to be a multiple of 4 bytes.
  # A binary that is already aligned is returned without extra padding.
  #
  @spec int32_align(binary()) :: binary()
  defp int32_align(bin) do
    # The outer rem/2 maps the already-aligned case (4 - 0 = 4) to 0.
    pad_bits = 8 * rem(4 - rem(byte_size(bin), 4), 4)

    <<bin::binary, 0::size(pad_bits)>>
  end
end

View file

@ -0,0 +1,17 @@
defmodule Philomena.Autocomplete.Autocomplete do
  @moduledoc """
  Schema for a row of the `autocomplete` table: a binary `content` field plus
  a `created_at` timestamp. The table has no primary key.
  """

  use Ecto.Schema
  import Ecto.Changeset

  # Referenced as Autocomplete.t() by the spec of
  # Philomena.Autocomplete.get_autocomplete/0.
  @type t :: %__MODULE__{}

  @primary_key false
  schema "autocomplete" do
    field :content, :binary

    # Only an insertion timestamp is kept, stored under created_at.
    timestamps(inserted_at: :created_at, updated_at: false, type: :utc_datetime)
  end

  @doc false
  def changeset(autocomplete, attrs) do
    autocomplete
    |> cast(attrs, [:content])
    |> validate_required([:content])
  end
end

View file

@ -34,6 +34,11 @@ defmodule Philomena.Release do
Philomena.ModerationLogs.cleanup!() Philomena.ModerationLogs.cleanup!()
end end
# Release task that regenerates the pregenerated autocomplete file.
# Calls start_app() (defined elsewhere in this module) first, then delegates
# to Philomena.Autocomplete.generate_autocomplete!/0.
def generate_autocomplete do
start_app()
Philomena.Autocomplete.generate_autocomplete!()
end
defp repos do defp repos do
Application.fetch_env!(@app, :ecto_repos) Application.fetch_env!(@app, :ecto_repos)
end end

View file

@ -0,0 +1,23 @@
defmodule PhilomenaWeb.Autocomplete.CompiledController do
  use PhilomenaWeb, :controller

  alias Philomena.Autocomplete

  # Serves the latest pregenerated autocomplete binary, or an empty 404 when
  # none has been generated. The session is dropped in both cases.
  def show(conn, _params) do
    respond(conn, Autocomplete.get_autocomplete())
  end

  # No autocomplete record exists: empty body with a 404 status.
  defp respond(conn, nil) do
    conn
    |> put_status(:not_found)
    |> configure_session(drop: true)
    |> text("")
  end

  # A record exists: return its content, publicly cacheable for one day.
  defp respond(conn, %{content: content}) do
    conn
    |> put_resp_header("cache-control", "public, max-age=86400")
    |> configure_session(drop: true)
    |> resp(200, content)
  end
end

View file

@ -461,6 +461,7 @@ defmodule PhilomenaWeb.Router do
scope "/autocomplete", Autocomplete, as: :autocomplete do scope "/autocomplete", Autocomplete, as: :autocomplete do
resources "/tags", TagController, only: [:show], singleton: true resources "/tags", TagController, only: [:show], singleton: true
resources "/compiled", CompiledController, only: [:show], singleton: true
end end
scope "/fetch", Fetch, as: :fetch do scope "/fetch", Fetch, as: :fetch do

View file

@ -0,0 +1,10 @@
# Migration that creates the "autocomplete" table used to store the
# pregenerated autocomplete binary.
defmodule Philomena.Repo.Migrations.CreateAutocomplete do
use Ecto.Migration
def change do
# The table is created without a primary key column.
create table(:autocomplete, primary_key: false) do
# Binary payload; must always be present.
add :content, :binary, null: false
# Adds only an insertion timestamp, named created_at; no updated_at column.
timestamps(inserted_at: :created_at, updated_at: false, type: :utc_datetime)
end
end
end

View file

@ -116,6 +116,16 @@ CREATE SEQUENCE public.artist_links_id_seq
ALTER SEQUENCE public.artist_links_id_seq OWNED BY public.artist_links.id; ALTER SEQUENCE public.artist_links_id_seq OWNED BY public.artist_links.id;
--
-- Name: autocomplete; Type: TABLE; Schema: public; Owner: -
--
CREATE TABLE public.autocomplete (
content bytea NOT NULL,
created_at timestamp(0) without time zone NOT NULL
);
-- --
-- Name: badge_awards; Type: TABLE; Schema: public; Owner: - -- Name: badge_awards; Type: TABLE; Schema: public; Owner: -
-- --
@ -4959,3 +4969,4 @@ INSERT INTO public."schema_migrations" (version) VALUES (20210917190346);
INSERT INTO public."schema_migrations" (version) VALUES (20210921025336); INSERT INTO public."schema_migrations" (version) VALUES (20210921025336);
INSERT INTO public."schema_migrations" (version) VALUES (20210929181319); INSERT INTO public."schema_migrations" (version) VALUES (20210929181319);
INSERT INTO public."schema_migrations" (version) VALUES (20211107130226); INSERT INTO public."schema_migrations" (version) VALUES (20211107130226);
INSERT INTO public."schema_migrations" (version) VALUES (20211219194836);