Merge pull request #380 from philomena-dev/faster-index

Remove makefile indexer and improve Elixir-side indexing
liamwhite 2024-12-15 20:55:09 -05:00 committed by GitHub
commit 6ef53545dd
12 changed files with 156 additions and 538 deletions


@@ -101,7 +101,9 @@ if config_env() != :test do
     url: System.fetch_env!("DATABASE_URL"),
     pool_size: String.to_integer(System.get_env("POOL_SIZE", "16")),
     timeout: 60_000,
-    ownership_timeout: 60_000
+    ownership_timeout: 60_000,
+    queue_target: 20_000,
+    queue_interval: 20_000
 end

 if config_env() == :prod do
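
For context: queue_target and queue_interval are DBConnection pool options, both in milliseconds. If checkouts keep waiting in the queue longer than queue_target over a queue_interval window, the pool starts returning errors, so raising both to 20 seconds makes the pool far more tolerant of the long checkouts that bulk reindexing produces. A minimal sketch of the resulting Repo block, assuming the usual :philomena app and Philomena.Repo names (the config call itself is not visible in this hunk):

# Sketch only; queue_target/queue_interval are standard DBConnection options,
# and the app/repo names are assumed rather than taken from this diff.
config :philomena, Philomena.Repo,
  url: System.fetch_env!("DATABASE_URL"),
  pool_size: String.to_integer(System.get_env("POOL_SIZE", "16")),
  timeout: 60_000,
  ownership_timeout: 60_000,
  queue_target: 20_000,
  queue_interval: 20_000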


@@ -1,25 +0,0 @@
all: comments galleries images posts reports tags filters
comments:
$(MAKE) -f comments.mk
galleries:
$(MAKE) -f galleries.mk
images:
$(MAKE) -f images.mk
posts:
$(MAKE) -f posts.mk
reports:
$(MAKE) -f reports.mk
tags:
$(MAKE) -f tags.mk
filters:
$(MAKE) -f filters.mk
clean:
rm -f ./*.jsonl


@@ -1,49 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:
all: import_es
import_es: dump_jsonl
$(ELASTICDUMP) --input=comments.jsonl --output=$OPENSEARCH_URL --output-index=comments --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"
dump_jsonl: metadata authors tags
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_comments.jsonb_object_agg(object) from temp_comments.comment_search_json group by comment_id) to stdout;' > comments.jsonl
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_comments cascade;'
sed -i comments.jsonl -e 's/\\\\/\\/g'
metadata: comment_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object(
'id', c.id,
'posted_at', c.created_at,
'ip', c.ip,
'fingerprint', c.fingerprint,
'image_id', c.image_id,
'user_id', c.user_id,
'anonymous', c.anonymous,
'body', c.body,
'hidden_from_users', (c.hidden_from_users or i.hidden_from_users)
) from comments c inner join images i on c.image_id=i.id;
SQL
authors: comment_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object('author', (case when c.anonymous='t' then null else u.name end)) from comments c left join users u on c.user_id=u.id;
SQL
tags: comment_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
create unlogged table temp_comments.image_tags (image_id bigint not null, tags jsonb not null);
insert into temp_comments.image_tags (image_id, tags) select it.image_id, jsonb_agg(it.tag_id) from image_taggings it group by it.image_id;
insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object('image_tag_ids', it.tags) from comments c inner join temp_comments.image_tags it on c.image_id=it.image_id;
SQL
comment_search_json:
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
drop schema if exists temp_comments cascade;
create schema temp_comments;
create unlogged table temp_comments.comment_search_json (comment_id bigint not null, object jsonb not null);
create or replace aggregate temp_comments.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
SQL
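
Each target above inserts a partial JSONB object keyed by comment_id, and the custom jsonb_object_agg aggregate (built on jsonb_concat) folds those partials into one search document per comment when dump_jsonl runs. A hedged illustration of what a merged document looks like, written as an Elixir map; the keys follow the metadata, authors and tags targets above (not every field shown), and the values are invented:

# Illustration only: field names come from the SQL above, values are made up.
%{
  "id" => 1,
  "posted_at" => "2024-01-01T00:00:00Z",
  "image_id" => 100,
  "user_id" => nil,
  "anonymous" => true,
  "body" => "example comment",
  "hidden_from_users" => false,
  "author" => nil,
  "image_tag_ids" => [10, 20]
}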


@@ -1,47 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:
all: import_es
import_es: dump_jsonl
$(ELASTICDUMP) --input=filters.jsonl --output=$OPENSEARCH_URL --output-index=filters --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"
dump_jsonl: metadata creators
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_filters.jsonb_object_agg(object) from temp_filters.filter_search_json group by filter_id) to stdout;' > filters.jsonl
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_filters cascade;'
sed -i filters.jsonl -e 's/\\\\/\\/g'
metadata: filter_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_filters.filter_search_json (filter_id, object) select f.id, jsonb_build_object(
'id', f.id,
'created_at', f.created_at,
'user_id', f.user_id,
'public', f.public or f.system,
'system', f.system,
'name', lower(f.name),
'description', f.description,
'spoilered_count', array_length(f.spoilered_tag_ids, 1),
'hidden_count', array_length(f.hidden_tag_ids, 1),
'spoilered_tag_ids', f.spoilered_tag_ids,
'hidden_tag_ids', f.hidden_tag_ids,
'spoilered_complex_str', lower(f.spoilered_complex_str),
'hidden_complex_str', lower(f.hidden_complex_str),
'user_count', f.user_count
) from filters f;
SQL
creators: filter_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_filters.filter_search_json (filter_id, object) select f.id, jsonb_build_object('creator', lower(u.name)) from filters f left join users u on f.user_id=u.id;
SQL
filter_search_json:
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
drop schema if exists temp_filters cascade;
create schema temp_filters;
create unlogged table temp_filters.filter_search_json (filter_id bigint not null, object jsonb not null);
create or replace aggregate temp_filters.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
SQL


@@ -1,45 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:
all: import_es
import_es: dump_jsonl
$(ELASTICDUMP) --input=galleries.jsonl --output=$OPENSEARCH_URL --output-index=galleries --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"
dump_jsonl: metadata subscribers images
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_galleries.jsonb_object_agg(object) from temp_galleries.gallery_search_json group by gallery_id) to stdout;' > galleries.jsonl
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_galleries cascade;'
sed -i galleries.jsonl -e 's/\\\\/\\/g'
metadata: gallery_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_galleries.gallery_search_json (gallery_id, object) select g.id, jsonb_build_object(
'id', g.id,
'image_count', g.image_count,
'updated_at', g.updated_at,
'created_at', g.created_at,
'title', lower(g.title),
'creator', lower(u.name),
'description', g.description
) from galleries g left join users u on g.creator_id=u.id;
SQL
subscribers: gallery_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_galleries.gallery_search_json (gallery_id, object) select gallery_id, json_build_object('watcher_ids', jsonb_agg(user_id), 'watcher_count', count(*)) from gallery_subscriptions group by gallery_id;
SQL
images: gallery_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_galleries.gallery_search_json (gallery_id, object) select gallery_id, json_build_object('image_ids', jsonb_agg(image_id)) from gallery_interactions group by gallery_id;
SQL
gallery_search_json:
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
drop schema if exists temp_galleries cascade;
create schema temp_galleries;
create unlogged table temp_galleries.gallery_search_json (gallery_id bigint not null, object jsonb not null);
create or replace aggregate temp_galleries.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
SQL


@@ -1,156 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:
all: import_es
import_es: dump_jsonl
$(ELASTICDUMP) --input=images.jsonl --output=$OPENSEARCH_URL --output-index=images --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"
dump_jsonl: metadata true_uploaders uploaders deleters galleries tags sources hides upvotes downvotes faves tag_names
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_images.jsonb_object_agg(object) from temp_images.image_search_json group by image_id) to stdout;' > images.jsonl
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_images cascade;'
sed -i images.jsonl -e 's/\\\\/\\/g'
metadata: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select id, jsonb_build_object(
'approved', approved,
'animated', is_animated,
'anonymous', anonymous,
'aspect_ratio', nullif(image_aspect_ratio, 'NaN'::float8),
'comment_count', comments_count,
'created_at', created_at,
'deletion_reason', deletion_reason,
'description', description,
'downvotes', downvotes_count,
'duplicate_id', duplicate_id,
'duration', (case when is_animated then image_duration else 0::float end),
'faves', faves_count,
'file_name', image_name,
'fingerprint', fingerprint,
'first_seen_at', first_seen_at,
'height', image_height,
'hidden_from_users', hidden_from_users,
'id', id,
'ip', ip,
'mime_type', image_mime_type,
'orig_sha512_hash', image_orig_sha512_hash,
'original_format', image_format,
'pixels', cast(image_width as bigint)*cast(image_height as bigint),
'processed', processed,
'score', score,
'size', image_size,
'orig_size', image_orig_size,
'sha512_hash', image_sha512_hash,
'thumbnails_generated', thumbnails_generated,
'updated_at', updated_at,
'upvotes', upvotes_count,
'width', image_width,
'wilson_score', temp_images.wilson_995(upvotes_count, downvotes_count)
) from images;
SQL
true_uploaders: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('true_uploader_id', u.id, 'true_uploader', u.name) from images i left join users u on u.id = i.user_id;
SQL
uploaders: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('uploader_id', (case when i.anonymous = 't' then null else u.id end), 'uploader', (case when i.anonymous = 't' then null else lower(u.name) end)) from images i left join users u on u.id = i.user_id;
SQL
deleters: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('deleted_by_user_id', u.id, 'deleted_by_user', lower(u.name)) from images i left join users u on u.id = i.deleted_by_id;
SQL
galleries: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select gi.image_id, jsonb_build_object('gallery_interactions', jsonb_agg(jsonb_build_object('id', gi.gallery_id, 'position', gi.position))) from gallery_interactions gi group by image_id;
SQL
tags: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select it.image_id, jsonb_build_object(
'tag_ids', jsonb_agg(it.tag_id),
'tag_count', count(*),
'error_tag_count', count(case when t.category = 'error' then t.category else null end),
'rating_tag_count', count(case when t.category = 'rating' then t.category else null end),
'origin_tag_count', count(case when t.category = 'origin' then t.category else null end),
'character_tag_count', count(case when t.category = 'character' then t.category else null end),
'oc_tag_count', count(case when t.category = 'oc' then t.category else null end),
'species_tag_count', count(case when t.category = 'species' then t.category else null end),
'body_type_tag_count', count(case when t.category = 'body-type' then t.category else null end),
'content_fanmade_tag_count', count(case when t.category = 'content-fanmade' then t.category else null end),
'content_official_tag_count', count(case when t.category = 'content-official' then t.category else null end),
'spoiler_tag_count', count(case when t.category = 'spoiler' then t.category else null end)
) from image_taggings it inner join tags t on t.id = it.tag_id group by image_id;
SQL
sources: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select s.image_id, jsonb_build_object('source_url', jsonb_agg(lower(s.source)), 'source_count', count(*)) from image_sources s group by image_id;
SQL
hides: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select ih.image_id, jsonb_build_object('hidden_by_user_ids', jsonb_agg(ih.user_id), 'hidden_by_users', jsonb_agg(lower(u.name))) from image_hides ih inner join users u on u.id = ih.user_id group by image_id;
SQL
downvotes: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select iv.image_id, jsonb_build_object('downvoter_ids', jsonb_agg(iv.user_id), 'downvoters', jsonb_agg(lower(u.name))) from image_votes iv inner join users u on u.id = iv.user_id where iv.up = false group by image_id;
SQL
upvotes: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select iv.image_id, jsonb_build_object('upvoter_ids', jsonb_agg(iv.user_id), 'upvoters', jsonb_agg(lower(u.name))) from image_votes iv inner join users u on u.id = iv.user_id where iv.up = true group by image_id;
SQL
faves: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select if.image_id, jsonb_build_object('favourited_by_user_ids', jsonb_agg(if.user_id), 'favourited_by_users', jsonb_agg(lower(u.name))) from image_faves if inner join users u on u.id = if.user_id group by image_id;
SQL
tag_names: tags_with_aliases
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_images.image_search_json (image_id, object) select image_id, jsonb_build_object('namespaced_tags', jsonb_build_object('name', jsonb_agg(lower(tag_name)))) from temp_images.tags_with_aliases group by image_id;
SQL
tags_with_aliases: image_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
create unlogged table if not exists temp_images.tags_with_aliases (image_id bigint not null, tag_name text not null);
truncate temp_images.tags_with_aliases;
insert into temp_images.tags_with_aliases (image_id, tag_name) select it.image_id, t.name from image_taggings it inner join tags t on t.id = it.tag_id;
insert into temp_images.tags_with_aliases (image_id, tag_name) select it.image_id, t.name from image_taggings it left outer join tags t on t.aliased_tag_id = it.tag_id where t.name is not null;
SQL
image_search_json:
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
drop schema if exists temp_images cascade;
create schema temp_images;
create unlogged table temp_images.image_search_json (image_id bigint not null, object jsonb not null);
create function temp_images.wilson_995(succ bigint, fail bigint) returns double precision as '
declare
n double precision;
p_hat double precision;
z double precision;
z2 double precision;
begin
if succ <= 0 then
return 0;
end if;
n := succ + fail;
p_hat := succ / n;
z := 2.57583;
z2 := 6.634900189;
return (p_hat + z2 / (2 * n) - z * sqrt((p_hat * (1 - p_hat) + z2 / (4 * n)) / n)) / (1 + z2 / n);
end
' language plpgsql;
create aggregate temp_images.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
SQL
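
The wilson_995 helper above computes the lower bound of the Wilson score confidence interval, treating upvotes as successes and downvotes as failures, with z = 2.57583 (roughly the 99.5% normal quantile) and z² = 6.6349 precomputed. With n = succ + fail and p̂ = succ / n, the value returned by the plpgsql body corresponds to:

\[
  \frac{\hat{p} + \dfrac{z^2}{2n} - z\sqrt{\dfrac{\hat{p}(1-\hat{p}) + \dfrac{z^2}{4n}}{n}}}{1 + \dfrac{z^2}{n}}
\]

with the function short-circuiting to 0 whenever succ <= 0.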


@@ -1,51 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:
all: import_es
import_es: dump_jsonl
$(ELASTICDUMP) --input=posts.jsonl --output=$OPENSEARCH_URL --output-index=posts --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"
dump_jsonl: metadata authors
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_posts.jsonb_object_agg(object) from temp_posts.post_search_json group by post_id) to stdout;' > posts.jsonl
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_posts cascade;'
sed -i posts.jsonl -e 's/\\\\/\\/g'
metadata: post_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_posts.post_search_json (post_id, object) select p.id, jsonb_build_object(
'id', p.id,
'topic_id', p.topic_id,
'body', p.body,
'subject', t.title,
'ip', p.ip,
'user_agent', '',
'referrer', '',
'fingerprint', p.fingerprint,
'topic_position', p.topic_position,
'forum', f.short_name,
'forum_id', t.forum_id,
'user_id', p.user_id,
'anonymous', p.anonymous,
'created_at', p.created_at,
'updated_at', p.updated_at,
'deleted', p.hidden_from_users,
'destroyed_content', p.destroyed_content,
'access_level', f.access_level
) from posts p inner join topics t on t.id=p.topic_id inner join forums f on f.id=t.forum_id;
SQL
authors: post_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_posts.post_search_json (post_id, object) select p.id, jsonb_build_object('author', (case when p.anonymous='t' then null else u.name end)) from posts p left join users u on p.user_id=u.id;
SQL
post_search_json:
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
drop schema if exists temp_posts cascade;
create schema temp_posts;
create unlogged table temp_posts.post_search_json (post_id bigint not null, object jsonb not null);
create or replace aggregate temp_posts.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
SQL


@@ -1,51 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:
all: import_es
import_es: dump_jsonl
$(ELASTICDUMP) --input=reports.jsonl --output=$OPENSEARCH_URL --output-index=reports --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"
dump_jsonl: metadata image_ids comment_image_ids
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_reports.jsonb_object_agg(object) from temp_reports.report_search_json group by report_id) to stdout;' > reports.jsonl
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_reports cascade;'
sed -i reports.jsonl -e 's/\\\\/\\/g'
metadata: report_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object(
'id', r.id,
'created_at', r.created_at,
'ip', r.ip,
'state', r.state,
'user', lower(u.name),
'user_id', r.user_id,
'admin', lower(a.name),
'admin_id', r.admin_id,
'reportable_type', r.reportable_type,
'reportable_id', r.reportable_id,
'fingerprint', r.fingerprint,
'open', r.open,
'reason', r.reason
) from reports r left join users u on r.user_id=u.id left join users a on r.admin_id=a.id;
SQL
image_ids: report_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object('image_id', r.reportable_id) from reports r where r.reportable_type = 'Image';
SQL
comment_image_ids: report_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object('image_id', c.image_id) from reports r inner join comments c on c.id = r.reportable_id where r.reportable_type = 'Comment';
SQL
report_search_json:
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
drop schema if exists temp_reports cascade;
create schema temp_reports;
create unlogged table temp_reports.report_search_json (report_id bigint not null, object jsonb not null);
create or replace aggregate temp_reports.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
SQL


@@ -1,54 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:
all: import_es
import_es: dump_jsonl
$(ELASTICDUMP) --input=tags.jsonl --output=$OPENSEARCH_URL --output-index=tags --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"
dump_jsonl: metadata aliases implied_tags implied_by_tags
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_tags.jsonb_object_agg(object) from temp_tags.tag_search_json group by tag_id) to stdout;' > tags.jsonl
psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_tags cascade;'
sed -i tags.jsonl -e 's/\\\\/\\/g'
metadata: tag_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_tags.tag_search_json (tag_id, object) select t.id, jsonb_build_object(
'id', t.id,
'slug', t.slug,
'name', t.name,
'name_in_namespace', t.name_in_namespace,
'namespace', t.namespace,
'analyzed_name', t.name,
'aliased_tag', at.name,
'category', t.category,
'aliased', (t.aliased_tag_id is not null),
'description', t.description,
'short_description', t.short_description
) from tags t left join tags at on t.aliased_tag_id=at.id;
SQL
aliases: tag_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_tags.tag_search_json (tag_id, object) select t.aliased_tag_id, jsonb_build_object('aliases', jsonb_agg(t.name)) from tags t inner join tags at on t.aliased_tag_id=t.id group by t.aliased_tag_id;
SQL
implied_tags: tag_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_tags.tag_search_json (tag_id, object) select it.tag_id, jsonb_build_object('implied_tag_ids', jsonb_agg(it.implied_tag_id), 'implied_tags', jsonb_agg(t.name)) from tags_implied_tags it inner join tags t on t.id=it.implied_tag_id group by it.tag_id;
SQL
implied_by_tags: tag_search_json
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
insert into temp_tags.tag_search_json (tag_id, object) select it.implied_tag_id, jsonb_build_object('implied_by_tags', jsonb_agg(t.name)) from tags_implied_tags it inner join tags t on t.id=it.tag_id group by it.implied_tag_id;
SQL
tag_search_json:
psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
drop schema if exists temp_tags cascade;
create schema temp_tags;
create unlogged table temp_tags.tag_search_json (tag_id bigint not null, object jsonb not null);
create or replace aggregate temp_tags.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
SQL


@@ -1,31 +1,7 @@
 defmodule Mix.Tasks.ReindexAll do
   use Mix.Task
-  alias PhilomenaQuery.Search
-  alias Philomena.{
-    Comments.Comment,
-    Galleries.Gallery,
-    Posts.Post,
-    Images.Image,
-    Reports.Report,
-    Tags.Tag,
-    Filters.Filter
-  }
-  alias Philomena.{Comments, Galleries, Posts, Images, Tags, Filters}
-  alias Philomena.Polymorphic
-  alias Philomena.Repo
-  import Ecto.Query
-  @indices [
-    {Images, Image},
-    {Comments, Comment},
-    {Galleries, Gallery},
-    {Tags, Tag},
-    {Posts, Post},
-    {Filters, Filter}
-  ]
+  alias Philomena.SearchIndexer
   @shortdoc "Destroys and recreates all OpenSearch indices."
   @requirements ["app.start"]
@@ -35,26 +11,6 @@ defmodule Mix.Tasks.ReindexAll do
       raise "do not run this task unless you know what you're doing"
     end
-    @indices
-    |> Enum.map(fn {context, schema} ->
-      Task.async(fn ->
-        Search.delete_index!(schema)
-        Search.create_index!(schema)
-        Search.reindex(preload(schema, ^context.indexing_preloads()), schema)
-      end)
-    end)
-    |> Task.await_many(:infinity)
-    # Reports are a bit special
-    Search.delete_index!(Report)
-    Search.create_index!(Report)
-    Report
-    |> preload([:user, :admin])
-    |> Repo.all()
-    |> Polymorphic.load_polymorphic(reportable: [reportable_id: :reportable_type])
-    |> Enum.map(&Search.index_document(&1, Report))
+    SearchIndexer.recreate_reindex_all_destructive!()
   end
 end


@@ -0,0 +1,134 @@
defmodule Philomena.SearchIndexer do
  alias PhilomenaQuery.Batch
  alias PhilomenaQuery.Search

  alias Philomena.Comments
  alias Philomena.Comments.Comment
  alias Philomena.Filters
  alias Philomena.Filters.Filter
  alias Philomena.Galleries
  alias Philomena.Galleries.Gallery
  alias Philomena.Images
  alias Philomena.Images.Image
  alias Philomena.Posts
  alias Philomena.Posts.Post
  alias Philomena.Reports
  alias Philomena.Reports.Report
  alias Philomena.Tags
  alias Philomena.Tags.Tag

  alias Philomena.Polymorphic
  import Ecto.Query

  @schemas [
    Comment,
    Filter,
    Gallery,
    Image,
    Post,
    Report,
    Tag
  ]

  @contexts %{
    Comment => Comments,
    Filter => Filters,
    Gallery => Galleries,
    Image => Images,
    Post => Posts,
    Report => Reports,
    Tag => Tags
  }

  @doc """
  Recreate the index corresponding to all schemas, and then reindex all of the
  documents within.

  ## Example

      iex> SearchIndexer.recreate_reindex_all_destructive!()
      :ok

  """
  @spec recreate_reindex_all_destructive! :: :ok
  def recreate_reindex_all_destructive! do
    @schemas
    |> Task.async_stream(
      &recreate_reindex_schema_destructive!/1,
      ordered: false,
      timeout: :infinity
    )
    |> Stream.run()
  end

  @doc """
  Recreate the index corresponding to a schema, and then reindex all of the
  documents within the schema.

  ## Example

      iex> SearchIndexer.recreate_reindex_schema_destructive!(Report)
      :ok

  """
  @spec recreate_reindex_schema_destructive!(schema :: module()) :: :ok
  def recreate_reindex_schema_destructive!(schema) when schema in @schemas do
    Search.delete_index!(schema)
    Search.create_index!(schema)

    reindex_schema(schema)
  end

  @doc """
  Reindex all of the documents within all schemas.

  ## Example

      iex> SearchIndexer.reindex_all()
      :ok

  """
  @spec reindex_all :: :ok
  def reindex_all do
    @schemas
    |> Task.async_stream(
      &reindex_schema/1,
      ordered: false,
      timeout: :infinity
    )
    |> Stream.run()
  end

  @doc """
  Reindex all of the documents within a single schema.

  ## Example

      iex> SearchIndexer.reindex_schema(Report)
      :ok

  """
  @spec reindex_schema(schema :: module()) :: :ok
  def reindex_schema(schema)

  def reindex_schema(Report) do
    # Reports currently require handling for their polymorphic nature
    Report
    |> preload([:user, :admin])
    |> Batch.record_batches()
    |> Enum.each(fn records ->
      records
      |> Polymorphic.load_polymorphic(reportable: [reportable_id: :reportable_type])
      |> Enum.map(&Search.index_document(&1, Report))
    end)
  end

  def reindex_schema(schema) when schema in @schemas do
    # Normal schemas can simply be reindexed with indexing_preloads
    context = Map.fetch!(@contexts, schema)

    schema
    |> preload(^context.indexing_preloads())
    |> Search.reindex(schema)
  end
end
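
A short usage sketch grounded in the module's own doctests; recreate_reindex_all_destructive!/0 drops and recreates every index before reindexing, so it belongs only where the ReindexAll Mix task above would run, while reindex_schema/1 refreshes a single schema without touching its index mapping:

# Assumes an attached IEx session (iex -S mix); both calls return :ok.
alias Philomena.SearchIndexer
alias Philomena.Tags.Tag

# Reindex one schema in place.
SearchIndexer.reindex_schema(Tag)

# Destructive: recreates every index, then reindexes all schemas concurrently.
SearchIndexer.recreate_reindex_all_destructive!()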


@@ -203,9 +203,8 @@ defmodule PhilomenaQuery.Search do
   def reindex(queryable, module, opts \\ []) do
     index = @policy.index_for(module)

-    queryable
-    |> Batch.record_batches(opts)
-    |> Enum.each(fn records ->
+    process =
+      fn records ->
       lines =
         Enum.flat_map(records, fn record ->
           doc = index.as_json(record)
@@ -217,7 +216,12 @@ defmodule PhilomenaQuery.Search do
         end)

       Api.bulk(@policy.opensearch_url(), lines)
-    end)
+      end
+
+    queryable
+    |> Batch.record_batches(opts)
+    |> Task.async_stream(process, ordered: false, timeout: :infinity)
+    |> Stream.run()
   end

   @doc ~S"""