Mirror of https://github.com/philomena-dev/philomena.git
Merge pull request #380 from philomena-dev/faster-index
Remove makefile indexer and improve Elixir-side indexing
Commit 6ef53545dd
12 changed files with 156 additions and 538 deletions
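In practical terms, the out-of-band `index/*.mk` pipeline (psql plus elasticdump) is replaced by the new `Philomena.SearchIndexer` module added in this diff, and the `reindex_all` Mix task now delegates to it. A minimal sketch of how the new entry points would be used from an attached IEx session (assuming the application is started so the repo and OpenSearch are reachable); both functions appear later in this diff:

    # Rebuild every index from scratch (drops and recreates each OpenSearch index):
    Philomena.SearchIndexer.recreate_reindex_all_destructive!()

    # Reindex the documents of a single schema without recreating its index:
    Philomena.SearchIndexer.reindex_schema(Philomena.Images.Image)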
Database pool configuration:

@@ -101,7 +101,9 @@ if config_env() != :test do
     url: System.fetch_env!("DATABASE_URL"),
     pool_size: String.to_integer(System.get_env("POOL_SIZE", "16")),
     timeout: 60_000,
-    ownership_timeout: 60_000
+    ownership_timeout: 60_000,
+    queue_target: 20_000,
+    queue_interval: 20_000
 end

 if config_env() == :prod do
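The two new options are standard Ecto/DBConnection queue settings, given in milliseconds: `queue_target` is roughly how long a connection checkout may wait before the pool considers itself overloaded, and `queue_interval` is the window over which that condition must hold before checkouts start being rejected. Raising both to 20 seconds keeps long bulk-indexing runs from tripping connection queue timeouts. A sketch of the resulting repo block, with the `Philomena.Repo` module name assumed from the application's conventions:

    # Sketch only; the repo module name is assumed, the option values are from the diff above.
    config :philomena, Philomena.Repo,
      url: System.fetch_env!("DATABASE_URL"),
      pool_size: String.to_integer(System.get_env("POOL_SIZE", "16")),
      timeout: 60_000,
      ownership_timeout: 60_000,
      queue_target: 20_000,
      queue_interval: 20_000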
index/all.mk (deleted, 25 lines):

@@ -1,25 +0,0 @@
all: comments galleries images posts reports tags filters

comments:
	$(MAKE) -f comments.mk

galleries:
	$(MAKE) -f galleries.mk

images:
	$(MAKE) -f images.mk

posts:
	$(MAKE) -f posts.mk

reports:
	$(MAKE) -f reports.mk

tags:
	$(MAKE) -f tags.mk

filters:
	$(MAKE) -f filters.mk

clean:
	rm -f ./*.jsonl
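Each of the per-type makefiles removed below followed the same pattern: build per-row JSON fragments in a temporary Postgres schema, merge them with a `jsonb_concat`-based aggregate, dump one JSON document per line, and push the result into OpenSearch with elasticdump. The rough Elixir-side equivalent of the old `all` target, expressed in terms of the module added later in this diff (a sketch, not part of the change itself):

    # Reindex every schema the old makefiles covered, in-process and in parallel:
    Philomena.SearchIndexer.reindex_all()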
index/comments.mk (deleted, 49 lines):

@@ -1,49 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=comments.jsonl --output=$OPENSEARCH_URL --output-index=comments --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"

dump_jsonl: metadata authors tags
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_comments.jsonb_object_agg(object) from temp_comments.comment_search_json group by comment_id) to stdout;' > comments.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_comments cascade;'
	sed -i comments.jsonl -e 's/\\\\/\\/g'

metadata: comment_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object(
			'id', c.id,
			'posted_at', c.created_at,
			'ip', c.ip,
			'fingerprint', c.fingerprint,
			'image_id', c.image_id,
			'user_id', c.user_id,
			'anonymous', c.anonymous,
			'body', c.body,
			'hidden_from_users', (c.hidden_from_users or i.hidden_from_users)
		) from comments c inner join images i on c.image_id=i.id;
	SQL

authors: comment_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object('author', (case when c.anonymous='t' then null else u.name end)) from comments c left join users u on c.user_id=u.id;
	SQL

tags: comment_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		create unlogged table temp_comments.image_tags (image_id bigint not null, tags jsonb not null);
		insert into temp_comments.image_tags (image_id, tags) select it.image_id, jsonb_agg(it.tag_id) from image_taggings it group by it.image_id;
		insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object('image_tag_ids', it.tags) from comments c inner join temp_comments.image_tags it on c.image_id=it.image_id;
	SQL

comment_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_comments cascade;
		create schema temp_comments;
		create unlogged table temp_comments.comment_search_json (comment_id bigint not null, object jsonb not null);
		create or replace aggregate temp_comments.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
index/filters.mk (deleted, 47 lines):

@@ -1,47 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=filters.jsonl --output=$OPENSEARCH_URL --output-index=filters --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"

dump_jsonl: metadata creators
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_filters.jsonb_object_agg(object) from temp_filters.filter_search_json group by filter_id) to stdout;' > filters.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_filters cascade;'
	sed -i filters.jsonl -e 's/\\\\/\\/g'

metadata: filter_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_filters.filter_search_json (filter_id, object) select f.id, jsonb_build_object(
			'id', f.id,
			'created_at', f.created_at,
			'user_id', f.user_id,
			'public', f.public or f.system,
			'system', f.system,
			'name', lower(f.name),
			'description', f.description,
			'spoilered_count', array_length(f.spoilered_tag_ids, 1),
			'hidden_count', array_length(f.hidden_tag_ids, 1),
			'spoilered_tag_ids', f.spoilered_tag_ids,
			'hidden_tag_ids', f.hidden_tag_ids,
			'spoilered_complex_str', lower(f.spoilered_complex_str),
			'hidden_complex_str', lower(f.hidden_complex_str),
			'user_count', f.user_count
		) from filters f;
	SQL

creators: filter_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_filters.filter_search_json (filter_id, object) select f.id, jsonb_build_object('creator', lower(u.name)) from filters f left join users u on f.user_id=u.id;
	SQL

filter_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_filters cascade;
		create schema temp_filters;
		create unlogged table temp_filters.filter_search_json (filter_id bigint not null, object jsonb not null);
		create or replace aggregate temp_filters.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
index/galleries.mk (deleted, 45 lines):

@@ -1,45 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=galleries.jsonl --output=$OPENSEARCH_URL --output-index=galleries --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"

dump_jsonl: metadata subscribers images
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_galleries.jsonb_object_agg(object) from temp_galleries.gallery_search_json group by gallery_id) to stdout;' > galleries.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_galleries cascade;'
	sed -i galleries.jsonl -e 's/\\\\/\\/g'

metadata: gallery_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_galleries.gallery_search_json (gallery_id, object) select g.id, jsonb_build_object(
			'id', g.id,
			'image_count', g.image_count,
			'updated_at', g.updated_at,
			'created_at', g.created_at,
			'title', lower(g.title),
			'creator', lower(u.name),
			'description', g.description
		) from galleries g left join users u on g.creator_id=u.id;
	SQL

subscribers: gallery_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_galleries.gallery_search_json (gallery_id, object) select gallery_id, json_build_object('watcher_ids', jsonb_agg(user_id), 'watcher_count', count(*)) from gallery_subscriptions group by gallery_id;
	SQL

images: gallery_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_galleries.gallery_search_json (gallery_id, object) select gallery_id, json_build_object('image_ids', jsonb_agg(image_id)) from gallery_interactions group by gallery_id;
	SQL

gallery_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_galleries cascade;
		create schema temp_galleries;
		create unlogged table temp_galleries.gallery_search_json (gallery_id bigint not null, object jsonb not null);
		create or replace aggregate temp_galleries.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
index/images.mk (deleted, 156 lines):

@@ -1,156 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=images.jsonl --output=$OPENSEARCH_URL --output-index=images --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"

dump_jsonl: metadata true_uploaders uploaders deleters galleries tags sources hides upvotes downvotes faves tag_names
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_images.jsonb_object_agg(object) from temp_images.image_search_json group by image_id) to stdout;' > images.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_images cascade;'
	sed -i images.jsonl -e 's/\\\\/\\/g'

metadata: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select id, jsonb_build_object(
			'approved', approved,
			'animated', is_animated,
			'anonymous', anonymous,
			'aspect_ratio', nullif(image_aspect_ratio, 'NaN'::float8),
			'comment_count', comments_count,
			'created_at', created_at,
			'deletion_reason', deletion_reason,
			'description', description,
			'downvotes', downvotes_count,
			'duplicate_id', duplicate_id,
			'duration', (case when is_animated then image_duration else 0::float end),
			'faves', faves_count,
			'file_name', image_name,
			'fingerprint', fingerprint,
			'first_seen_at', first_seen_at,
			'height', image_height,
			'hidden_from_users', hidden_from_users,
			'id', id,
			'ip', ip,
			'mime_type', image_mime_type,
			'orig_sha512_hash', image_orig_sha512_hash,
			'original_format', image_format,
			'pixels', cast(image_width as bigint)*cast(image_height as bigint),
			'processed', processed,
			'score', score,
			'size', image_size,
			'orig_size', image_orig_size,
			'sha512_hash', image_sha512_hash,
			'thumbnails_generated', thumbnails_generated,
			'updated_at', updated_at,
			'upvotes', upvotes_count,
			'width', image_width,
			'wilson_score', temp_images.wilson_995(upvotes_count, downvotes_count)
		) from images;
	SQL

true_uploaders: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('true_uploader_id', u.id, 'true_uploader', u.name) from images i left join users u on u.id = i.user_id;
	SQL

uploaders: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('uploader_id', (case when i.anonymous = 't' then null else u.id end), 'uploader', (case when i.anonymous = 't' then null else lower(u.name) end)) from images i left join users u on u.id = i.user_id;
	SQL

deleters: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('deleted_by_user_id', u.id, 'deleted_by_user', lower(u.name)) from images i left join users u on u.id = i.deleted_by_id;
	SQL

galleries: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select gi.image_id, jsonb_build_object('gallery_interactions', jsonb_agg(jsonb_build_object('id', gi.gallery_id, 'position', gi.position))) from gallery_interactions gi group by image_id;
	SQL

tags: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select it.image_id, jsonb_build_object(
			'tag_ids', jsonb_agg(it.tag_id),
			'tag_count', count(*),
			'error_tag_count', count(case when t.category = 'error' then t.category else null end),
			'rating_tag_count', count(case when t.category = 'rating' then t.category else null end),
			'origin_tag_count', count(case when t.category = 'origin' then t.category else null end),
			'character_tag_count', count(case when t.category = 'character' then t.category else null end),
			'oc_tag_count', count(case when t.category = 'oc' then t.category else null end),
			'species_tag_count', count(case when t.category = 'species' then t.category else null end),
			'body_type_tag_count', count(case when t.category = 'body-type' then t.category else null end),
			'content_fanmade_tag_count', count(case when t.category = 'content-fanmade' then t.category else null end),
			'content_official_tag_count', count(case when t.category = 'content-official' then t.category else null end),
			'spoiler_tag_count', count(case when t.category = 'spoiler' then t.category else null end)
		) from image_taggings it inner join tags t on t.id = it.tag_id group by image_id;
	SQL

sources: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select s.image_id, jsonb_build_object('source_url', jsonb_agg(lower(s.source)), 'source_count', count(*)) from image_sources s group by image_id;
	SQL

hides: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select ih.image_id, jsonb_build_object('hidden_by_user_ids', jsonb_agg(ih.user_id), 'hidden_by_users', jsonb_agg(lower(u.name))) from image_hides ih inner join users u on u.id = ih.user_id group by image_id;
	SQL

downvotes: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select iv.image_id, jsonb_build_object('downvoter_ids', jsonb_agg(iv.user_id), 'downvoters', jsonb_agg(lower(u.name))) from image_votes iv inner join users u on u.id = iv.user_id where iv.up = false group by image_id;
	SQL

upvotes: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select iv.image_id, jsonb_build_object('upvoter_ids', jsonb_agg(iv.user_id), 'upvoters', jsonb_agg(lower(u.name))) from image_votes iv inner join users u on u.id = iv.user_id where iv.up = true group by image_id;
	SQL

faves: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select if.image_id, jsonb_build_object('favourited_by_user_ids', jsonb_agg(if.user_id), 'favourited_by_users', jsonb_agg(lower(u.name))) from image_faves if inner join users u on u.id = if.user_id group by image_id;
	SQL

tag_names: tags_with_aliases
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select image_id, jsonb_build_object('namespaced_tags', jsonb_build_object('name', jsonb_agg(lower(tag_name)))) from temp_images.tags_with_aliases group by image_id;
	SQL

tags_with_aliases: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		create unlogged table if not exists temp_images.tags_with_aliases (image_id bigint not null, tag_name text not null);
		truncate temp_images.tags_with_aliases;
		insert into temp_images.tags_with_aliases (image_id, tag_name) select it.image_id, t.name from image_taggings it inner join tags t on t.id = it.tag_id;
		insert into temp_images.tags_with_aliases (image_id, tag_name) select it.image_id, t.name from image_taggings it left outer join tags t on t.aliased_tag_id = it.tag_id where t.name is not null;
	SQL

image_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_images cascade;
		create schema temp_images;
		create unlogged table temp_images.image_search_json (image_id bigint not null, object jsonb not null);
		create function temp_images.wilson_995(succ bigint, fail bigint) returns double precision as '
			declare
				n double precision;
				p_hat double precision;
				z double precision;
				z2 double precision;
			begin
				if succ <= 0 then
					return 0;
				end if;

				n := succ + fail;
				p_hat := succ / n;
				z := 2.57583;
				z2 := 6.634900189;

				return (p_hat + z2 / (2 * n) - z * sqrt((p_hat * (1 - p_hat) + z2 / (4 * n)) / n)) / (1 + z2 / n);
			end
		' language plpgsql;
		create aggregate temp_images.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
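The `temp_images.wilson_995` helper above computes the lower bound of the Wilson score interval with z ≈ 2.576 (the 99.5th percentile), which is what the `wilson_score` field was populated from. For reference, a direct Elixir transcription of that formula (a sketch for illustration only; how the Elixir-side indexer derives `wilson_score` is not shown in this diff):

    defmodule WilsonSketch do
      @z 2.57583
      @z2 6.634900189

      # Lower bound of the Wilson score interval for `succ` upvotes and `fail` downvotes.
      def wilson_995(succ, _fail) when succ <= 0, do: 0.0

      def wilson_995(succ, fail) do
        n = succ + fail
        p_hat = succ / n

        (p_hat + @z2 / (2 * n) - @z * :math.sqrt((p_hat * (1 - p_hat) + @z2 / (4 * n)) / n)) /
          (1 + @z2 / n)
      end
    end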
index/posts.mk (deleted, 51 lines):

@@ -1,51 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=posts.jsonl --output=$OPENSEARCH_URL --output-index=posts --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"

dump_jsonl: metadata authors
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_posts.jsonb_object_agg(object) from temp_posts.post_search_json group by post_id) to stdout;' > posts.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_posts cascade;'
	sed -i posts.jsonl -e 's/\\\\/\\/g'

metadata: post_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_posts.post_search_json (post_id, object) select p.id, jsonb_build_object(
			'id', p.id,
			'topic_id', p.topic_id,
			'body', p.body,
			'subject', t.title,
			'ip', p.ip,
			'user_agent', '',
			'referrer', '',
			'fingerprint', p.fingerprint,
			'topic_position', p.topic_position,
			'forum', f.short_name,
			'forum_id', t.forum_id,
			'user_id', p.user_id,
			'anonymous', p.anonymous,
			'created_at', p.created_at,
			'updated_at', p.updated_at,
			'deleted', p.hidden_from_users,
			'destroyed_content', p.destroyed_content,
			'access_level', f.access_level
		) from posts p inner join topics t on t.id=p.topic_id inner join forums f on f.id=t.forum_id;
	SQL

authors: post_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_posts.post_search_json (post_id, object) select p.id, jsonb_build_object('author', (case when p.anonymous='t' then null else u.name end)) from posts p left join users u on p.user_id=u.id;
	SQL

post_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_posts cascade;
		create schema temp_posts;
		create unlogged table temp_posts.post_search_json (post_id bigint not null, object jsonb not null);
		create or replace aggregate temp_posts.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
index/reports.mk (deleted, 51 lines):

@@ -1,51 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=reports.jsonl --output=$OPENSEARCH_URL --output-index=reports --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"

dump_jsonl: metadata image_ids comment_image_ids
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_reports.jsonb_object_agg(object) from temp_reports.report_search_json group by report_id) to stdout;' > reports.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_reports cascade;'
	sed -i reports.jsonl -e 's/\\\\/\\/g'

metadata: report_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object(
			'id', r.id,
			'created_at', r.created_at,
			'ip', r.ip,
			'state', r.state,
			'user', lower(u.name),
			'user_id', r.user_id,
			'admin', lower(a.name),
			'admin_id', r.admin_id,
			'reportable_type', r.reportable_type,
			'reportable_id', r.reportable_id,
			'fingerprint', r.fingerprint,
			'open', r.open,
			'reason', r.reason
		) from reports r left join users u on r.user_id=u.id left join users a on r.admin_id=a.id;
	SQL

image_ids: report_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object('image_id', r.reportable_id) from reports r where r.reportable_type = 'Image';
	SQL

comment_image_ids: report_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object('image_id', c.image_id) from reports r inner join comments c on c.id = r.reportable_id where r.reportable_type = 'Comment';
	SQL

report_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_reports cascade;
		create schema temp_reports;
		create unlogged table temp_reports.report_search_json (report_id bigint not null, object jsonb not null);
		create or replace aggregate temp_reports.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
index/tags.mk (deleted, 54 lines):

@@ -1,54 +0,0 @@
DATABASE ?= philomena
OPENSEARCH_URL ?= http://localhost:9200/
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=tags.jsonl --output=$OPENSEARCH_URL --output-index=tags --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc); doc._id = doc.id"

dump_jsonl: metadata aliases implied_tags implied_by_tags
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'copy (select temp_tags.jsonb_object_agg(object) from temp_tags.tag_search_json group by tag_id) to stdout;' > tags.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 -c 'drop schema temp_tags cascade;'
	sed -i tags.jsonl -e 's/\\\\/\\/g'

metadata: tag_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_tags.tag_search_json (tag_id, object) select t.id, jsonb_build_object(
			'id', t.id,
			'slug', t.slug,
			'name', t.name,
			'name_in_namespace', t.name_in_namespace,
			'namespace', t.namespace,
			'analyzed_name', t.name,
			'aliased_tag', at.name,
			'category', t.category,
			'aliased', (t.aliased_tag_id is not null),
			'description', t.description,
			'short_description', t.short_description
		) from tags t left join tags at on t.aliased_tag_id=at.id;
	SQL

aliases: tag_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_tags.tag_search_json (tag_id, object) select t.aliased_tag_id, jsonb_build_object('aliases', jsonb_agg(t.name)) from tags t inner join tags at on t.aliased_tag_id=t.id group by t.aliased_tag_id;
	SQL

implied_tags: tag_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_tags.tag_search_json (tag_id, object) select it.tag_id, jsonb_build_object('implied_tag_ids', jsonb_agg(it.implied_tag_id), 'implied_tags', jsonb_agg(t.name)) from tags_implied_tags it inner join tags t on t.id=it.implied_tag_id group by it.tag_id;
	SQL

implied_by_tags: tag_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_tags.tag_search_json (tag_id, object) select it.implied_tag_id, jsonb_build_object('implied_by_tags', jsonb_agg(t.name)) from tags_implied_tags it inner join tags t on t.id=it.tag_id group by it.implied_tag_id;
	SQL

tag_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_tags cascade;
		create schema temp_tags;
		create unlogged table temp_tags.tag_search_json (tag_id bigint not null, object jsonb not null);
		create or replace aggregate temp_tags.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
Mix.Tasks.ReindexAll:

@@ -1,31 +1,7 @@
 defmodule Mix.Tasks.ReindexAll do
   use Mix.Task
 
-  alias PhilomenaQuery.Search
-
-  alias Philomena.{
-    Comments.Comment,
-    Galleries.Gallery,
-    Posts.Post,
-    Images.Image,
-    Reports.Report,
-    Tags.Tag,
-    Filters.Filter
-  }
-
-  alias Philomena.{Comments, Galleries, Posts, Images, Tags, Filters}
-  alias Philomena.Polymorphic
-  alias Philomena.Repo
-  import Ecto.Query
-
-  @indices [
-    {Images, Image},
-    {Comments, Comment},
-    {Galleries, Gallery},
-    {Tags, Tag},
-    {Posts, Post},
-    {Filters, Filter}
-  ]
+  alias Philomena.SearchIndexer
 
   @shortdoc "Destroys and recreates all OpenSearch indices."
   @requirements ["app.start"]

@@ -35,26 +11,6 @@ defmodule Mix.Tasks.ReindexAll do
       raise "do not run this task unless you know what you're doing"
     end
 
-    @indices
-    |> Enum.map(fn {context, schema} ->
-      Task.async(fn ->
-        Search.delete_index!(schema)
-        Search.create_index!(schema)
-
-        Search.reindex(preload(schema, ^context.indexing_preloads()), schema)
-      end)
-    end)
-    |> Task.await_many(:infinity)
-
-    # Reports are a bit special
-
-    Search.delete_index!(Report)
-    Search.create_index!(Report)
-
-    Report
-    |> preload([:user, :admin])
-    |> Repo.all()
-    |> Polymorphic.load_polymorphic(reportable: [reportable_id: :reportable_type])
-    |> Enum.map(&Search.index_document(&1, Report))
+    SearchIndexer.recreate_reindex_all_destructive!()
   end
 end
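After these two hunks the Mix task is reduced to a thin wrapper: the safety check stays, and the actual work is delegated to `SearchIndexer`. Assuming the surrounding `run/1` function and its guard are otherwise unchanged (only the `raise` and closing `end` are visible as hunk context here), the remaining body would read roughly as follows, with `confirmed?/1` standing in as a hypothetical placeholder for whatever guard condition wraps the raise:

    def run(args) do
      # confirmed?/1 is a hypothetical stand-in; the real guard condition is outside this hunk.
      unless confirmed?(args) do
        raise "do not run this task unless you know what you're doing"
      end

      SearchIndexer.recreate_reindex_all_destructive!()
    end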
lib/philomena/search_indexer.ex (new file, 134 lines):

@@ -0,0 +1,134 @@
defmodule Philomena.SearchIndexer do
  alias PhilomenaQuery.Batch
  alias PhilomenaQuery.Search

  alias Philomena.Comments
  alias Philomena.Comments.Comment
  alias Philomena.Filters
  alias Philomena.Filters.Filter
  alias Philomena.Galleries
  alias Philomena.Galleries.Gallery
  alias Philomena.Images
  alias Philomena.Images.Image
  alias Philomena.Posts
  alias Philomena.Posts.Post
  alias Philomena.Reports
  alias Philomena.Reports.Report
  alias Philomena.Tags
  alias Philomena.Tags.Tag

  alias Philomena.Polymorphic
  import Ecto.Query

  @schemas [
    Comment,
    Filter,
    Gallery,
    Image,
    Post,
    Report,
    Tag
  ]

  @contexts %{
    Comment => Comments,
    Filter => Filters,
    Gallery => Galleries,
    Image => Images,
    Post => Posts,
    Report => Reports,
    Tag => Tags
  }

  @doc """
  Recreate the index corresponding to all schemas, and then reindex all of the
  documents within.

  ## Example

      iex> SearchIndexer.recreate_reindex_all_destructive!()
      :ok

  """
  @spec recreate_reindex_all_destructive! :: :ok
  def recreate_reindex_all_destructive! do
    @schemas
    |> Task.async_stream(
      &recreate_reindex_schema_destructive!/1,
      ordered: false,
      timeout: :infinity
    )
    |> Stream.run()
  end

  @doc """
  Recreate the index corresponding to a schema, and then reindex all of the
  documents within the schema.

  ## Example

      iex> SearchIndexer.recreate_reindex_schema_destructive!(Report)
      :ok

  """
  @spec recreate_reindex_schema_destructive!(schema :: module()) :: :ok
  def recreate_reindex_schema_destructive!(schema) when schema in @schemas do
    Search.delete_index!(schema)
    Search.create_index!(schema)

    reindex_schema(schema)
  end

  @doc """
  Reindex all of the documents within all schemas.

  ## Example

      iex> SearchIndexer.reindex_all()
      :ok

  """
  @spec reindex_all :: :ok
  def reindex_all do
    @schemas
    |> Task.async_stream(
      &reindex_schema/1,
      ordered: false,
      timeout: :infinity
    )
    |> Stream.run()
  end

  @doc """
  Reindex all of the documents within a single schema.

  ## Example

      iex> SearchIndexer.reindex_schema(Report)
      :ok

  """
  @spec reindex_schema(schema :: module()) :: :ok
  def reindex_schema(schema)

  def reindex_schema(Report) do
    # Reports currently require handling for their polymorphic nature
    Report
    |> preload([:user, :admin])
    |> Batch.record_batches()
    |> Enum.each(fn records ->
      records
      |> Polymorphic.load_polymorphic(reportable: [reportable_id: :reportable_type])
      |> Enum.map(&Search.index_document(&1, Report))
    end)
  end

  def reindex_schema(schema) when schema in @schemas do
    # Normal schemas can simply be reindexed with indexing_preloads
    context = Map.fetch!(@contexts, schema)

    schema
    |> preload(^context.indexing_preloads())
    |> Search.reindex(schema)
  end
end
PhilomenaQuery.Search (reindex/3):

@@ -203,9 +203,8 @@ defmodule PhilomenaQuery.Search do
   def reindex(queryable, module, opts \\ []) do
     index = @policy.index_for(module)
 
-    queryable
-    |> Batch.record_batches(opts)
-    |> Enum.each(fn records ->
+    process =
+      fn records ->
       lines =
         Enum.flat_map(records, fn record ->
           doc = index.as_json(record)

@@ -217,7 +216,12 @@
         end)
 
       Api.bulk(@policy.opensearch_url(), lines)
-    end)
+      end
+
+    queryable
+    |> Batch.record_batches(opts)
+    |> Task.async_stream(process, ordered: false, timeout: :infinity)
+    |> Stream.run()
   end
 
   @doc ~S"""
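The net effect of these two hunks is that `reindex/3` no longer uploads batches sequentially: the batch-processing closure is bound to `process`, and the stream from `Batch.record_batches/2` is drained through `Task.async_stream/3` with `ordered: false` and `timeout: :infinity`, so several bulk requests to OpenSearch can be in flight at once. The same pattern in isolation, as a self-contained sketch with a stand-in uploader rather than the library code itself:

    batches = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

    process = fn records ->
      # Stand-in for building bulk lines and posting them to OpenSearch.
      IO.puts("uploading batch of #{length(records)} records")
    end

    batches
    |> Task.async_stream(process, ordered: false, timeout: :infinity)
    |> Stream.run()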