add fast reindexer

byte[] 2020-07-12 14:03:30 -04:00
parent 4e70124f36
commit 0ecfa6c6c0
9 changed files with 402 additions and 1 deletion

3
.gitignore vendored

@@ -41,3 +41,6 @@ npm-debug.log
 
 # Intellij IDEA
 .idea
+
+# Index dumps
+*.jsonl

22
index/all.mk Normal file

@@ -0,0 +1,22 @@
all: comments galleries images posts reports tags

comments:
	$(MAKE) -f comments.mk

galleries:
	$(MAKE) -f galleries.mk

images:
	$(MAKE) -f images.mk

posts:
	$(MAKE) -f posts.mk

reports:
	$(MAKE) -f reports.mk

tags:
	$(MAKE) -f tags.mk

clean:
	rm -f ./*.jsonl
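
These makefiles are meant to be run from the index/ directory against a live database and a local Elasticsearch, as the defaults above imply. A usage sketch (philomena_dev is a hypothetical override, not a value defined here):

$ cd index
$ make -f all.mk                          # rebuild every index
$ make -f all.mk images                   # rebuild a single index
$ make -f all.mk DATABASE=philomena_dev   # point the dumps at another database
$ make -f all.mk clean                    # delete the intermediate *.jsonl dumps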

48
index/comments.mk Normal file

@@ -0,0 +1,48 @@
DATABASE ?= philomena
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=comments.jsonl --output=http://localhost:9200/ --output-index=comments --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)"

dump_jsonl: metadata authors tags
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_comments.jsonb_object_agg(object) from temp_comments.comment_search_json group by comment_id) to stdout;' > comments.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_comments cascade;'
	sed -i comments.jsonl -e 's/\\\\/\\/g'

metadata: comment_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object(
			'id', c.id,
			'posted_at', c.created_at,
			'ip', c.ip,
			'fingerprint', c.fingerprint,
			'image_id', c.image_id,
			'user_id', c.user_id,
			'anonymous', c.anonymous,
			'body', c.body,
			'hidden_from_users', (c.hidden_from_users or i.hidden_from_users)
		) from comments c inner join images i on c.image_id=i.id;
	SQL

authors: comment_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object('author', (case when c.anonymous='t' then null else u.name end)) from comments c left join users u on c.user_id=u.id;
	SQL

tags: comment_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		create unlogged table temp_comments.image_tags (image_id bigint not null, tags jsonb not null);
		insert into temp_comments.image_tags (image_id, tags) select it.image_id, jsonb_agg(it.tag_id) from image_taggings it group by it.image_id;
		insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object('image_tag_ids', it.tags) from comments c inner join temp_comments.image_tags it on c.image_id=it.image_id;
	SQL

comment_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_comments cascade;
		create schema temp_comments;
		create unlogged table temp_comments.comment_search_json (comment_id bigint not null, object jsonb not null);
		create or replace aggregate temp_comments.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
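
Each prerequisite target above inserts partial JSON objects keyed by comment_id; dump_jsonl then merges every comment's fragments into one document with the custom jsonb_object_agg aggregate (a fold over jsonb_concat starting from '{}'), and the trailing sed pass collapses the backslash doubling that COPY ... TO STDOUT applies to the JSON text. A minimal sketch of the merge semantics, assuming the temp_comments schema created by the comment_search_json target is still in place:

$ psql philomena <<'EOSQL'
select temp_comments.jsonb_object_agg(o) from (values
    ('{"id": 1, "body": "hello"}'::jsonb),
    ('{"author": "Admin"}'::jsonb)
) fragments (o);
EOSQL

which yields the single merged document {"id": 1, "body": "hello", "author": "Admin"}.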

44
index/galleries.mk Normal file

@@ -0,0 +1,44 @@
DATABASE ?= philomena
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=galleries.jsonl --output=http://localhost:9200/ --output-index=galleries --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)"

dump_jsonl: metadata subscribers images
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_galleries.jsonb_object_agg(object) from temp_galleries.gallery_search_json group by gallery_id) to stdout;' > galleries.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_galleries cascade;'
	sed -i galleries.jsonl -e 's/\\\\/\\/g'

metadata: gallery_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_galleries.gallery_search_json (gallery_id, object) select g.id, jsonb_build_object(
			'id', g.id,
			'image_count', g.image_count,
			'updated_at', g.updated_at,
			'created_at', g.created_at,
			'title', lower(g.title),
			'creator', lower(u.name),
			'description', g.description
		) from galleries g left join users u on g.creator_id=u.id;
	SQL

subscribers: gallery_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_galleries.gallery_search_json (gallery_id, object) select gallery_id, json_build_object('watcher_ids', jsonb_agg(user_id), 'watcher_count', count(*)) from gallery_subscriptions group by gallery_id;
	SQL

images: gallery_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_galleries.gallery_search_json (gallery_id, object) select gallery_id, json_build_object('image_ids', jsonb_agg(image_id)) from gallery_interactions group by gallery_id;
	SQL

gallery_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_galleries cascade;
		create schema temp_galleries;
		create unlogged table temp_galleries.gallery_search_json (gallery_id bigint not null, object jsonb not null);
		create or replace aggregate temp_galleries.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
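
A quick way to sanity-check an import is to compare the Elasticsearch document count with the source table (a sketch, assuming the defaults above):

$ curl -s 'http://localhost:9200/galleries/_count?pretty'
$ psql philomena -tAc 'select count(*) from galleries;'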

132
index/images.mk Normal file

@@ -0,0 +1,132 @@
DATABASE ?= philomena
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=images.jsonl --output=http://localhost:9200/ --output-index=images --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)"

dump_jsonl: metadata true_uploaders uploaders deleters galleries tags hides upvotes downvotes faves tag_names
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_images.jsonb_object_agg(object) from temp_images.image_search_json group by image_id) to stdout;' > images.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_images cascade;'
	sed -i images.jsonl -e 's/\\\\/\\/g'

metadata: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select id, jsonb_build_object(
			'anonymous', anonymous,
			'aspect_ratio', nullif(image_aspect_ratio, 'NaN'::float8),
			'comment_count', comments_count,
			'created_at', created_at,
			'deletion_reason', deletion_reason,
			'description', description,
			'downvotes', downvotes_count,
			'duplicate_id', duplicate_id,
			'faves', faves_count,
			'file_name', image_name,
			'fingerprint', fingerprint,
			'first_seen_at', first_seen_at,
			'height', image_height,
			'hidden_from_users', hidden_from_users,
			'id', id,
			'ip', ip,
			'mime_type', image_mime_type,
			'orig_sha512_hash', image_orig_sha512_hash,
			'original_format', image_format,
			'pixels', cast(image_width as bigint)*cast(image_height as bigint),
			'score', score,
			'size', image_size,
			'sha512_hash', image_sha512_hash,
			'source_url', lower(source_url),
			'updated_at', updated_at,
			'upvotes', upvotes_count,
			'width', image_width,
			'wilson_score', temp_images.wilson_995(upvotes_count, downvotes_count)
		) from images;
	SQL

true_uploaders: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('true_uploader_id', u.id, 'true_uploader', u.name) from images i left join users u on u.id = i.user_id;
	SQL

uploaders: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('uploader_id', (case when i.anonymous = 't' then null else u.id end), 'uploader', (case when i.anonymous = 't' then null else lower(u.name) end)) from images i left join users u on u.id = i.user_id;
	SQL

deleters: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('deleted_by_user_id', u.id, 'deleted_by_user', lower(u.name)) from images i left join users u on u.id = i.deleted_by_id;
	SQL

galleries: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select gi.image_id, jsonb_build_object('gallery_interactions', jsonb_agg(jsonb_build_object('gallery_id', gi.gallery_id, 'position', gi.position))) from gallery_interactions gi group by image_id;
	SQL

tags: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select it.image_id, jsonb_build_object('tag_ids', jsonb_agg(it.tag_id), 'tag_count', count(*)) from image_taggings it group by image_id;
	SQL

hides: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select ih.image_id, jsonb_build_object('hidden_by_ids', jsonb_agg(ih.user_id), 'hidden_by', jsonb_agg(lower(u.name))) from image_hides ih inner join users u on u.id = ih.user_id group by image_id;
	SQL

downvotes: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select iv.image_id, jsonb_build_object('downvoted_by_ids', jsonb_agg(iv.user_id), 'downvoted_by', jsonb_agg(lower(u.name))) from image_votes iv inner join users u on u.id = iv.user_id where iv.up = false group by image_id;
	SQL

upvotes: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select iv.image_id, jsonb_build_object('upvoted_by_ids', jsonb_agg(iv.user_id), 'upvoted_by', jsonb_agg(lower(u.name))) from image_votes iv inner join users u on u.id = iv.user_id where iv.up = true group by image_id;
	SQL

faves: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select if.image_id, jsonb_build_object('faved_by_ids', jsonb_agg(if.user_id), 'faved_by', jsonb_agg(lower(u.name))) from image_faves if inner join users u on u.id = if.user_id group by image_id;
	SQL

tag_names: tags_with_aliases
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_images.image_search_json (image_id, object) select image_id, jsonb_build_object('namespaced_tags', jsonb_build_object('name', jsonb_agg(lower(tag_name)))) from temp_images.tags_with_aliases group by image_id;
	SQL

tags_with_aliases: image_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		create unlogged table if not exists temp_images.tags_with_aliases (image_id bigint not null, tag_name text not null);
		truncate temp_images.tags_with_aliases;
		insert into temp_images.tags_with_aliases (image_id, tag_name) select it.image_id, t.name from image_taggings it inner join tags t on t.id = it.tag_id;
		insert into temp_images.tags_with_aliases (image_id, tag_name) select it.image_id, t.name from image_taggings it left outer join tags t on t.aliased_tag_id = it.tag_id where t.name is not null;
	SQL

image_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_images cascade;
		create schema temp_images;
		create unlogged table temp_images.image_search_json (image_id bigint not null, object jsonb not null);
		create function temp_images.wilson_995(succ bigint, fail bigint) returns double precision as '
			declare
				n double precision;
				p_hat double precision;
				z double precision;
				z2 double precision;
			begin
				if succ <= 0 then
					return 0;
				end if;

				n := succ + fail;
				p_hat := succ / n;
				z := 2.57583;
				z2 := 6.634900189;

				return (p_hat + z2 / (2 * n) - z * sqrt((p_hat * (1 - p_hat) + z2 / (4 * n)) / n)) / (1 + z2 / n);
			end
		' language plpgsql;
		create aggregate temp_images.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
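
The wilson_995 helper computes the lower bound of the Wilson score confidence interval on the upvote fraction, with z ≈ 2.57583 (so z² ≈ 6.6349, roughly a 99.5% one-sided level). Writing n = upvotes + downvotes and p̂ = upvotes / n, the returned value is

\[ w = \frac{\hat{p} + \frac{z^2}{2n} - z \sqrt{\frac{\hat{p}(1 - \hat{p}) + \frac{z^2}{4n}}{n}}}{1 + \frac{z^2}{n}} \]

which ranks images by how confidently the vote sample supports a high true upvote ratio, rather than by the raw score.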

49
index/posts.mk Normal file

@@ -0,0 +1,49 @@
DATABASE ?= philomena
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=posts.jsonl --output=http://localhost:9200/ --output-index=posts --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)"

dump_jsonl: metadata authors
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_posts.jsonb_object_agg(object) from temp_posts.post_search_json group by post_id) to stdout;' > posts.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_posts cascade;'
	sed -i posts.jsonl -e 's/\\\\/\\/g'

metadata: post_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_posts.post_search_json (post_id, object) select p.id, jsonb_build_object(
			'id', p.id,
			'topic_id', p.topic_id,
			'body', p.body,
			'subject', t.title,
			'ip', p.ip,
			'user_agent', p.user_agent,
			'referrer', p.referrer,
			'fingerprint', p.fingerprint,
			'topic_position', p.topic_position,
			'forum_id', t.forum_id,
			'user_id', p.user_id,
			'anonymous', p.anonymous,
			'created_at', p.created_at,
			'updated_at', p.updated_at,
			'deleted', p.hidden_from_users,
			'destroyed_content', p.destroyed_content,
			'access_level', f.access_level
		) from posts p inner join topics t on t.id=p.topic_id inner join forums f on f.id=t.forum_id;
	SQL

authors: post_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_posts.post_search_json (post_id, object) select p.id, jsonb_build_object('author', (case when p.anonymous='t' then null else u.name end)) from posts p left join users u on p.user_id=u.id;
	SQL

post_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_posts cascade;
		create schema temp_posts;
		create unlogged table temp_posts.post_search_json (post_id bigint not null, object jsonb not null);
		create or replace aggregate temp_posts.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
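
One portability note: the <<< here-strings in these recipes are a bash feature, while make hands recipes to /bin/sh by default. On systems where /bin/sh is not bash, overriding the shell at invocation time is a minimal workaround (a sketch, not something the makefiles set themselves):

$ make -f posts.mk SHELL=/bin/bash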

50
index/reports.mk Normal file

@@ -0,0 +1,50 @@
DATABASE ?= philomena
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=reports.jsonl --output=http://localhost:9200/ --output-index=reports --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)"

dump_jsonl: metadata image_ids comment_image_ids
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_reports.jsonb_object_agg(object) from temp_reports.report_search_json group by report_id) to stdout;' > reports.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_reports cascade;'
	sed -i reports.jsonl -e 's/\\\\/\\/g'

metadata: report_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object(
			'id', r.id,
			'created_at', r.created_at,
			'ip', r.ip,
			'state', r.state,
			'user', lower(u.name),
			'user_id', r.user_id,
			'admin', lower(a.name),
			'admin_id', r.admin_id,
			'reportable_type', r.reportable_type,
			'reportable_id', r.reportable_id,
			'fingerprint', r.fingerprint,
			'open', r.open,
			'reason', r.reason
		) from reports r left join users u on r.user_id=u.id left join users a on r.admin_id=a.id;
	SQL

image_ids: report_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object('image_id', r.reportable_id) from reports r where r.reportable_type = 'Image';
	SQL

comment_image_ids: report_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object('image_id', c.image_id) from reports r inner join comments c on c.id = r.reportable_id where r.reportable_type = 'Comment';
	SQL

report_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_reports cascade;
		create schema temp_reports;
		create unlogged table temp_reports.report_search_json (report_id bigint not null, object jsonb not null);
		create or replace aggregate temp_reports.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
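
Reports are polymorphic (reportable_type plus reportable_id), so the image_ids and comment_image_ids targets denormalize an image_id onto both image reports and reports on comments, letting either kind be filtered by image in the index. A quick spot-check of the source distribution:

$ psql philomena -tAc 'select reportable_type, count(*) from reports group by reportable_type;'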

53
index/tags.mk Normal file

@@ -0,0 +1,53 @@
DATABASE ?= philomena
ELASTICDUMP ?= elasticdump
.ONESHELL:

all: import_es

import_es: dump_jsonl
	$(ELASTICDUMP) --input=tags.jsonl --output=http://localhost:9200/ --output-index=tags --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)"

dump_jsonl: metadata aliases implied_tags implied_by_tags
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_tags.jsonb_object_agg(object) from temp_tags.tag_search_json group by tag_id) to stdout;' > tags.jsonl
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_tags cascade;'
	sed -i tags.jsonl -e 's/\\\\/\\/g'

metadata: tag_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_tags.tag_search_json (tag_id, object) select t.id, jsonb_build_object(
			'id', t.id,
			'slug', t.slug,
			'name', t.name,
			'name_in_namespace', t.name_in_namespace,
			'namespace', t.namespace,
			'analyzed_name', t.name,
			'aliased_tag', at.name,
			'category', t.category,
			'aliased', (t.aliased_tag_id is not null),
			'description', t.description,
			'short_description', t.short_description
		) from tags t left join tags at on t.aliased_tag_id=at.id;
	SQL

aliases: tag_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_tags.tag_search_json (tag_id, object) select t.aliased_tag_id, jsonb_build_object('aliases', jsonb_agg(t.name)) from tags t inner join tags at on t.aliased_tag_id=at.id group by t.aliased_tag_id;
	SQL

implied_tags: tag_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_tags.tag_search_json (tag_id, object) select it.tag_id, jsonb_build_object('implied_tag_ids', jsonb_agg(it.implied_tag_id), 'implied_tags', jsonb_agg(t.name)) from tags_implied_tags it inner join tags t on t.id=it.implied_tag_id group by it.tag_id;
	SQL

implied_by_tags: tag_search_json
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		insert into temp_tags.tag_search_json (tag_id, object) select it.implied_tag_id, jsonb_build_object('implied_by_tags', jsonb_agg(t.name)) from tags_implied_tags it inner join tags t on t.id=it.tag_id group by it.implied_tag_id;
	SQL

tag_search_json:
	psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL
		drop schema if exists temp_tags cascade;
		create schema temp_tags;
		create unlogged table temp_tags.tag_search_json (tag_id bigint not null, object jsonb not null);
		create or replace aggregate temp_tags.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}');
	SQL
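
The aliases target folds every alias name onto its canonical tag (the join condition is t.aliased_tag_id=at.id; the original had the self-referential t.aliased_tag_id=t.id, which matches nothing useful). A dangling aliased_tag_id would silently drop aliases, so a sanity-check sketch over the same tags table used above:

$ psql philomena <<'EOSQL'
select count(*) as dangling_aliases from tags t
left join tags canonical on canonical.id = t.aliased_tag_id
where t.aliased_tag_id is not null and canonical.id is null;
EOSQL

The expected result is zero.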

2
lib/mix/tasks/reindex_all.ex

@@ -19,7 +19,7 @@ defmodule Mix.Tasks.ReindexAll do
   @shortdoc "Destroys and recreates all Elasticsearch indices."
   def run(_) do
-    if Mix.env() == :prod do
+    if Mix.env() == :prod and not Enum.member?(System.argv(), "--i-know-what-im-doing") do
       raise "do not run this task in production"
     end
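
With the added guard, the destructive reindex task refuses to run in production unless the escape hatch is passed explicitly. Assuming the conventional Mix task name derived from the module:

$ MIX_ENV=prod mix reindex_all --i-know-what-im-doing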