From 0ecfa6c6c01caccc725bc5a585463cb3350c89a6 Mon Sep 17 00:00:00 2001 From: "byte[]" Date: Sun, 12 Jul 2020 14:03:30 -0400 Subject: [PATCH] add fast reindexer --- .gitignore | 3 + index/all.mk | 22 ++++++ index/comments.mk | 48 +++++++++++++ index/galleries.mk | 44 ++++++++++++ index/images.mk | 132 +++++++++++++++++++++++++++++++++++ index/posts.mk | 49 +++++++++++++ index/reports.mk | 50 +++++++++++++ index/tags.mk | 53 ++++++++++++++ lib/mix/tasks/reindex_all.ex | 2 +- 9 files changed, 402 insertions(+), 1 deletion(-) create mode 100644 index/all.mk create mode 100644 index/comments.mk create mode 100644 index/galleries.mk create mode 100644 index/images.mk create mode 100644 index/posts.mk create mode 100644 index/reports.mk create mode 100644 index/tags.mk diff --git a/.gitignore b/.gitignore index cce349ba..37f47d92 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,6 @@ npm-debug.log # Intellij IDEA .idea + +# Index dumps +*.jsonl diff --git a/index/all.mk b/index/all.mk new file mode 100644 index 00000000..a249b692 --- /dev/null +++ b/index/all.mk @@ -0,0 +1,22 @@ +all: comments galleries images posts reports tags + +comments: + $(MAKE) -f comments.mk + +galleries: + $(MAKE) -f galleries.mk + +images: + $(MAKE) -f images.mk + +posts: + $(MAKE) -f posts.mk + +reports: + $(MAKE) -f reports.mk + +tags: + $(MAKE) -f tags.mk + +clean: + rm -f ./*.jsonl diff --git a/index/comments.mk b/index/comments.mk new file mode 100644 index 00000000..0d55d20d --- /dev/null +++ b/index/comments.mk @@ -0,0 +1,48 @@ +DATABASE ?= philomena +ELASTICDUMP ?= elasticdump +.ONESHELL: + +all: import_es + +import_es: dump_jsonl + $(ELASTICDUMP) --input=comments.jsonl --output=http://localhost:9200/ --output-index=comments --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)" + +dump_jsonl: metadata authors tags + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_comments.jsonb_object_agg(object) from 
temp_comments.comment_search_json group by comment_id) to stdout;' > comments.jsonl + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_comments cascade;' + sed -i comments.jsonl -e 's/\\\\/\\/g' + +metadata: comment_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object( + 'id', c.id, + 'posted_at', c.created_at, + 'ip', c.ip, + 'fingerprint', c.fingerprint, + 'image_id', c.image_id, + 'user_id', c.user_id, + 'anonymous', c.anonymous, + 'body', c.body, + 'hidden_from_users', (c.hidden_from_users or i.hidden_from_users) + ) from comments c inner join images i on c.image_id=i.id; + SQL + +authors: comment_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object('author', (case when c.anonymous='t' then null else u.name end)) from comments c left join users u on c.user_id=u.id; + SQL + +tags: comment_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + create unlogged table temp_comments.image_tags (image_id bigint not null, tags jsonb not null); + insert into temp_comments.image_tags (image_id, tags) select it.image_id, jsonb_agg(it.tag_id) from image_taggings it group by it.image_id; + insert into temp_comments.comment_search_json (comment_id, object) select c.id, jsonb_build_object('image_tag_ids', it.tags) from comments c inner join temp_comments.image_tags it on c.image_id=it.image_id; + SQL + +comment_search_json: + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + drop schema if exists temp_comments cascade; + create schema temp_comments; + create unlogged table temp_comments.comment_search_json (comment_id bigint not null, object jsonb not null); + create or replace aggregate temp_comments.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}'); + SQL diff --git a/index/galleries.mk b/index/galleries.mk new file mode 100644 
index 00000000..b6155080 --- /dev/null +++ b/index/galleries.mk @@ -0,0 +1,44 @@ +DATABASE ?= philomena +ELASTICDUMP ?= elasticdump +.ONESHELL: + +all: import_es + +import_es: dump_jsonl + $(ELASTICDUMP) --input=galleries.jsonl --output=http://localhost:9200/ --output-index=galleries --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)" + +dump_jsonl: metadata subscribers images + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_galleries.jsonb_object_agg(object) from temp_galleries.gallery_search_json group by gallery_id) to stdout;' > galleries.jsonl + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_galleries cascade;' + sed -i galleries.jsonl -e 's/\\\\/\\/g' + +metadata: gallery_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_galleries.gallery_search_json (gallery_id, object) select g.id, jsonb_build_object( + 'id', g.id, + 'image_count', g.image_count, + 'updated_at', g.updated_at, + 'created_at', g.created_at, + 'title', lower(g.title), + 'creator', lower(u.name), + 'description', g.description + ) from galleries g left join users u on g.creator_id=u.id; + SQL + +subscribers: gallery_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_galleries.gallery_search_json (gallery_id, object) select gallery_id, jsonb_build_object('watcher_ids', jsonb_agg(user_id), 'watcher_count', count(*)) from gallery_subscriptions group by gallery_id; + SQL + +images: gallery_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_galleries.gallery_search_json (gallery_id, object) select gallery_id, jsonb_build_object('image_ids', jsonb_agg(image_id)) from gallery_interactions group by gallery_id; + SQL + +gallery_search_json: + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + drop schema if exists temp_galleries cascade; + create schema temp_galleries; + create unlogged table temp_galleries.gallery_search_json (gallery_id bigint not null, object jsonb not 
null); + create or replace aggregate temp_galleries.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}'); + SQL diff --git a/index/images.mk b/index/images.mk new file mode 100644 index 00000000..cddef990 --- /dev/null +++ b/index/images.mk @@ -0,0 +1,132 @@ +DATABASE ?= philomena +ELASTICDUMP ?= elasticdump +.ONESHELL: + +all: import_es + +import_es: dump_jsonl + $(ELASTICDUMP) --input=images.jsonl --output=http://localhost:9200/ --output-index=images --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)" + +dump_jsonl: metadata true_uploaders uploaders deleters galleries tags hides upvotes downvotes faves tag_names + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_images.jsonb_object_agg(object) from temp_images.image_search_json group by image_id) to stdout;' > images.jsonl + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_images cascade;' + sed -i images.jsonl -e 's/\\\\/\\/g' + +metadata: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select id, jsonb_build_object( + 'anonymous', anonymous, + 'aspect_ratio', nullif(image_aspect_ratio, 'NaN'::float8), + 'comment_count', comments_count, + 'created_at', created_at, + 'deletion_reason', deletion_reason, + 'description', description, + 'downvotes', downvotes_count, + 'duplicate_id', duplicate_id, + 'faves', faves_count, + 'file_name', image_name, + 'fingerprint', fingerprint, + 'first_seen_at', first_seen_at, + 'height', image_height, + 'hidden_from_users', hidden_from_users, + 'id', id, + 'ip', ip, + 'mime_type', image_mime_type, + 'orig_sha512_hash', image_orig_sha512_hash, + 'original_format', image_format, + 'pixels', cast(image_width as bigint)*cast(image_height as bigint), + 'score', score, + 'size', image_size, + 'sha512_hash', image_sha512_hash, + 'source_url', lower(source_url), + 'updated_at', updated_at, + 'upvotes', upvotes_count, + 'width', 
image_width, + 'wilson_score', temp_images.wilson_995(upvotes_count, downvotes_count) + ) from images; + SQL + +true_uploaders: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('true_uploader_id', u.id, 'true_uploader', u.name) from images i left join users u on u.id = i.user_id; + SQL + +uploaders: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('uploader_id', (case when i.anonymous = 't' then null else u.id end), 'uploader', (case when i.anonymous = 't' then null else lower(u.name) end)) from images i left join users u on u.id = i.user_id; + SQL + +deleters: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select i.id, jsonb_build_object('deleted_by_user_id', u.id, 'deleted_by_user', lower(u.name)) from images i left join users u on u.id = i.deleted_by_id; + SQL + +galleries: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select gi.image_id, jsonb_build_object('gallery_interactions', jsonb_agg(jsonb_build_object('gallery_id', gi.gallery_id, 'position', gi.position))) from gallery_interactions gi group by image_id; + SQL + +tags: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select it.image_id, jsonb_build_object('tag_ids', jsonb_agg(it.tag_id), 'tag_count', count(*)) from image_taggings it group by image_id; + SQL + +hides: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select ih.image_id, jsonb_build_object('hidden_by_ids', jsonb_agg(ih.user_id), 'hidden_by', jsonb_agg(lower(u.name))) from image_hides ih inner join users u on u.id = 
ih.user_id group by image_id; + SQL + +downvotes: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select iv.image_id, jsonb_build_object('downvoted_by_ids', jsonb_agg(iv.user_id), 'downvoted_by', jsonb_agg(lower(u.name))) from image_votes iv inner join users u on u.id = iv.user_id where iv.up = false group by image_id; + SQL + +upvotes: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select iv.image_id, jsonb_build_object('upvoted_by_ids', jsonb_agg(iv.user_id), 'upvoted_by', jsonb_agg(lower(u.name))) from image_votes iv inner join users u on u.id = iv.user_id where iv.up = true group by image_id; + SQL + +faves: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select if.image_id, jsonb_build_object('faved_by_ids', jsonb_agg(if.user_id), 'faved_by', jsonb_agg(lower(u.name))) from image_faves if inner join users u on u.id = if.user_id group by image_id; + SQL + +tag_names: tags_with_aliases + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_images.image_search_json (image_id, object) select image_id, jsonb_build_object('namespaced_tags', jsonb_build_object('name', jsonb_agg(lower(tag_name)))) from temp_images.tags_with_aliases group by image_id; + SQL + +tags_with_aliases: image_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + create unlogged table if not exists temp_images.tags_with_aliases (image_id bigint not null, tag_name text not null); + truncate temp_images.tags_with_aliases; + insert into temp_images.tags_with_aliases (image_id, tag_name) select it.image_id, t.name from image_taggings it inner join tags t on t.id = it.tag_id; + insert into temp_images.tags_with_aliases (image_id, tag_name) select it.image_id, t.name from image_taggings it left outer join tags t on t.aliased_tag_id = it.tag_id where t.name 
is not null; + SQL + +image_search_json: + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + drop schema if exists temp_images cascade; + create schema temp_images; + create unlogged table temp_images.image_search_json (image_id bigint not null, object jsonb not null); + create function temp_images.wilson_995(succ bigint, fail bigint) returns double precision as ' + declare + n double precision; + p_hat double precision; + z double precision; + z2 double precision; + begin + if succ <= 0 then + return 0; + end if; + + n := succ + fail; + p_hat := succ / n; + z := 2.57583; + z2 := 6.634900189; + + return (p_hat + z2 / (2 * n) - z * sqrt((p_hat * (1 - p_hat) + z2 / (4 * n)) / n)) / (1 + z2 / n); + end + ' language plpgsql; + create aggregate temp_images.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}'); + SQL diff --git a/index/posts.mk b/index/posts.mk new file mode 100644 index 00000000..1f0b1b00 --- /dev/null +++ b/index/posts.mk @@ -0,0 +1,49 @@ +DATABASE ?= philomena +ELASTICDUMP ?= elasticdump +.ONESHELL: + +all: import_es + +import_es: dump_jsonl + $(ELASTICDUMP) --input=posts.jsonl --output=http://localhost:9200/ --output-index=posts --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)" + +dump_jsonl: metadata authors + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_posts.jsonb_object_agg(object) from temp_posts.post_search_json group by post_id) to stdout;' > posts.jsonl + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_posts cascade;' + sed -i posts.jsonl -e 's/\\\\/\\/g' + +metadata: post_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_posts.post_search_json (post_id, object) select p.id, jsonb_build_object( + 'id', p.id, + 'topic_id', p.topic_id, + 'body', p.body, + 'subject', t.title, + 'ip', p.ip, + 'user_agent', p.user_agent, + 'referrer', p.referrer, + 'fingerprint', p.fingerprint, + 'topic_position', p.topic_position, + 'forum_id', 
t.forum_id, + 'user_id', p.user_id, + 'anonymous', p.anonymous, + 'created_at', p.created_at, + 'updated_at', p.updated_at, + 'deleted', p.hidden_from_users, + 'destroyed_content', p.destroyed_content, + 'access_level', f.access_level + ) from posts p inner join topics t on t.id=p.topic_id inner join forums f on f.id=t.forum_id; + SQL + +authors: post_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_posts.post_search_json (post_id, object) select p.id, jsonb_build_object('author', (case when p.anonymous='t' then null else u.name end)) from posts p left join users u on p.user_id=u.id; + SQL + +post_search_json: + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + drop schema if exists temp_posts cascade; + create schema temp_posts; + create unlogged table temp_posts.post_search_json (post_id bigint not null, object jsonb not null); + create or replace aggregate temp_posts.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}'); + SQL diff --git a/index/reports.mk b/index/reports.mk new file mode 100644 index 00000000..139f194a --- /dev/null +++ b/index/reports.mk @@ -0,0 +1,50 @@ +DATABASE ?= philomena +ELASTICDUMP ?= elasticdump +.ONESHELL: + +all: import_es + +import_es: dump_jsonl + $(ELASTICDUMP) --input=reports.jsonl --output=http://localhost:9200/ --output-index=reports --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)" + +dump_jsonl: metadata image_ids comment_image_ids + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_reports.jsonb_object_agg(object) from temp_reports.report_search_json group by report_id) to stdout;' > reports.jsonl + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_reports cascade;' + sed -i reports.jsonl -e 's/\\\\/\\/g' + +metadata: report_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object( + 'id', r.id, + 'created_at', 
r.created_at, + 'ip', r.ip, + 'state', r.state, + 'user', lower(u.name), + 'user_id', r.user_id, + 'admin', lower(a.name), + 'admin_id', r.admin_id, + 'reportable_type', r.reportable_type, + 'reportable_id', r.reportable_id, + 'fingerprint', r.fingerprint, + 'open', r.open, + 'reason', r.reason + ) from reports r left join users u on r.user_id=u.id left join users a on r.admin_id=a.id; + SQL + +image_ids: report_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object('image_id', r.reportable_id) from reports r where r.reportable_type = 'Image'; + SQL + +comment_image_ids: report_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_reports.report_search_json (report_id, object) select r.id, jsonb_build_object('image_id', c.image_id) from reports r inner join comments c on c.id = r.reportable_id where r.reportable_type = 'Comment'; + SQL + +report_search_json: + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + drop schema if exists temp_reports cascade; + create schema temp_reports; + create unlogged table temp_reports.report_search_json (report_id bigint not null, object jsonb not null); + create or replace aggregate temp_reports.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}'); + SQL diff --git a/index/tags.mk b/index/tags.mk new file mode 100644 index 00000000..0c5413f9 --- /dev/null +++ b/index/tags.mk @@ -0,0 +1,53 @@ +DATABASE ?= philomena +ELASTICDUMP ?= elasticdump +.ONESHELL: + +all: import_es + +import_es: dump_jsonl + $(ELASTICDUMP) --input=tags.jsonl --output=http://localhost:9200/ --output-index=tags --limit 10000 --retryAttempts=5 --type=data --transform="doc._source = Object.assign({},doc)" + +dump_jsonl: metadata aliases implied_tags implied_by_tags + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'copy (select temp_tags.jsonb_object_agg(object) from temp_tags.tag_search_json group by tag_id) to stdout;' > 
tags.jsonl + psql $(DATABASE) -v ON_ERROR_STOP=1 <<< 'drop schema temp_tags cascade;' + sed -i tags.jsonl -e 's/\\\\/\\/g' + +metadata: tag_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_tags.tag_search_json (tag_id, object) select t.id, jsonb_build_object( + 'id', t.id, + 'slug', t.slug, + 'name', t.name, + 'name_in_namespace', t.name_in_namespace, + 'namespace', t.namespace, + 'analyzed_name', t.name, + 'aliased_tag', at.name, + 'category', t.category, + 'aliased', (t.aliased_tag_id is not null), + 'description', t.description, + 'short_description', t.short_description + ) from tags t left join tags at on t.aliased_tag_id=at.id; + SQL + +aliases: tag_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_tags.tag_search_json (tag_id, object) select t.aliased_tag_id, jsonb_build_object('aliases', jsonb_agg(t.name)) from tags t inner join tags at on t.aliased_tag_id=at.id group by t.aliased_tag_id; + SQL + +implied_tags: tag_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_tags.tag_search_json (tag_id, object) select it.tag_id, jsonb_build_object('implied_tag_ids', jsonb_agg(it.implied_tag_id), 'implied_tags', jsonb_agg(t.name)) from tags_implied_tags it inner join tags t on t.id=it.implied_tag_id group by it.tag_id; + SQL + +implied_by_tags: tag_search_json + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + insert into temp_tags.tag_search_json (tag_id, object) select it.implied_tag_id, jsonb_build_object('implied_by_tags', jsonb_agg(t.name)) from tags_implied_tags it inner join tags t on t.id=it.tag_id group by it.implied_tag_id; + SQL + +tag_search_json: + psql $(DATABASE) -v ON_ERROR_STOP=1 <<-SQL + drop schema if exists temp_tags cascade; + create schema temp_tags; + create unlogged table temp_tags.tag_search_json (tag_id bigint not null, object jsonb not null); + create or replace aggregate temp_tags.jsonb_object_agg(jsonb) (sfunc = 'jsonb_concat', stype = jsonb, initcond='{}'); + SQL 
diff --git a/lib/mix/tasks/reindex_all.ex b/lib/mix/tasks/reindex_all.ex index e77e1dd4..274c658d 100644 --- a/lib/mix/tasks/reindex_all.ex +++ b/lib/mix/tasks/reindex_all.ex @@ -19,7 +19,7 @@ defmodule Mix.Tasks.ReindexAll do @shortdoc "Destroys and recreates all Elasticsearch indices." def run(_) do - if Mix.env() == :prod do + if Mix.env() == :prod and not Enum.member?(System.argv(), "--i-know-what-im-doing") do raise "do not run this task in production" end