From 8cb0cbc244dd49cb9c912505d7d719481d542b54 Mon Sep 17 00:00:00 2001 From: "Liam P. White" <byteslice@airmail.cc> Date: Tue, 27 Aug 2019 19:37:26 -0400 Subject: [PATCH] finished parser bundle --- lib/philomena/images/query.ex | 68 +++++++++++++++++- lib/philomena/search/helpers.ex | 18 +++++ lib/philomena/search/lexer.ex | 20 +++--- lib/philomena/search/parser.ex | 121 ++++++++++++++------------------ 4 files changed, 145 insertions(+), 82 deletions(-) diff --git a/lib/philomena/images/query.ex b/lib/philomena/images/query.ex index a4114851..d84f135a 100644 --- a/lib/philomena/images/query.ex +++ b/lib/philomena/images/query.ex @@ -1,5 +1,7 @@ defmodule Philomena.Images.Query do - use Philomena.Search.Parser, + import Philomena.Search.Parser + + defparser "anonymous", int: ~W(id width height comment_count score upvotes downvotes faves uploader_id faved_by_id tag_count), float: ~W(aspect_ratio wilson_score), @@ -16,6 +18,66 @@ defmodule Philomena.Images.Query do "faved_by" => "favourited_by_users", "faved_by_id" => "favourited_by_user_ids" }, - default: "namespaced_tags.name", - name: "anonymous" + default: "namespaced_tags.name" + + defparser "user", + int: + ~W(id width height comment_count score upvotes downvotes faves uploader_id faved_by_id tag_count), + float: ~W(aspect_ratio wilson_score), + date: ~W(created_at updated_at first_seen_at), + literal: ~W(faved_by orig_sha512_hash sha512_hash uploader source_url original_format), + ngram: ~W(description), + custom: ~W(gallery_id my), + transforms: %{ + "gallery_id" => fn _ctx, value -> + %{nested: %{path: :galleries, query: %{term: %{"galleries.id" => value}}}} + end, + "my" => fn + %{user: %{id: id}}, "faves" -> %{term: %{favourited_by_user_ids: id}} + %{user: %{id: id}}, "upvotes" -> %{term: %{upvoter_ids: id}} + %{user: %{id: id}}, "downvotes" -> %{term: %{downvoter_ids: id}} + %{user: _u}, "watched" -> + %{query: %{match_all: %{}}} # todo + end + }, + aliases: %{ + "faved_by" => "favourited_by_users", + "faved_by_id" => "favourited_by_user_ids" + }, + default: "namespaced_tags.name" + + defparser "moderator", + int: + ~W(id width height comment_count score upvotes downvotes faves uploader_id faved_by_id upvoted_by_id downvoted_by_id tag_count true_uploader_id hidden_by_id deleted_by_user-id), + float: ~W(aspect_ratio wilson_score), + date: ~W(created_at updated_at first_seen_at), + literal: ~W(faved_by orig_sha512_hash sha512_hash uploader source_url original_format fingerprint upvoted_by downvoted_by true_uploader hidden_by deleted_by_user), + ngram: ~W(description deletion_reason), + ip: ~W(ip), + bool: ~W(deleted), + custom: ~W(gallery_id my), + transforms: %{ + "gallery_id" => fn _ctx, value -> + %{nested: %{path: :galleries, query: %{term: %{"galleries.id" => value}}}} + end, + "my" => fn + %{user: %{id: id}}, "faves" -> %{term: %{favourited_by_user_ids: id}} + %{user: %{id: id}}, "upvotes" -> %{term: %{upvoter_ids: id}} + %{user: %{id: id}}, "downvotes" -> %{term: %{downvoter_ids: id}} + %{user: _u}, "watched" -> + %{query: %{match_all: %{}}} # todo + end + }, + aliases: %{ + "faved_by" => "favourited_by_users", + "upvoted_by" => "upvoters", + "downvoted_by" => "downvoters", + "faved_by_id" => "favourited_by_user_ids", + "upvoted_by_id" => "upvoter_ids", + "downvoted_by_id" => "downvoter_ids", + "hidden_by" => "hidden_by_users", + "hidden_by_id" => "hidden_by_user_ids", + "deleted" => "hidden_from_users" + }, + default: "namespaced_tags.name" end diff --git a/lib/philomena/search/helpers.ex b/lib/philomena/search/helpers.ex index 96b2abc3..58b4af9e 100644 --- a/lib/philomena/search/helpers.ex +++ b/lib/philomena/search/helpers.ex @@ -125,4 +125,22 @@ defmodule Philomena.Search.Helpers do def full_choice(combinator, choices) do choice(combinator, choices) end + + def contains_wildcard?(value) do + String.match?(value, ~r/(?<!\\)(?:\\\\)*[\*\?]/) + end + + def unescape_wildcard(value) do + # '*' and '?' are wildcard characters in the right context; + # don't unescape them. + Regex.replace(~r/(?<!\\)(?:\\)*([^\\\*\?])/, value, "\\1") + end + + def unescape_regular(value) do + Regex.replace(~r/(?<!\\)(?:\\)*(.)/, value, "\\1") + end + + def process_term(term) do + term |> String.trim() |> String.downcase() + end end diff --git a/lib/philomena/search/lexer.ex b/lib/philomena/search/lexer.ex index ebe7a90c..83b21fe7 100644 --- a/lib/philomena/search/lexer.ex +++ b/lib/philomena/search/lexer.ex @@ -1,5 +1,5 @@ defmodule Philomena.Search.Lexer do - defmacro __using__(opts) do + defmacro deflexer(name, opts) do literal_fields = Keyword.get(opts, :literal, []) |> Macro.expand(__CALLER__) ngram_fields = Keyword.get(opts, :ngram, []) |> Macro.expand(__CALLER__) bool_fields = Keyword.get(opts, :bool, []) |> Macro.expand(__CALLER__) @@ -8,9 +8,8 @@ defmodule Philomena.Search.Lexer do int_fields = Keyword.get(opts, :int, []) |> Macro.expand(__CALLER__) ip_fields = Keyword.get(opts, :ip, []) |> Macro.expand(__CALLER__) custom_fields = Keyword.get(opts, :custom, []) |> Macro.expand(__CALLER__) - lexer_name = :"#{Keyword.fetch!(opts, :name)}_lexer" - quote do + quote location: :keep do import NimbleParsec import Philomena.Search.Helpers @@ -342,29 +341,30 @@ defmodule Philomena.Search.Lexer do quoted_numeric = ignore(quot) |> concat(numeric) |> ignore(quot) stop_words = - choice([ + repeat(space) + |> choice([ string("\\") |> eos(), string(","), - concat(space, l_and), - concat(space, l_or), + l_and, + l_or, rparen, fuzz, boost ]) defcombinatorp( - :text, + unquote(:"#{name}_text"), lookahead_not(stop_words) |> choice([ string("\\") |> utf8_char([]), - string("(") |> parsec(:text) |> string(")"), + string("(") |> parsec(unquote(:"#{name}_text")) |> string(")"), utf8_char([]) ]) |> times(min: 1) ) text = - parsec(:text) + parsec(unquote(:"#{name}_text")) |> reduce({List, :to_string, []}) |> unwrap_and_tag(:text) @@ -462,7 +462,7 @@ defmodule Philomena.Search.Lexer do times(outer, min: 1) |> eos() - defparsec(unquote(lexer_name), search) + defparsec(unquote(:"#{name}_lexer"), search) end end end diff --git a/lib/philomena/search/parser.ex b/lib/philomena/search/parser.ex index 9dbfe024..57e7eb3d 100644 --- a/lib/philomena/search/parser.ex +++ b/lib/philomena/search/parser.ex @@ -1,49 +1,50 @@ defmodule Philomena.Search.Parser do - defmacro __using__(opts) do - lexer_name = :"#{Keyword.fetch!(opts, :name)}_lexer" - parser_name = :"#{Keyword.fetch!(opts, :name)}_parser" + defmacro defparser(name, opts) do field_transforms = Keyword.get(opts, :transforms, %{}) field_aliases = Keyword.get(opts, :aliases, %{}) default_field = Keyword.fetch!(opts, :default) quote location: :keep do - use Philomena.Search.Lexer, unquote(opts) + import Philomena.Search.Lexer + import Philomena.Search.Helpers - def unquote(parser_name)(ctx, input) do - with {:ok, tree, _1, _2, _3, _4} <- unquote(lexer_name)(input) do - parse(ctx, tree) + deflexer unquote(name), unquote(opts) + + def unquote(:"#{name}_parser")(ctx, input) do + with {:ok, tree, _1, _2, _3, _4} <- unquote(:"#{name}_lexer")(input) do + unquote(:"#{name}_parse")(ctx, tree) else {:error, msg, _1, _2, _3, _4} -> {:error, msg} end end - defp parse(ctx, tokens) do - {tree, []} = search_top(ctx, tokens) + defp unquote(:"#{name}_parse")(ctx, tokens) do + {tree, []} = unquote(:"#{name}_top")(ctx, tokens) {:ok, tree} - rescue - e in ArgumentError -> - {:error, e.message} + #rescue + # e in ArgumentError -> + # {:error, e.message} - _ -> - {:error, "Parsing error."} + # _ -> + # {:error, "Parsing error."} end # # Predictive LL(k) parser for search grammar # - defp search_top(ctx, tokens), do: search_or(ctx, tokens) + defp unquote(:"#{name}_top")(ctx, tokens), do: unquote(:"#{name}_or")(ctx, tokens) # # Boolean OR # - defp search_or(ctx, tokens) do - case search_and(ctx, tokens) do + defp unquote(:"#{name}_or")(ctx, tokens) do + case unquote(:"#{name}_and")(ctx, tokens) do {left, [{:or, _} | r_tokens]} -> - {right, rest} = search_or(ctx, r_tokens) + {right, rest} = unquote(:"#{name}_or")(ctx, r_tokens) {%{bool: %{should: [left, right]}}, rest} {child, rest} -> @@ -55,10 +56,10 @@ defmodule Philomena.Search.Parser do # Boolean AND # - defp search_and(ctx, tokens) do - case search_boost(ctx, tokens) do + defp unquote(:"#{name}_and")(ctx, tokens) do + case unquote(:"#{name}_boost")(ctx, tokens) do {left, [{:and, _} | r_tokens]} -> - {right, rest} = search_and(ctx, r_tokens) + {right, rest} = unquote(:"#{name}_and")(ctx, r_tokens) {%{bool: %{must: [left, right]}}, rest} {child, rest} -> @@ -70,8 +71,8 @@ defmodule Philomena.Search.Parser do # Subquery score boosting # - defp search_boost(ctx, tokens) do - case search_not(ctx, tokens) do + defp unquote(:"#{name}_boost")(ctx, tokens) do + case unquote(:"#{name}_not")(ctx, tokens) do {child, [{:boost, _}, {:number, value} | r_tokens]} -> {%{function_score: %{query: child, boost_factor: value}}, r_tokens} @@ -84,20 +85,20 @@ defmodule Philomena.Search.Parser do # Boolean NOT # - defp search_not(ctx, [{:not, _} | r_tokens]) do - {child, rest} = search_top(ctx, r_tokens) + defp unquote(:"#{name}_not")(ctx, [{:not, _} | r_tokens]) do + {child, rest} = unquote(:"#{name}_not")(ctx, r_tokens) {%{bool: %{must_not: child}}, rest} end - defp search_not(ctx, tokens), do: search_group(ctx, tokens) + defp unquote(:"#{name}_not")(ctx, tokens), do: unquote(:"#{name}_group")(ctx, tokens) # # Logical grouping # - defp search_group(ctx, [{:lparen, _} | rest]) do - case search_top(ctx, rest) do + defp unquote(:"#{name}_group")(ctx, [{:lparen, _} | rest]) do + case unquote(:"#{name}_top")(ctx, rest) do {child, [{:rparen, _} | r_tokens]} -> {child, r_tokens} @@ -106,20 +107,20 @@ defmodule Philomena.Search.Parser do end end - defp search_group(_ctx, [{:rparen, _} | _rest]), + defp unquote(:"#{name}_group")(_ctx, [{:rparen, _} | _rest]), do: raise(ArgumentError, "Imbalanced parentheses.") - defp search_group(ctx, tokens), do: search_fuzz(ctx, tokens) + defp unquote(:"#{name}_group")(ctx, tokens), do: unquote(:"#{name}_fuzz")(ctx, tokens) # # Terms and term fuzzing # - defp search_fuzz(ctx, tokens) do + defp unquote(:"#{name}_fuzz")(ctx, tokens) do case tokens do [{:int_field, field}, {:eq, _}, {:int, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> {%{ - range: %{try_alias(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}} + range: %{unquote(:"#{name}_alias")(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}} }, r_tokens} [ @@ -130,7 +131,7 @@ defmodule Philomena.Search.Parser do {:number, fuzz} | r_tokens ] -> {%{ - range: %{try_alias(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}} + range: %{unquote(:"#{name}_alias")(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}} }, r_tokens} [ @@ -140,7 +141,7 @@ defmodule Philomena.Search.Parser do {:fuzz, _}, {:number, fuzz} | r_tokens ] -> - {%{fuzzy: %{try_alias(field) => %{value: value, fuzziness: fuzz}}}, r_tokens} + {%{fuzzy: %{unquote(:"#{name}_alias")(field) => %{value: value, fuzziness: fuzz}}}, r_tokens} [ {:ngram_field, field}, @@ -149,13 +150,13 @@ defmodule Philomena.Search.Parser do {:fuzz, _}, {:number, fuzz} | r_tokens ] -> - {%{fuzzy: %{try_alias(field) => %{value: value, fuzziness: fuzz}}}, r_tokens} + {%{fuzzy: %{unquote(:"#{name}_alias")(field) => %{value: value, fuzziness: fuzz}}}, r_tokens} [{:default, [text: value]}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> {%{fuzzy: %{unquote(default_field) => %{value: value, fuzziness: fuzz}}}, r_tokens} _ -> - search_range(ctx, tokens) + unquote(:"#{name}_range")(ctx, tokens) end end @@ -163,60 +164,60 @@ defmodule Philomena.Search.Parser do # Range queries # - defp search_range(ctx, tokens) do + defp unquote(:"#{name}_range")(ctx, tokens) do case tokens do [{:int_field, field}, {range, _}, {:int, value} | r_tokens] when range in [:gt, :gte, :lt, :lte] -> - {%{range: %{try_alias(field) => %{range => value}}}, r_tokens} + {%{range: %{unquote(:"#{name}_alias")(field) => %{range => value}}}, r_tokens} [{:float_field, field}, {range, _}, {:number, value} | r_tokens] when range in [:gt, :gte, :lt, :lte] -> - {%{range: %{try_alias(field) => %{range => value}}}, r_tokens} + {%{range: %{unquote(:"#{name}_alias")(field) => %{range => value}}}, r_tokens} [{:date_field, field}, {range, _}, {:date, [lower, _higher]} | r_tokens] when range in [:gt, :gte, :lt, :lte] -> - {%{range: %{try_alias(field) => %{range => lower}}}, r_tokens} + {%{range: %{unquote(:"#{name}_alias")(field) => %{range => lower}}}, r_tokens} _ -> - search_custom(ctx, tokens) + unquote(:"#{name}_custom")(ctx, tokens) end end - defp search_custom(ctx, tokens) do + defp unquote(:"#{name}_custom")(ctx, tokens) do case tokens do [{:custom_field, field}, {:text, value} | r_tokens] -> {unquote(field_transforms)[field].(ctx, value), r_tokens} _ -> - search_term(ctx, tokens) + unquote(:"#{name}_term")(ctx, tokens) end end - defp search_term(_ctx, tokens) do + defp unquote(:"#{name}_term")(_ctx, tokens) do case tokens do [{:date_field, field}, {:eq, _}, {:date, [lower, higher]} | r_tokens] -> - {%{range: %{try_alias(field) => %{gte: lower, lte: higher}}}, r_tokens} + {%{range: %{unquote(:"#{name}_alias")(field) => %{gte: lower, lte: higher}}}, r_tokens} [{:ngram_field, field}, {:eq, _}, {:text, value} | r_tokens] -> value = process_term(value) if contains_wildcard?(value) do - {%{wildcard: %{try_alias(field) => unescape_wildcard(value)}}, r_tokens} + {%{wildcard: %{unquote(:"#{name}_alias")(field) => unescape_wildcard(value)}}, r_tokens} else - {%{match: %{try_alias(field) => unescape_regular(value)}}, r_tokens} + {%{match: %{unquote(:"#{name}_alias")(field) => unescape_regular(value)}}, r_tokens} end [{:literal_field, field}, {:eq, _}, {:text, value} | r_tokens] -> value = process_term(value) if contains_wildcard?(value) do - {%{wildcard: %{try_alias(field) => unescape_wildcard(value)}}, r_tokens} + {%{wildcard: %{unquote(:"#{name}_alias")(field) => unescape_wildcard(value)}}, r_tokens} else - {%{term: %{try_alias(field) => unescape_regular(value)}}, r_tokens} + {%{term: %{unquote(:"#{name}_alias")(field) => unescape_regular(value)}}, r_tokens} end [{_field_type, field}, {:eq, _}, {_value_type, value} | r_tokens] -> - {%{term: %{try_alias(field) => value}}, r_tokens} + {%{term: %{unquote(:"#{name}_alias")(field) => value}}, r_tokens} [{:default, [text: value]} | r_tokens] -> value = process_term(value) @@ -232,25 +233,7 @@ defmodule Philomena.Search.Parser do end end - defp contains_wildcard?(value) do - String.match?(value, ~r/(?<!\\)(?:\\\\)*[\*\?]/) - end - - defp unescape_wildcard(value) do - # '*' and '?' are wildcard characters in the right context; - # don't unescape them. - Regex.replace(~r/(?<!\\)(?:\\)*([^\\\*\?])/, value, "\\1") - end - - defp unescape_regular(value) do - Regex.replace(~r/(?<!\\)(?:\\)*(.)/, value, "\\1") - end - - defp process_term(term) do - term |> String.trim() |> String.downcase() - end - - defp try_alias(field) do + defp unquote(:"#{name}_alias")(field) do unquote(field_aliases)[field] || field end end