From a0fa66628a55d518a44517b57c1a805d6fd1a984 Mon Sep 17 00:00:00 2001 From: "Liam P. White" Date: Mon, 26 Aug 2019 20:00:39 -0400 Subject: [PATCH] parser bundle --- lib/philomena/images/query.ex | 16 +- lib/philomena/search/lexer.ex | 13 +- lib/philomena/search/parser.ex | 416 ++++++++++++++++++--------------- 3 files changed, 243 insertions(+), 202 deletions(-) diff --git a/lib/philomena/images/query.ex b/lib/philomena/images/query.ex index 549840dc..a4114851 100644 --- a/lib/philomena/images/query.ex +++ b/lib/philomena/images/query.ex @@ -1,9 +1,21 @@ defmodule Philomena.Images.Query do - use Philomena.Search.Lexer, + use Philomena.Search.Parser, int: ~W(id width height comment_count score upvotes downvotes faves uploader_id faved_by_id tag_count), float: ~W(aspect_ratio wilson_score), date: ~W(created_at updated_at first_seen_at), literal: ~W(faved_by orig_sha512_hash sha512_hash uploader source_url original_format), - ngram: ~W(description) + ngram: ~W(description), + custom: ~W(gallery_id), + transforms: %{ + "gallery_id" => fn _ctx, value -> + %{nested: %{path: :galleries, query: %{term: %{"galleries.id" => value}}}} + end + }, + aliases: %{ + "faved_by" => "favourited_by_users", + "faved_by_id" => "favourited_by_user_ids" + }, + default: "namespaced_tags.name", + name: "anonymous" end diff --git a/lib/philomena/search/lexer.ex b/lib/philomena/search/lexer.ex index 14edce6a..cd430e37 100644 --- a/lib/philomena/search/lexer.ex +++ b/lib/philomena/search/lexer.ex @@ -8,8 +8,9 @@ defmodule Philomena.Search.Lexer do int_fields = Keyword.get(opts, :int, []) |> Macro.expand(__CALLER__) ip_fields = Keyword.get(opts, :ip, []) |> Macro.expand(__CALLER__) custom_fields = Keyword.get(opts, :custom, []) |> Macro.expand(__CALLER__) + lexer_name = :"#{Keyword.fetch!(opts, :name)}_lexer" - quote location: :keep do + quote do import NimbleParsec import Philomena.Search.Helpers @@ -181,10 +182,10 @@ defmodule Philomena.Search.Lexer do |> reduce({List, :to_string, []}) ip_address = - choice([ - ipv4_address |> optional(ipv4_prefix), - ipv6_address |> optional(ipv6_prefix) - ]) + #choice([ + ipv4_address |> optional(ipv4_prefix)#, + #ipv6_address |> optional(ipv6_prefix) + #]) |> reduce({Enum, :join, []}) |> label("a valid IPv4 or IPv6 address and optional CIDR prefix") |> unwrap_and_tag(:ip) @@ -459,7 +460,7 @@ defmodule Philomena.Search.Lexer do times(outer, min: 1) |> eos() - defparsec(:search, search) + defparsec(unquote(lexer_name), search) end end end diff --git a/lib/philomena/search/parser.ex b/lib/philomena/search/parser.ex index c1e35e4d..917501ac 100644 --- a/lib/philomena/search/parser.ex +++ b/lib/philomena/search/parser.ex @@ -1,210 +1,238 @@ defmodule Philomena.Search.Parser do - def parse(ctx, tokens) do - {tree, []} = search_top(ctx, tokens) + defmacro __using__(opts) do + lexer_name = :"#{Keyword.fetch!(opts, :name)}_lexer" + parser_name = :"#{Keyword.fetch!(opts, :name)}_parser" + field_transforms = Keyword.get(opts, :transforms, %{}) + field_aliases = Keyword.get(opts, :aliases, %{}) + default_field = Keyword.fetch!(opts, :default) - {:ok, tree} - rescue - e in ArgumentError -> - {:error, e.message} + quote location: :keep do + use Philomena.Search.Lexer, unquote(opts) - _ -> - {:error, "Parsing error."} - end - - # - # Predictive LL(k) parser for search grammar - # - defp search_top(ctx, tokens), do: search_or(ctx, tokens) - - # - # Boolean OR - # - - defp search_or(ctx, tokens) do - case search_and(ctx, tokens) do - {left, [{:or, _} | r_tokens]} -> - {right, rest} = search_top(ctx, r_tokens) - {%{bool: %{should: [left, right]}}, rest} - - {child, rest} -> - {child, rest} - end - end - - # - # Boolean AND - # - - defp search_and(ctx, tokens) do - case search_boost(ctx, tokens) do - {left, [{:and, _} | r_tokens]} -> - {right, rest} = search_top(ctx, r_tokens) - {%{bool: %{must: [left, right]}}, rest} - - {child, rest} -> - {child, rest} - end - end - - # - # Subquery score boosting - # - - defp search_boost(ctx, tokens) do - case search_not(ctx, tokens) do - {child, [{:boost, _}, {:number, value} | r_tokens]} -> - {%{function_score: %{query: child, boost_factor: value}}, r_tokens} - - {child, rest} -> - {child, rest} - end - end - - # - # Boolean NOT - # - - defp search_not(ctx, [{:not, _} | r_tokens]) do - {child, rest} = search_top(ctx, r_tokens) - - {%{bool: %{must_not: child}}, rest} - end - - defp search_not(ctx, tokens), do: search_group(ctx, tokens) - - # - # Logical grouping - # - - defp search_group(ctx, [{:lparen, _} | rest]) do - case search_top(ctx, rest) do - {child, [{:rparen, _} | r_tokens]} -> - {child, r_tokens} - - _ -> - raise ArgumentError, "Imbalanced parentheses." - end - end - - defp search_group(_ctx, [{:rparen, _} | _rest]), - do: raise(ArgumentError, "Imbalanced parentheses.") - - defp search_group(ctx, tokens), do: search_fuzz(ctx, tokens) - - # - # Terms and term fuzzing - # - - defp search_fuzz(%{default_field: default_field} = ctx, tokens) do - case tokens do - [{:int_field, field}, {:eq, _}, {:int, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> - {%{range: %{field => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}}}, r_tokens} - - [{:float_field, field}, {:eq, _}, {:float, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> - {%{range: %{field => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}}}, r_tokens} - - [{:literal_field, field}, {:eq, _}, {:text, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> - {%{fuzzy: %{field => %{value: value, fuzziness: fuzz}}}, r_tokens} - - [{:ngram_field, field}, {:eq, _}, {:text, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> - {%{fuzzy: %{field => %{value: value, fuzziness: fuzz}}}, r_tokens} - - [{:default, [text: value]}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> - {%{fuzzy: %{default_field => %{value: value, fuzziness: fuzz}}}, r_tokens} - - _ -> - search_range(ctx, tokens) - end - end - - # - # Range queries - # - - defp search_range(ctx, tokens) do - case tokens do - [{:int_field, field}, {range, _}, {:int, value} | r_tokens] - when range in [:gt, :gte, :lt, :lte] -> - {%{range: %{field => %{range => value}}}, r_tokens} - - [{:float_field, field}, {range, _}, {:number, value} | r_tokens] - when range in [:gt, :gte, :lt, :lte] -> - {%{range: %{field => %{range => value}}}, r_tokens} - - [{:date_field, field}, {range, _}, {:date, [lower, _higher]} | r_tokens] - when range in [:gt, :gte, :lt, :lte] -> - {%{range: %{field => %{range => lower}}}, r_tokens} - - _ -> - search_custom(ctx, tokens) - end - end - - defp search_custom(ctx, tokens) do - case tokens do - [{:custom_field, field}, {:text, value} | r_tokens] -> - {ctx[:field_transforms][field].(value), r_tokens} - - _ -> - search_term(ctx, tokens) - end - end - - defp search_term(ctx, tokens) do - case tokens do - [{:date_field, field}, {:eq, _}, {:date, [lower, higher]} | r_tokens] -> - {%{range: %{field => %{gte: lower, lte: higher}}}, r_tokens} - - [{:ngram_field, field}, {:eq, _}, {:text, value} | r_tokens] -> - value = process_term(value) - - if contains_wildcard?(value) do - {%{wildcard: %{field => unescape_wildcard(value)}}, r_tokens} + def unquote(parser_name)(ctx, input) do + with {:ok, tree, _1, _2, _3, _4} <- unquote(lexer_name)(input) do + parse(ctx, tree) else - {%{match: %{field => unescape_regular(value)}}, r_tokens} + {:error, msg, _1, _2, _3, _4} -> + {:error, msg} + + {:error, msg} -> + {:error, msg} end + end - [{:literal_field, field}, {:eq, _}, {:text, value} | r_tokens] -> - value = process_term(value) + defp parse(ctx, tokens) do + {tree, []} = search_top(ctx, tokens) - if contains_wildcard?(value) do - {%{wildcard: %{field => unescape_wildcard(value)}}, r_tokens} - else - {%{term: %{field => unescape_regular(value)}}, r_tokens} + {:ok, tree} + rescue + e in ArgumentError -> + {:error, e.message} + + _ -> + {:error, "Parsing error."} + end + + # + # Predictive LL(k) parser for search grammar + # + defp search_top(ctx, tokens), do: search_or(ctx, tokens) + + # + # Boolean OR + # + + defp search_or(ctx, tokens) do + case search_and(ctx, tokens) do + {left, [{:or, _} | r_tokens]} -> + {right, rest} = search_top(ctx, r_tokens) + {%{bool: %{should: [left, right]}}, rest} + + {child, rest} -> + {child, rest} end + end - [{_field_type, field}, {:eq, _}, {_value_type, value} | r_tokens] -> - {%{term: %{field => value}}, r_tokens} + # + # Boolean AND + # - [{:default, [text: value]} | r_tokens] -> - value = process_term(value) + defp search_and(ctx, tokens) do + case search_boost(ctx, tokens) do + {left, [{:and, _} | r_tokens]} -> + {right, rest} = search_top(ctx, r_tokens) + {%{bool: %{must: [left, right]}}, rest} - if contains_wildcard?(value) do - {%{wildcard: %{ctx[:default_field] => unescape_wildcard(value)}}, r_tokens} - else - {%{term: %{ctx[:default_field] => unescape_regular(value)}}, r_tokens} + {child, rest} -> + {child, rest} end + end - _ -> - raise ArgumentError, "Expected a term" + # + # Subquery score boosting + # + + defp search_boost(ctx, tokens) do + case search_not(ctx, tokens) do + {child, [{:boost, _}, {:number, value} | r_tokens]} -> + {%{function_score: %{query: child, boost_factor: value}}, r_tokens} + + {child, rest} -> + {child, rest} + end + end + + # + # Boolean NOT + # + + defp search_not(ctx, [{:not, _} | r_tokens]) do + {child, rest} = search_top(ctx, r_tokens) + + {%{bool: %{must_not: child}}, rest} + end + + defp search_not(ctx, tokens), do: search_group(ctx, tokens) + + # + # Logical grouping + # + + defp search_group(ctx, [{:lparen, _} | rest]) do + case search_top(ctx, rest) do + {child, [{:rparen, _} | r_tokens]} -> + {child, r_tokens} + + _ -> + raise ArgumentError, "Imbalanced parentheses." + end + end + + defp search_group(_ctx, [{:rparen, _} | _rest]), + do: raise(ArgumentError, "Imbalanced parentheses.") + + defp search_group(ctx, tokens), do: search_fuzz(ctx, tokens) + + # + # Terms and term fuzzing + # + + defp search_fuzz(ctx, tokens) do + case tokens do + [{:int_field, field}, {:eq, _}, {:int, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> + {%{range: %{try_alias(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}}}, r_tokens} + + [{:float_field, field}, {:eq, _}, {:float, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> + {%{range: %{try_alias(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}}}, r_tokens} + + [{:literal_field, field}, {:eq, _}, {:text, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> + {%{fuzzy: %{try_alias(field) => %{value: value, fuzziness: fuzz}}}, r_tokens} + + [{:ngram_field, field}, {:eq, _}, {:text, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> + {%{fuzzy: %{try_alias(field) => %{value: value, fuzziness: fuzz}}}, r_tokens} + + [{:default, [text: value]}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> + {%{fuzzy: %{unquote(default_field) => %{value: value, fuzziness: fuzz}}}, r_tokens} + + _ -> + search_range(ctx, tokens) + end + end + + # + # Range queries + # + + defp search_range(ctx, tokens) do + case tokens do + [{:int_field, field}, {range, _}, {:int, value} | r_tokens] + when range in [:gt, :gte, :lt, :lte] -> + {%{range: %{try_alias(field) => %{range => value}}}, r_tokens} + + [{:float_field, field}, {range, _}, {:number, value} | r_tokens] + when range in [:gt, :gte, :lt, :lte] -> + {%{range: %{try_alias(field) => %{range => value}}}, r_tokens} + + [{:date_field, field}, {range, _}, {:date, [lower, _higher]} | r_tokens] + when range in [:gt, :gte, :lt, :lte] -> + {%{range: %{try_alias(field) => %{range => lower}}}, r_tokens} + + _ -> + search_custom(ctx, tokens) + end + end + + defp search_custom(ctx, tokens) do + case tokens do + [{:custom_field, field}, {:text, value} | r_tokens] -> + {unquote(field_transforms)[field].(ctx, value), r_tokens} + + _ -> + search_term(ctx, tokens) + end + end + + defp search_term(_ctx, tokens) do + case tokens do + [{:date_field, field}, {:eq, _}, {:date, [lower, higher]} | r_tokens] -> + {%{range: %{try_alias(field) => %{gte: lower, lte: higher}}}, r_tokens} + + [{:ngram_field, field}, {:eq, _}, {:text, value} | r_tokens] -> + value = process_term(value) + + if contains_wildcard?(value) do + {%{wildcard: %{try_alias(field) => unescape_wildcard(value)}}, r_tokens} + else + {%{match: %{try_alias(field) => unescape_regular(value)}}, r_tokens} + end + + [{:literal_field, field}, {:eq, _}, {:text, value} | r_tokens] -> + value = process_term(value) + + if contains_wildcard?(value) do + {%{wildcard: %{try_alias(field) => unescape_wildcard(value)}}, r_tokens} + else + {%{term: %{try_alias(field) => unescape_regular(value)}}, r_tokens} + end + + [{_field_type, field}, {:eq, _}, {_value_type, value} | r_tokens] -> + {%{term: %{try_alias(field) => value}}, r_tokens} + + [{:default, [text: value]} | r_tokens] -> + value = process_term(value) + + if contains_wildcard?(value) do + {%{wildcard: %{unquote(default_field) => unescape_wildcard(value)}}, r_tokens} + else + {%{term: %{unquote(default_field) => unescape_regular(value)}}, r_tokens} + end + + _ -> + raise ArgumentError, "Expected a term" + end + end + + defp contains_wildcard?(value) do + String.match?(value, ~r/(? String.trim() |> String.downcase() + end + + defp try_alias(field) do + unquote(field_aliases)[field] || field + end end end - - defp contains_wildcard?(value) do - String.match?(value, ~r/(? String.trim() |> String.downcase() - end end