diff --git a/lib/philomena/search/lexer.ex b/lib/philomena/search/lexer.ex index 56d7bde8..24407a64 100644 --- a/lib/philomena/search/lexer.ex +++ b/lib/philomena/search/lexer.ex @@ -1,6 +1,11 @@ defmodule Philomena.Search.Lexer do def lex(input) do - accept(input |> to_charlist, :outer) + {:ok, accept(input |> to_charlist, :outer)} + rescue + e in ArgumentError -> + {:error, e.message} + _ -> + {:error, "Parsing error."} end # @@ -33,8 +38,8 @@ defmodule Philomena.Search.Lexer do # Quoted term state # - defp accept([?\\, c] ++ rest, term, :quoted_term), do: accept(rest, term <> <>, :quoted_term) - defp accept([?\\], _term, :quoted_term), do: raise ArgumentError, "Unpaired backslash." + defp accept('\\"' ++ rest, term, :quoted_term), do: accept(rest, term <> "\"", :quoted_term) + defp accept('\\', _term, :quoted_term), do: raise ArgumentError, "Unpaired backslash." defp accept('"' ++ rest, term, :quoted_term), do: [{:term, term} | accept(rest, :outer)] defp accept([c] ++ rest, term, :quoted_term), do: accept(rest, term <> <>, :quoted_term) defp accept([], _term, :quoted_term), do: raise ArgumentError, "Imbalanced quotes." @@ -43,22 +48,22 @@ defmodule Philomena.Search.Lexer do # Term state # - defp accept([?\\, c] ++ rest, term, depth, :term), do: accept(rest, term <> <>, depth, :term) + defp accept([?\\, c] ++ rest, term, depth, :term) when c in '()\\', do: accept(rest, term <> <>, depth, :term) defp accept('\\', _term, _depth, :term), do: raise ArgumentError, "Unpaired backslash." defp accept('(' ++ rest, term, depth, :term), do: accept(rest, term <> "(", depth + 1, :term) - defp accept(')' ++ rest, term, 0, :term), do: [{:term, term}, {:rparen, ")"} | accept(rest, :outer)] + defp accept(')' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:rparen, ")"} | accept(rest, :outer)] defp accept(')' ++ rest, term, depth, :term), do: accept(rest, term <> ")", depth - 1, :term) - defp accept(' AND' ++ rest, term, 0, :term), do: [{:term, term}, {:and, "AND"} | accept(rest, :outer)] - defp accept(' OR' ++ rest, term, 0, :term), do: [{:term, term}, {:or, "OR"} | accept(rest, :outer)] - defp accept(' &&' ++ rest, term, 0, :term), do: [{:term, term}, {:and, "&&"} | accept(rest, :outer)] - defp accept(' ||' ++ rest, term, 0, :term), do: [{:term, term}, {:or, "||"} | accept(rest, :outer)] - defp accept(',' ++ rest, term, 0, :term), do: [{:term, term}, {:and, ","} | accept(rest, :outer)] + defp accept(' AND' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:and, "AND"} | accept(rest, :outer)] + defp accept(' OR' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:or, "OR"} | accept(rest, :outer)] + defp accept(' &&' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:and, "&&"} | accept(rest, :outer)] + defp accept(' ||' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:or, "||"} | accept(rest, :outer)] + defp accept(',' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:and, ","} | accept(rest, :outer)] defp accept([?^, c] ++ rest, term, 0, :term) when c in '+-0123456789', do: [{:term, term}, {:boost, "^"} | accept([c | rest], :float)] defp accept([?~, c] ++ rest, term, 0, :term) when c in '+-0123456789', do: [{:term, term}, {:fuzz, "~"} | accept([c | rest], :float)] defp accept('^' ++ rest, term, depth, :term), do: accept(rest, term <> "^", depth, :term) defp accept('~' ++ rest, term, depth, :term), do: accept(rest, term <> "~", depth, :term) defp accept([c] ++ rest, term, depth, :term), do: accept(rest, term <> <>, depth, :term) - defp accept([], term, 0, :term), do: [term: term, eof: "$"] + defp accept([], term, 0, :term), do: [term: String.trim(term), eof: "$"] defp accept([], _term, _depth, :term), do: raise ArgumentError, "Imbalanced parentheses." # @@ -79,6 +84,7 @@ defmodule Philomena.Search.Lexer do defp accept(',' ++ rest, term, :float_w), do: [{:float, to_number(term)}, {:and, ","} | accept(rest, :outer)] defp accept('^' ++ rest, term, :float_w), do: [{:float, to_number(term)}, {:boost, "^"} | accept(rest, :float)] defp accept('~' ++ rest, term, :float_w), do: [{:float, to_number(term)}, {:fuzz, "~"} | accept(rest, :float)] + defp accept(' ' ++ rest, term, :float_w), do: [{:float, to_number(term)} | accept(rest, :outer)] defp accept([], term, :float_w), do: [float: to_number(term), eof: "$"] defp accept(_input, _term, :float_w), do: raise ArgumentError, "Expected a number." @@ -91,6 +97,7 @@ defmodule Philomena.Search.Lexer do defp accept(',' ++ rest, term, :float_f), do: [{:float, to_number(term)}, {:and, ","} | accept(rest, :outer)] defp accept('^' ++ rest, term, :float_f), do: [{:float, to_number(term)}, {:boost, "^"} | accept(rest, :float)] defp accept('~' ++ rest, term, :float_f), do: [{:float, to_number(term)}, {:fuzz, "~"} | accept(rest, :float)] + defp accept(' ' ++ rest, term, :float_f), do: [{:float, to_number(term)} | accept(rest, :outer)] defp accept([], term, :float_f), do: [float: to_number(term), eof: "$"] defp accept(_input, _term, :float_f), do: raise ArgumentError, "Expected a number." diff --git a/lib/philomena/search/lexer_two.ex b/lib/philomena/search/lexer_two.ex new file mode 100644 index 00000000..2041f999 --- /dev/null +++ b/lib/philomena/search/lexer_two.ex @@ -0,0 +1,103 @@ +defmodule Philomena.Search.LexerTwo do + import NimbleParsec + + defp to_number(term) do + {float_val, _} = :string.to_float(term) + {int_val, _} = :string.to_integer(term) + + cond do + is_float(float_val) -> + float_val + is_integer(int_val) -> + int_val + end + end + + l_and = + choice([string("AND"), string("&&"), string(",")]) + |> unwrap_and_tag(:and) + + l_or = + choice([string("OR"), string("||")]) + |> unwrap_and_tag(:or) + + l_not = + choice([string("NOT"), string("!"), string("-")]) + |> unwrap_and_tag(:not) + + lparen = string("(") |> unwrap_and_tag(:lparen) + rparen = string(")") |> unwrap_and_tag(:rparen) + + number = + optional(ascii_char('-+')) + |> ascii_char([?0..?9]) + |> times(min: 1) + |> optional(ascii_char('.') |> ascii_char([?0..?9]) |> times(min: 1)) + |> reduce(:to_number) + + boost = ignore(string("^")) |> unwrap_and_tag(number, :boost) + fuzz = ignore(string("~")) |> unwrap_and_tag(number, :fuzz) + + space = + choice([string(" "), string("\t"), string("\n"), string("\r"), string("\v"), string("\f")]) + |> ignore() + + quot = string("\"") + + quoted_term = + ignore(quot) + |> choice([ + ignore(string("\\")) |> string("\""), + ignore(string("\\")) |> string("\\"), + string("\\") |> utf8_char([]), + utf8_char(not: ?") + ]) + |> times(min: 1) + |> ignore(quot) + |> reduce({List, :to_string, []}) + |> unwrap_and_tag(:term) + + stop_words = choice([ + string("\\") |> eos(), + string(","), + concat(space, l_and), + concat(space, l_or), + concat(space, l_not), + rparen, + fuzz, + boost + ]) + + defcombinatorp :simple_term, + lookahead_not(stop_words) + |> choice([ + string("\\") |> utf8_char([]), + string("(") |> parsec(:simple_term) |> string(")"), + utf8_char([]), + ]) + |> times(min: 1) + + unquoted_term = + parsec(:simple_term) + |> reduce({List, :to_string, []}) + |> unwrap_and_tag(:term) + + outer = choice([ + l_and, + l_or, + l_not, + lparen, + rparen, + boost, + fuzz, + space, + quoted_term, + unquoted_term + ]) + + search = + times(outer, min: 1) + |> eos() + + defparsec :search, search +end \ No newline at end of file diff --git a/lib/philomena/search/parser.ex b/lib/philomena/search/parser.ex new file mode 100644 index 00000000..9fa3d66f --- /dev/null +++ b/lib/philomena/search/parser.ex @@ -0,0 +1,97 @@ +defmodule Philomena.Search.Parser do + alias Philomena.Search.Lexer + + def parse(ctx, tokens) do + {tree, [eof: "$"]} = search_top(ctx, tokens) + + {:ok, tree} + rescue + e in ArgumentError -> + {:error, e.message} + _ -> + {:error, "Parsing error."} + end + + # + # Predictive LL(k) parser for search grammar + # + defp search_top(ctx, tokens), do: search_or(ctx, tokens) + + # + # Boolean OR + # + defp search_or(ctx, tokens) do + case search_and(ctx, tokens) do + {left, [{:or, _} | r_tokens]} -> + {right, rest} = search_top(ctx, r_tokens) + {%{bool: %{should: [left, right]}}, rest} + {child, rest} -> + {child, rest} + end + end + + # + # Boolean AND + # + defp search_and(ctx, tokens) do + case search_boost(ctx, tokens) do + {left, [{:and, _} | r_tokens]} -> + {right, rest} = search_top(ctx, r_tokens) + {%{bool: %{must: [left, right]}}, rest} + {child, rest} -> + {child, rest} + end + end + + # + # Subquery score boosting + # + defp search_boost(ctx, tokens) do + case search_not(ctx, tokens) do + {child, [{:boost, _}, {:float, value} | r_tokens]} -> + {%{function_score: %{query: child, boost_factor: value}}, r_tokens} + {child, rest} -> + {child, rest} + end + end + + # + # Boolean NOT + # + defp search_not(ctx, [{:not, _} | r_tokens]) do + {child, rest} = search_top(ctx, r_tokens) + + {%{bool: %{must_not: child}}, rest} + end + defp search_not(ctx, tokens), do: search_group(ctx, tokens) + + # + # Logical grouping + # + defp search_group(ctx, [{:lparen, _} | rest]) do + case search_top(ctx, rest) do + {child, [{:rparen, _} | r_tokens]} -> + {child, r_tokens} + _ -> + raise ArgumentError, "Imbalanced parentheses." + end + end + defp search_group(_ctx, [{:rparen, _} | _rest]), do: raise ArgumentError, "Imbalanced parentheses." + defp search_group(ctx, tokens), do: search_fuzz(ctx, tokens) + + # + # Term fuzzing + # + defp search_fuzz(ctx, tokens) do + nil + end + + # + # Search terms + # + defp search_term(ctx, [{:term, t} | rest]) do + {TermParser.parse(ctx, t), rest} + end + defp search_term(_ctx, [eof: "$"]), do: raise ArgumentError, "Expected a term, got ." + defp search_term(_ctx, [{_, text} | _rest]), do: raise ArgumentError, "Expected a term, got `#{text}'." +end \ No newline at end of file diff --git a/lib/philomena/search/term_lexer.ex b/lib/philomena/search/term_lexer.ex new file mode 100644 index 00000000..f27532ae --- /dev/null +++ b/lib/philomena/search/term_lexer.ex @@ -0,0 +1,247 @@ +""" +defmodule Philomena.Search.TermLexer do + def lex(opts, input) do + {:ok | accept(opts, input, :literal)} + #rescue + # e in ArgumentError -> + # {:error, e.message} + # _ -> + # {:error, "Parsing error."} + end + + # + # Literal fields + # + + defp accept([field | r_fields], opts, input, :literal_field) do + sz = field |> byte_size + + case input do + <<^field::binary-size(sz), ":", rest::binary>> -> + [{:literal_field, field} | accept(rest, :literal)] + <<^field::binary-size(sz), ".eq:", rest::binary>> -> + [{:literal_field, field} | accept(rest, :literal)] + _ -> + accept(r_fields, opts, input, :literal_field) + end + end + defp accept([], %{boolean_fields: fields} = opts, input, :literal_field), do: accept(fields, opts, input, :boolean_field) + + # + # Boolean fields + # + + defp accept([field | r_fields], opts, input, :boolean_field) do + sz = field |> byte_size + + case input do + <<^field::binary-size(sz), ":", rest::binary>> -> + [{:boolean_field, field} | accept(rest, :boolean)] + <<^field::binary-size(sz), ".eq:", rest::binary>> -> + [{:boolean_field, field} | accept(rest, :boolean)] + _ -> + accept(r_fields, opts, input, :boolean_field) + end + end + defp accept([], %{ngram_fields: fields} = opts, input, :boolean_field), do: accept(fields, opts, input, :ngram_field) + + # + # NLP-analyzed fields + # + + defp accept([field | r_fields], opts, input, :ngram_field) do + sz = field |> byte_size + + case input do + <<^field::binary-size(sz), ":", rest::binary>> -> + [{:ngram_field, field} | accept(rest, :literal)] + <<^field::binary-size(sz), ".eq:", rest::binary>> -> + [{:ngram_field, field} | accept(rest, :literal)] + _ -> + accept(r_fields, opts, input, :ngram_field) + end + end + defp accept([], %{ip_fields: fields} = opts, input, :ngram_field), do: accept(fields, opts, input, :ip_fieldngram + + # + # IP address and CIDR range fields + # + + defp accept([field | r_fields], opts, input, :ip_field) do + sz = field |> byte_size + + case input do + <<^field::binary-size(sz), ":", rest::binary>> -> + [{:ip_field, field} | accept(rest, :ip)] + <<^field::binary-size(sz), ".eq:", rest::binary>> -> + [{:ip_field, field} | accept(rest, :ip)] + _ -> + accept(r_fields, opts, input, :ip_field) + end + end + defp accept([], %{int_fields: fields} = opts, input, :ip_field), do: accept(fields, opts, input, :int_field) + + # + # Integer fields + # + + defp accept([field | r_fields], opts, input, :int_field) do + sz = field |> byte_size + + case input do + <<^field::binary-size(sz), ":", rest::binary>> -> + [{:int_field, field}, {:range, :eq} | accept(rest, :int)] + <<^field::binary-size(sz), ".eq:", rest::binary>> -> + [{:int_field, field}, {:range, :eq} | accept(rest, :int)] + <<^field::binary-size(sz), ".lt:", rest::binary>> -> + [{:int_field, field}, {:range, :lt} | accept(rest, :int)] + <<^field::binary-size(sz), ".lte:", rest::binary>> -> + [{:int_field, field}, {:range, :lte}. accept(rest, :int)] + <<^field::binary-size(sz), ".gt:", rest::binary>> -> + [{:int_field, field}, {:range, :gt} | accept(rest, :int)] + <<^field::binary-size(sz), ".gte:", rest::binary>> -> + [{:int_field, field}, {:range, :gte} | accept(rest, :int)] + _ -> + accept(r_fields, opts, input, :int_field) + end + end + defp accept([], %{float_fields: fields} = opts, input, :int_field), do: accept(fields, opts, input, :float_field) + + # + # Float fields + # + + defp accept([field | r_fields], opts, input, :float_field) do + sz = field |> byte_size + + case input do + <<^field::binary-size(sz), ":", rest::binary>> -> + [{:float_field, field}, {:range, :eq} | accept(rest, :float)] + <<^field::binary-size(sz), ".eq:", rest::binary>> -> + [{:float_field, field}, {:range, :eq} | accept(rest, :float)] + <<^field::binary-size(sz), ".lt:", rest::binary>> -> + [{:float_field, field}, {:range, :lt} | accept(rest, :float)] + <<^field::binary-size(sz), ".lte:", rest::binary>> -> + [{:float_field, field}, {:range, :lte} | accept(rest, :float)] + <<^field::binary-size(sz), ".gt:", rest::binary>> -> + [{:float_field, field}, {:range, :gt} | accept(rest, :float)] + <<^field::binary-size(sz), ".gte:", rest::binary>> -> + [{:float_field, field}, {:range, :gte} | accept(rest, :float)] + _ -> + accept(r_fields, opts, input, :float_field) + end + end + defp accept([], %{date_fields: fields} = opts, input, :float_field), do: accept(fields, opts, input, :date_field) + + # + # Date fields + # + + defp accept([field | r_fields], opts, input, :date_field) do + sz = field |> byte_size + + case input do + <<^field::binary-size(sz), ":", rest::binary>> -> + [{:date_field, field}, {:range, :eq} | accept(rest, :date)] + <<^field::binary-size(sz), ".eq:", rest::binary>> -> + [{:date_field, field}, {:range, :eq} | accept(rest, :date)] + <<^field::binary-size(sz), ".lt:", rest::binary>> -> + [{:date_field, field}, {:range, :lt} | accept(rest, :date)] + <<^field::binary-size(sz), ".lte:", rest::binary>> -> + [{:date_field, field}, {:range, :lte} | accept(rest, :date)] + <<^field::binary-size(sz), ".gt:", rest::binary>> -> + [{:date_field, field}, {:range, :gt} | accept(rest, :date)] + <<^field::binary-size(sz), ".gte:", rest::binary>> -> + [{:date_field, field}, {:range, :gte} | accept(rest, :date)] + _ -> + accept(r_fields, opts, input, :date_field) + end + end + + # + # Default field handling + # + + defp accept([], %{default_field: field} = opts, input, :date_field) do + [{:literal_field, field} | accept(input, :literal)] + end + + # + # Text and wildcarded text + # + + defp accept(input, :literal), do: accept(input, "", :literal) + + defp accept(<<"\\", c::utf8, rest::binary>>, term, :literal), do: accept(rest, term <> <>, :literal) + defp accept(<>, term, :literal) when c in '*?', do: accept(rest, term <> <>, :wildcard) + defp accept(<>, term, :literal), do: accept(rest, term <> <>, :literal) + defp accept(<<>>, term, :literal), do: [literal: term] + + defp accept(<<"\\", c::utf8, rest::binary>>, term, :wildcard) when c in '*?', do: accept(rest, term <> <<"\\", c::utf8>>, :wildcard) + defp accept(<<"\\", c::utf8, rest::binary>>, term, :wildcard), do: accept(rest, term <> <>, :wildcard) + defp accept(<>, term, :wildcard), do: accept(rest, term <> <>, :wildcard) + defp accept(<<>>, term, :wildcard), do: [wildcard: term] + + # + # Booleans + # + + defp accept("true", :boolean), do: [boolean: true] + defp accept("false", :boolean), do: [boolean: false] + defp accept(input, :boolean), do: raise ArgumentError, "Expected a boolean, got `\#{input}'." + + # + # Floats (integers are also considered valid) + # + + defp accept(<<"+", rest::binary>>, :float), do: accept(rest, "", :float_w) + defp accept(<<"-", rest::binary>>, :float), do: accept(rest, "-", :float_w) + defp accept(input, :float), do: accept(input, "", :float_w) + + defp accept(<>, term, :float_w) when c in ?0..?9, do: accept(rest, term <> <>, :float_w) + defp accept(<<".", rest::binary>>, term, :float_w), do: accept(rest, term <> ".", :float_f) + defp accept(<>, term, :float_w), do: raise ArgumentError, "Expected a float, got `\#{<>}'." + defp accept(<<>>, term, :float_w), do: [float: to_number(term)] + + defp accept(<>, term, :float_f) when c in ?0..?9, do: accept(rest, term <> <>, :float_f) + defp accept(<>, term, :float_f), do: raise ArgumentError, "Expected a float, got `\#{<>}'." + defp accept(<<>>, term, :float_f), do: [float: to_number(term)] + + # + # Integers + # + + defp accept(<<"+", rest::binary>>, :int), do: accept(rest, "", :int_w) + defp accept(<<"-", rest::binary>>, :int), do: accept(rest, "-", :int_w) + defp accept(input, :int), do: accept(input, "", :int_w) + + defp accept(<>, term, :int_w) when c in ?0..?9, do: accept(rest, term <> <>, :int_w) + defp accept(<>, term, :int_w), do: raise ArgumentError, "Expected an integer, got `\#{<>}'." + defp accept(<<>>, term, :int_w), do: [int: to_number(term)] + + # + # IP addresses + # + + defp accept(<>, :ip) when c1 in ?0..9 and c2 in ?0..?9 and c3 in ?0..?9, do: accept({}) + defp accept(<<"::ffff:", c1::utf8, c2::utf8, c3::utf8, ".", rest::binary>>, :ip) when c1 in ?0..9 and c2 in ?0..?9 and c3 in ?0..?9, do: accept({}) + defp accept(<>, :ip) when c1 in ?0..9 and c2 in ?0..?9, do: accept({}) + defp accept(<<"::ffff:",c1::utf8, c2::utf8, ".", rest::binary>>, :ip) when c1 in ?0..9 and c2 in ?0..?9, do: accept({}) + defp accept(<>, :ip) when c1 in ?0..9, do: accept({}) + defp accept(<<"::ffff:", c1::utf8, ".", rest::binary>>, :ip) when c1 in ?0..9, do: accept({}) + + defp to_number(term) do + {float_val, _} = :string.to_float(term) + {int_val, _} = :string.to_integer(term) + + cond do + is_float(float_val) -> + float_val + is_integer(int_val) -> + int_val + true -> + raise ArgumentError, "Expected a number." + end + end +end +""" \ No newline at end of file diff --git a/mix.exs b/mix.exs index f3ceb164..38b82378 100644 --- a/mix.exs +++ b/mix.exs @@ -49,7 +49,8 @@ defmodule Philomena.MixProject do {:bcrypt_elixir, "~> 2.0"}, {:pot, "~> 0.10.1"}, {:secure_compare, "~> 0.1.0"}, - {:elastix, "~> 0.7.1"} + {:elastix, "~> 0.7.1"}, + {:nimble_parsec, "~> 0.5.1"}, ] end diff --git a/mix.lock b/mix.lock index c9a7a73f..af277f52 100644 --- a/mix.lock +++ b/mix.lock @@ -22,6 +22,7 @@ "mime": {:hex, :mime, "1.3.1", "30ce04ab3175b6ad0bdce0035cba77bba68b813d523d1aac73d9781b4d193cf8", [:mix], [], "hexpm"}, "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm"}, "neotoma": {:hex, :neotoma, "1.7.3", "d8bd5404b73273989946e4f4f6d529e5c2088f5fa1ca790b4dbe81f4be408e61", [:rebar], [], "hexpm"}, + "nimble_parsec": {:hex, :nimble_parsec, "0.5.1", "c90796ecee0289dbb5ad16d3ad06f957b0cd1199769641c961cfe0b97db190e0", [:mix], [], "hexpm"}, "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"}, "pbkdf2": {:hex, :pbkdf2, "2.0.0", "11c23279fded5c0027ab3996cfae77805521d7ef4babde2bd7ec04a9086cf499", [:rebar3], [], "hexpm"}, "phoenix": {:hex, :phoenix, "1.4.9", "746d098e10741c334d88143d3c94cab1756435f94387a63441792e66ec0ee974", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 1.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.8.1 or ~> 1.9", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm"},