move to combinator approach

This commit is contained in:
byte[] 2019-08-21 20:42:07 -04:00
parent 1ab0db43c7
commit 0188a27893
6 changed files with 468 additions and 12 deletions

View file

@ -1,6 +1,11 @@
defmodule Philomena.Search.Lexer do
def lex(input) do
accept(input |> to_charlist, :outer)
{:ok, accept(input |> to_charlist, :outer)}
rescue
e in ArgumentError ->
{:error, e.message}
_ ->
{:error, "Parsing error."}
end
#
@ -33,8 +38,8 @@ defmodule Philomena.Search.Lexer do
# Quoted term state
#
defp accept([?\\, c] ++ rest, term, :quoted_term), do: accept(rest, term <> <<c::utf8>>, :quoted_term)
defp accept([?\\], _term, :quoted_term), do: raise ArgumentError, "Unpaired backslash."
defp accept('\\"' ++ rest, term, :quoted_term), do: accept(rest, term <> "\"", :quoted_term)
defp accept('\\', _term, :quoted_term), do: raise ArgumentError, "Unpaired backslash."
defp accept('"' ++ rest, term, :quoted_term), do: [{:term, term} | accept(rest, :outer)]
defp accept([c] ++ rest, term, :quoted_term), do: accept(rest, term <> <<c::utf8>>, :quoted_term)
defp accept([], _term, :quoted_term), do: raise ArgumentError, "Imbalanced quotes."
@ -43,22 +48,22 @@ defmodule Philomena.Search.Lexer do
# Term state
#
defp accept([?\\, c] ++ rest, term, depth, :term), do: accept(rest, term <> <<c::utf8>>, depth, :term)
defp accept([?\\, c] ++ rest, term, depth, :term) when c in '()\\', do: accept(rest, term <> <<c::utf8>>, depth, :term)
defp accept('\\', _term, _depth, :term), do: raise ArgumentError, "Unpaired backslash."
defp accept('(' ++ rest, term, depth, :term), do: accept(rest, term <> "(", depth + 1, :term)
defp accept(')' ++ rest, term, 0, :term), do: [{:term, term}, {:rparen, ")"} | accept(rest, :outer)]
defp accept(')' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:rparen, ")"} | accept(rest, :outer)]
defp accept(')' ++ rest, term, depth, :term), do: accept(rest, term <> ")", depth - 1, :term)
defp accept(' AND' ++ rest, term, 0, :term), do: [{:term, term}, {:and, "AND"} | accept(rest, :outer)]
defp accept(' OR' ++ rest, term, 0, :term), do: [{:term, term}, {:or, "OR"} | accept(rest, :outer)]
defp accept(' &&' ++ rest, term, 0, :term), do: [{:term, term}, {:and, "&&"} | accept(rest, :outer)]
defp accept(' ||' ++ rest, term, 0, :term), do: [{:term, term}, {:or, "||"} | accept(rest, :outer)]
defp accept(',' ++ rest, term, 0, :term), do: [{:term, term}, {:and, ","} | accept(rest, :outer)]
defp accept(' AND' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:and, "AND"} | accept(rest, :outer)]
defp accept(' OR' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:or, "OR"} | accept(rest, :outer)]
defp accept(' &&' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:and, "&&"} | accept(rest, :outer)]
defp accept(' ||' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:or, "||"} | accept(rest, :outer)]
defp accept(',' ++ rest, term, 0, :term), do: [{:term, String.trim(term)}, {:and, ","} | accept(rest, :outer)]
# A boost (`^`) or fuzz (`~`) marker followed by a sign or digit ends the current term.
# The term is trimmed here like in every other term-terminating clause of this state,
# so "foo ^2" and "foo^2" yield the same `{:term, "foo"}` token.
defp accept([?^, c] ++ rest, term, 0, :term) when c in '+-0123456789', do: [{:term, String.trim(term)}, {:boost, "^"} | accept([c | rest], :float)]
defp accept([?~, c] ++ rest, term, 0, :term) when c in '+-0123456789', do: [{:term, String.trim(term)}, {:fuzz, "~"} | accept([c | rest], :float)]
defp accept('^' ++ rest, term, depth, :term), do: accept(rest, term <> "^", depth, :term)
defp accept('~' ++ rest, term, depth, :term), do: accept(rest, term <> "~", depth, :term)
defp accept([c] ++ rest, term, depth, :term), do: accept(rest, term <> <<c::utf8>>, depth, :term)
defp accept([], term, 0, :term), do: [term: term, eof: "$"]
defp accept([], term, 0, :term), do: [term: String.trim(term), eof: "$"]
defp accept([], _term, _depth, :term), do: raise ArgumentError, "Imbalanced parentheses."
#
@ -79,6 +84,7 @@ defmodule Philomena.Search.Lexer do
defp accept(',' ++ rest, term, :float_w), do: [{:float, to_number(term)}, {:and, ","} | accept(rest, :outer)]
defp accept('^' ++ rest, term, :float_w), do: [{:float, to_number(term)}, {:boost, "^"} | accept(rest, :float)]
defp accept('~' ++ rest, term, :float_w), do: [{:float, to_number(term)}, {:fuzz, "~"} | accept(rest, :float)]
defp accept(' ' ++ rest, term, :float_w), do: [{:float, to_number(term)} | accept(rest, :outer)]
defp accept([], term, :float_w), do: [float: to_number(term), eof: "$"]
defp accept(_input, _term, :float_w), do: raise ArgumentError, "Expected a number."
@ -91,6 +97,7 @@ defmodule Philomena.Search.Lexer do
defp accept(',' ++ rest, term, :float_f), do: [{:float, to_number(term)}, {:and, ","} | accept(rest, :outer)]
defp accept('^' ++ rest, term, :float_f), do: [{:float, to_number(term)}, {:boost, "^"} | accept(rest, :float)]
defp accept('~' ++ rest, term, :float_f), do: [{:float, to_number(term)}, {:fuzz, "~"} | accept(rest, :float)]
defp accept(' ' ++ rest, term, :float_f), do: [{:float, to_number(term)} | accept(rest, :outer)]
defp accept([], term, :float_f), do: [float: to_number(term), eof: "$"]
defp accept(_input, _term, :float_f), do: raise ArgumentError, "Expected a number."

View file

@ -0,0 +1,103 @@
defmodule Philomena.Search.LexerTwo do
  # Combinator-based lexer for the search syntax, built with NimbleParsec.
  # `search/1` (generated by `defparsec` below) turns an input binary into a flat
  # list of tagged tokens: :and, :or, :not, :lparen, :rparen, :boost, :fuzz, :term.
  import NimbleParsec

  # Converts the characters matched by `number` (a charlist such as '-1.5' or '42')
  # into a float when it has a fractional part, otherwise into an integer.
  # Raises ArgumentError if the text is neither, instead of failing with a
  # CondClauseError.
  defp to_number(term) do
    {float_val, _} = :string.to_float(term)
    {int_val, _} = :string.to_integer(term)

    cond do
      is_float(float_val) ->
        float_val

      is_integer(int_val) ->
        int_val

      true ->
        raise ArgumentError, "Expected a number."
    end
  end

  l_and =
    choice([string("AND"), string("&&"), string(",")])
    |> unwrap_and_tag(:and)

  l_or =
    choice([string("OR"), string("||")])
    |> unwrap_and_tag(:or)

  l_not =
    choice([string("NOT"), string("!"), string("-")])
    |> unwrap_and_tag(:not)

  lparen = string("(") |> unwrap_and_tag(:lparen)
  rparen = string(")") |> unwrap_and_tag(:rparen)

  # Optional sign, one or more digits, optional fractional part.
  # `times/3` is given the digit combinator explicitly: piping into `times(min: 1)`
  # would repeat the *entire* preceding pipeline (sign included), accepting
  # input such as "1-2" or "1.2.3" and rejecting "1.25".
  number =
    optional(ascii_char([?-, ?+]))
    |> times(ascii_char([?0..?9]), min: 1)
    |> optional(ascii_char([?.]) |> times(ascii_char([?0..?9]), min: 1))
    |> reduce(:to_number)

  boost = ignore(string("^")) |> unwrap_and_tag(number, :boost)
  fuzz = ignore(string("~")) |> unwrap_and_tag(number, :fuzz)

  space =
    choice([string(" "), string("\t"), string("\n"), string("\r"), string("\v"), string("\f")])
    |> ignore()

  quot = string("\"")

  # A double-quoted term. `\"` and `\\` are unescaped; any other backslash
  # sequence is kept verbatim. Only the character alternatives are repeated,
  # so the opening quote is consumed exactly once.
  quoted_term =
    ignore(quot)
    |> times(
      choice([
        ignore(string("\\")) |> string("\""),
        ignore(string("\\")) |> string("\\"),
        string("\\") |> utf8_char([]),
        utf8_char(not: ?")
      ]),
      min: 1
    )
    |> ignore(quot)
    |> reduce({List, :to_string, []})
    |> unwrap_and_tag(:term)

  # Sequences that terminate an unquoted term.
  stop_words =
    choice([
      string("\\") |> eos(),
      string(","),
      concat(space, l_and),
      concat(space, l_or),
      concat(space, l_not),
      rparen,
      fuzz,
      boost
    ])

  # One or more term characters. Here `times(min: 1)` intentionally wraps the
  # lookahead together with the choice, so the stop-word check runs before
  # every character. Parenthesised groups recurse so balanced parentheses
  # stay inside the term.
  defcombinatorp(
    :simple_term,
    lookahead_not(stop_words)
    |> choice([
      string("\\") |> utf8_char([]),
      string("(") |> parsec(:simple_term) |> string(")"),
      utf8_char([])
    ])
    |> times(min: 1)
  )

  unquoted_term =
    parsec(:simple_term)
    |> reduce({List, :to_string, []})
    |> unwrap_and_tag(:term)

  # Alternatives are tried in order; operators and punctuation take priority
  # over terms.
  outer =
    choice([
      l_and,
      l_or,
      l_not,
      lparen,
      rparen,
      boost,
      fuzz,
      space,
      quoted_term,
      unquoted_term
    ])

  search =
    times(outer, min: 1)
    |> eos()

  defparsec(:search, search)
end

View file

@ -0,0 +1,97 @@
defmodule Philomena.Search.Parser do
  # Recursive-descent parser that turns a token list into a nested query map.
  # Precedence, loosest to tightest: OR, AND, boost, NOT, grouping, term.
  alias Philomena.Search.Lexer

  # Parses `tokens` (expected to end with `{:eof, "$"}`) into a query tree.
  # Returns `{:ok, tree}` or `{:error, message}`.
  def parse(ctx, tokens) do
    {tree, [eof: "$"]} = search_top(ctx, tokens)

    {:ok, tree}
  rescue
    e in ArgumentError ->
      {:error, e.message}

    # Any other failure (e.g. leftover tokens failing the match above) is
    # reported as a generic parsing error.
    _ ->
      {:error, "Parsing error."}
  end

  #
  # Predictive LL(k) parser for search grammar
  #
  defp search_top(ctx, tokens), do: search_or(ctx, tokens)

  #
  # Boolean OR
  #
  defp search_or(ctx, tokens) do
    case search_and(ctx, tokens) do
      {left, [{:or, _} | r_tokens]} ->
        {right, rest} = search_or(ctx, r_tokens)
        {%{bool: %{should: [left, right]}}, rest}

      {child, rest} ->
        {child, rest}
    end
  end

  #
  # Boolean AND
  #
  defp search_and(ctx, tokens) do
    case search_boost(ctx, tokens) do
      {left, [{:and, _} | r_tokens]} ->
        # Recurse at AND level (not the top) so AND binds tighter than OR:
        # `a AND b OR c` is `(a AND b) OR c`.
        {right, rest} = search_and(ctx, r_tokens)
        {%{bool: %{must: [left, right]}}, rest}

      {child, rest} ->
        {child, rest}
    end
  end

  #
  # Subquery score boosting
  #
  defp search_boost(ctx, tokens) do
    case search_not(ctx, tokens) do
      {child, [{:boost, _}, {:float, value} | r_tokens]} ->
        {%{function_score: %{query: child, boost_factor: value}}, r_tokens}

      {child, rest} ->
        {child, rest}
    end
  end

  #
  # Boolean NOT
  #
  defp search_not(ctx, [{:not, _} | r_tokens]) do
    # Negate only the immediately following operand so NOT binds tighter
    # than AND/OR: `NOT a OR b` is `(NOT a) OR b`.
    {child, rest} = search_not(ctx, r_tokens)

    {%{bool: %{must_not: child}}, rest}
  end

  defp search_not(ctx, tokens), do: search_group(ctx, tokens)

  #
  # Logical grouping
  #
  defp search_group(ctx, [{:lparen, _} | rest]) do
    case search_top(ctx, rest) do
      {child, [{:rparen, _} | r_tokens]} ->
        {child, r_tokens}

      _ ->
        raise ArgumentError, "Imbalanced parentheses."
    end
  end

  defp search_group(_ctx, [{:rparen, _} | _rest]),
    do: raise(ArgumentError, "Imbalanced parentheses.")

  defp search_group(ctx, tokens), do: search_fuzz(ctx, tokens)

  #
  # Term fuzzing
  #
  # TODO: fuzz tokens (`{:fuzz, _}`) are not interpreted yet. Until they are,
  # delegate to the term parser so the descent yields a `{tree, rest}` pair
  # instead of `nil`.
  defp search_fuzz(ctx, tokens), do: search_term(ctx, tokens)

  #
  # Search terms
  #
  # NOTE(review): `TermParser` is neither aliased nor defined in this file —
  # confirm the intended module.
  defp search_term(ctx, [{:term, t} | rest]) do
    {TermParser.parse(ctx, t), rest}
  end

  defp search_term(_ctx, eof: "$"),
    do: raise(ArgumentError, "Expected a term, got <end of input>.")

  defp search_term(_ctx, [{_, text} | _rest]),
    do: raise(ArgumentError, "Expected a term, got `#{text}'.")
end

View file

@ -0,0 +1,247 @@
"""
defmodule Philomena.Search.TermLexer do
def lex(opts, input) do
{:ok, accept(opts, input, :literal)}
#rescue
# e in ArgumentError ->
# {:error, e.message}
# _ ->
# {:error, "Parsing error."}
end
#
# Literal fields
#
defp accept([field | r_fields], opts, input, :literal_field) do
sz = field |> byte_size
case input do
<<^field::binary-size(sz), ":", rest::binary>> ->
[{:literal_field, field} | accept(rest, :literal)]
<<^field::binary-size(sz), ".eq:", rest::binary>> ->
[{:literal_field, field} | accept(rest, :literal)]
_ ->
accept(r_fields, opts, input, :literal_field)
end
end
defp accept([], %{boolean_fields: fields} = opts, input, :literal_field), do: accept(fields, opts, input, :boolean_field)
#
# Boolean fields
#
defp accept([field | r_fields], opts, input, :boolean_field) do
sz = field |> byte_size
case input do
<<^field::binary-size(sz), ":", rest::binary>> ->
[{:boolean_field, field} | accept(rest, :boolean)]
<<^field::binary-size(sz), ".eq:", rest::binary>> ->
[{:boolean_field, field} | accept(rest, :boolean)]
_ ->
accept(r_fields, opts, input, :boolean_field)
end
end
defp accept([], %{ngram_fields: fields} = opts, input, :boolean_field), do: accept(fields, opts, input, :ngram_field)
#
# NLP-analyzed fields
#
defp accept([field | r_fields], opts, input, :ngram_field) do
sz = field |> byte_size
case input do
<<^field::binary-size(sz), ":", rest::binary>> ->
[{:ngram_field, field} | accept(rest, :literal)]
<<^field::binary-size(sz), ".eq:", rest::binary>> ->
[{:ngram_field, field} | accept(rest, :literal)]
_ ->
accept(r_fields, opts, input, :ngram_field)
end
end
defp accept([], %{ip_fields: fields} = opts, input, :ngram_field), do: accept(fields, opts, input, :ip_field)
#
# IP address and CIDR range fields
#
defp accept([field | r_fields], opts, input, :ip_field) do
sz = field |> byte_size
case input do
<<^field::binary-size(sz), ":", rest::binary>> ->
[{:ip_field, field} | accept(rest, :ip)]
<<^field::binary-size(sz), ".eq:", rest::binary>> ->
[{:ip_field, field} | accept(rest, :ip)]
_ ->
accept(r_fields, opts, input, :ip_field)
end
end
defp accept([], %{int_fields: fields} = opts, input, :ip_field), do: accept(fields, opts, input, :int_field)
#
# Integer fields
#
defp accept([field | r_fields], opts, input, :int_field) do
sz = field |> byte_size
case input do
<<^field::binary-size(sz), ":", rest::binary>> ->
[{:int_field, field}, {:range, :eq} | accept(rest, :int)]
<<^field::binary-size(sz), ".eq:", rest::binary>> ->
[{:int_field, field}, {:range, :eq} | accept(rest, :int)]
<<^field::binary-size(sz), ".lt:", rest::binary>> ->
[{:int_field, field}, {:range, :lt} | accept(rest, :int)]
<<^field::binary-size(sz), ".lte:", rest::binary>> ->
[{:int_field, field}, {:range, :lte} | accept(rest, :int)]
<<^field::binary-size(sz), ".gt:", rest::binary>> ->
[{:int_field, field}, {:range, :gt} | accept(rest, :int)]
<<^field::binary-size(sz), ".gte:", rest::binary>> ->
[{:int_field, field}, {:range, :gte} | accept(rest, :int)]
_ ->
accept(r_fields, opts, input, :int_field)
end
end
defp accept([], %{float_fields: fields} = opts, input, :int_field), do: accept(fields, opts, input, :float_field)
#
# Float fields
#
defp accept([field | r_fields], opts, input, :float_field) do
sz = field |> byte_size
case input do
<<^field::binary-size(sz), ":", rest::binary>> ->
[{:float_field, field}, {:range, :eq} | accept(rest, :float)]
<<^field::binary-size(sz), ".eq:", rest::binary>> ->
[{:float_field, field}, {:range, :eq} | accept(rest, :float)]
<<^field::binary-size(sz), ".lt:", rest::binary>> ->
[{:float_field, field}, {:range, :lt} | accept(rest, :float)]
<<^field::binary-size(sz), ".lte:", rest::binary>> ->
[{:float_field, field}, {:range, :lte} | accept(rest, :float)]
<<^field::binary-size(sz), ".gt:", rest::binary>> ->
[{:float_field, field}, {:range, :gt} | accept(rest, :float)]
<<^field::binary-size(sz), ".gte:", rest::binary>> ->
[{:float_field, field}, {:range, :gte} | accept(rest, :float)]
_ ->
accept(r_fields, opts, input, :float_field)
end
end
defp accept([], %{date_fields: fields} = opts, input, :float_field), do: accept(fields, opts, input, :date_field)
#
# Date fields
#
defp accept([field | r_fields], opts, input, :date_field) do
sz = field |> byte_size
case input do
<<^field::binary-size(sz), ":", rest::binary>> ->
[{:date_field, field}, {:range, :eq} | accept(rest, :date)]
<<^field::binary-size(sz), ".eq:", rest::binary>> ->
[{:date_field, field}, {:range, :eq} | accept(rest, :date)]
<<^field::binary-size(sz), ".lt:", rest::binary>> ->
[{:date_field, field}, {:range, :lt} | accept(rest, :date)]
<<^field::binary-size(sz), ".lte:", rest::binary>> ->
[{:date_field, field}, {:range, :lte} | accept(rest, :date)]
<<^field::binary-size(sz), ".gt:", rest::binary>> ->
[{:date_field, field}, {:range, :gt} | accept(rest, :date)]
<<^field::binary-size(sz), ".gte:", rest::binary>> ->
[{:date_field, field}, {:range, :gte} | accept(rest, :date)]
_ ->
accept(r_fields, opts, input, :date_field)
end
end
#
# Default field handling
#
defp accept([], %{default_field: field} = opts, input, :date_field) do
[{:literal_field, field} | accept(input, :literal)]
end
#
# Text and wildcarded text
#
defp accept(input, :literal), do: accept(input, "", :literal)
defp accept(<<"\\", c::utf8, rest::binary>>, term, :literal), do: accept(rest, term <> <<c::utf8>>, :literal)
defp accept(<<c::utf8, rest::binary>>, term, :literal) when c in '*?', do: accept(rest, term <> <<c::utf8>>, :wildcard)
defp accept(<<c::utf8, rest::binary>>, term, :literal), do: accept(rest, term <> <<c::utf8>>, :literal)
defp accept(<<>>, term, :literal), do: [literal: term]
defp accept(<<"\\", c::utf8, rest::binary>>, term, :wildcard) when c in '*?', do: accept(rest, term <> <<"\\", c::utf8>>, :wildcard)
defp accept(<<"\\", c::utf8, rest::binary>>, term, :wildcard), do: accept(rest, term <> <<c::utf8>>, :wildcard)
defp accept(<<c::utf8, rest::binary>>, term, :wildcard), do: accept(rest, term <> <<c::utf8>>, :wildcard)
defp accept(<<>>, term, :wildcard), do: [wildcard: term]
#
# Booleans
#
defp accept("true", :boolean), do: [boolean: true]
defp accept("false", :boolean), do: [boolean: false]
defp accept(input, :boolean), do: raise ArgumentError, "Expected a boolean, got `\#{input}'."
#
# Floats (integers are also considered valid)
#
defp accept(<<"+", rest::binary>>, :float), do: accept(rest, "", :float_w)
defp accept(<<"-", rest::binary>>, :float), do: accept(rest, "-", :float_w)
defp accept(input, :float), do: accept(input, "", :float_w)
defp accept(<<c::utf8, rest::binary>>, term, :float_w) when c in ?0..?9, do: accept(rest, term <> <<c::utf8>>, :float_w)
defp accept(<<".", rest::binary>>, term, :float_w), do: accept(rest, term <> ".", :float_f)
defp accept(<<c::utf8, rest::binary>>, term, :float_w), do: raise ArgumentError, "Expected a float, got `\#{<<term::binary, c::utf8, rest::binary>>}'."
defp accept(<<>>, term, :float_w), do: [float: to_number(term)]
defp accept(<<c::utf8, rest::binary>>, term, :float_f) when c in ?0..?9, do: accept(rest, term <> <<c::utf8>>, :float_f)
defp accept(<<c::utf8, rest::binary>>, term, :float_f), do: raise ArgumentError, "Expected a float, got `\#{<<term::binary, c::utf8, rest::binary>>}'."
defp accept(<<>>, term, :float_f), do: [float: to_number(term)]
#
# Integers
#
defp accept(<<"+", rest::binary>>, :int), do: accept(rest, "", :int_w)
defp accept(<<"-", rest::binary>>, :int), do: accept(rest, "-", :int_w)
defp accept(input, :int), do: accept(input, "", :int_w)
defp accept(<<c::utf8, rest::binary>>, term, :int_w) when c in ?0..?9, do: accept(rest, term <> <<c::utf8>>, :int_w)
defp accept(<<c::utf8, rest::binary>>, term, :int_w), do: raise ArgumentError, "Expected an integer, got `\#{<<term::binary, c::utf8, rest::binary>>}'."
defp accept(<<>>, term, :int_w), do: [int: to_number(term)]
#
# IP addresses
#
defp accept(<<c1::utf8, c2::utf8, c3::utf8, ".", rest::binary>>, :ip) when c1 in ?0..?9 and c2 in ?0..?9 and c3 in ?0..?9, do: accept({})
defp accept(<<"::ffff:", c1::utf8, c2::utf8, c3::utf8, ".", rest::binary>>, :ip) when c1 in ?0..?9 and c2 in ?0..?9 and c3 in ?0..?9, do: accept({})
defp accept(<<c1::utf8, c2::utf8, ".", rest::binary>>, :ip) when c1 in ?0..?9 and c2 in ?0..?9, do: accept({})
defp accept(<<"::ffff:",c1::utf8, c2::utf8, ".", rest::binary>>, :ip) when c1 in ?0..?9 and c2 in ?0..?9, do: accept({})
defp accept(<<c1::utf8, ".", rest::binary>>, :ip) when c1 in ?0..?9, do: accept({})
defp accept(<<"::ffff:", c1::utf8, ".", rest::binary>>, :ip) when c1 in ?0..?9, do: accept({})
defp to_number(term) do
{float_val, _} = :string.to_float(term)
{int_val, _} = :string.to_integer(term)
cond do
is_float(float_val) ->
float_val
is_integer(int_val) ->
int_val
true ->
raise ArgumentError, "Expected a number."
end
end
end
"""

View file

@ -49,7 +49,8 @@ defmodule Philomena.MixProject do
{:bcrypt_elixir, "~> 2.0"},
{:pot, "~> 0.10.1"},
{:secure_compare, "~> 0.1.0"},
{:elastix, "~> 0.7.1"}
{:elastix, "~> 0.7.1"},
{:nimble_parsec, "~> 0.5.1"},
]
end

View file

@ -22,6 +22,7 @@
"mime": {:hex, :mime, "1.3.1", "30ce04ab3175b6ad0bdce0035cba77bba68b813d523d1aac73d9781b4d193cf8", [:mix], [], "hexpm"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm"},
"neotoma": {:hex, :neotoma, "1.7.3", "d8bd5404b73273989946e4f4f6d529e5c2088f5fa1ca790b4dbe81f4be408e61", [:rebar], [], "hexpm"},
"nimble_parsec": {:hex, :nimble_parsec, "0.5.1", "c90796ecee0289dbb5ad16d3ad06f957b0cd1199769641c961cfe0b97db190e0", [:mix], [], "hexpm"},
"parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
"pbkdf2": {:hex, :pbkdf2, "2.0.0", "11c23279fded5c0027ab3996cfae77805521d7ef4babde2bd7ec04a9086cf499", [:rebar3], [], "hexpm"},
"phoenix": {:hex, :phoenix, "1.4.9", "746d098e10741c334d88143d3c94cab1756435f94387a63441792e66ec0ee974", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 1.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.8.1 or ~> 1.9", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm"},