finished parser bundle

This commit is contained in:
Liam P. White 2019-08-27 19:37:26 -04:00
parent 9239418d4b
commit 8cb0cbc244
4 changed files with 145 additions and 82 deletions

View file

@@ -1,5 +1,7 @@
defmodule Philomena.Images.Query do defmodule Philomena.Images.Query do
use Philomena.Search.Parser, import Philomena.Search.Parser
defparser "anonymous",
int: int:
~W(id width height comment_count score upvotes downvotes faves uploader_id faved_by_id tag_count), ~W(id width height comment_count score upvotes downvotes faves uploader_id faved_by_id tag_count),
float: ~W(aspect_ratio wilson_score), float: ~W(aspect_ratio wilson_score),
@@ -16,6 +18,66 @@ defmodule Philomena.Images.Query do
"faved_by" => "favourited_by_users", "faved_by" => "favourited_by_users",
"faved_by_id" => "favourited_by_user_ids" "faved_by_id" => "favourited_by_user_ids"
}, },
default: "namespaced_tags.name", default: "namespaced_tags.name"
name: "anonymous"
# Search grammar for logged-in (non-staff) users. The defparser/2 macro
# (Philomena.Search.Parser) generates user_parser/2 and its helper clauses
# from this field specification.
defparser "user",
# integer-typed indexed fields
int:
~W(id width height comment_count score upvotes downvotes faves uploader_id faved_by_id tag_count),
float: ~W(aspect_ratio wilson_score),
date: ~W(created_at updated_at first_seen_at),
# exact-match (term/keyword) fields
literal: ~W(faved_by orig_sha512_hash sha512_hash uploader source_url original_format),
# full-text (ngram-analyzed) fields
ngram: ~W(description),
# fields resolved by the `transforms` callbacks below
custom: ~W(gallery_id my),
transforms: %{
# gallery_id:<n> -> nested query against the galleries association
"gallery_id" => fn _ctx, value ->
%{nested: %{path: :galleries, query: %{term: %{"galleries.id" => value}}}}
end,
# my:<subject> resolves against the current user carried in ctx
"my" => fn
%{user: %{id: id}}, "faves" -> %{term: %{favourited_by_user_ids: id}}
%{user: %{id: id}}, "upvotes" -> %{term: %{upvoter_ids: id}}
%{user: %{id: id}}, "downvotes" -> %{term: %{downvoter_ids: id}}
# my:watched is not implemented yet and currently matches everything
%{user: _u}, "watched" ->
%{query: %{match_all: %{}}} # todo
end
},
# user-facing field name => indexed field name
aliases: %{
"faved_by" => "favourited_by_users",
"faved_by_id" => "favourited_by_user_ids"
},
# field searched when a bare term is given with no field prefix
default: "namespaced_tags.name"
# Search grammar for moderators; generates moderator_parser/2 and friends via
# the defparser/2 macro (Philomena.Search.Parser). Superset of the user
# grammar: adds voter, fingerprint, IP, and deletion fields that only staff
# may query.
defparser "moderator",
  # integer-typed indexed fields
  # NOTE(review): fixed typo "deleted_by_user-id" -> "deleted_by_user_id";
  # the hyphen was inconsistent with every other field name here (cf. the
  # literal field deleted_by_user and ids like hidden_by_id).
  int:
    ~W(id width height comment_count score upvotes downvotes faves uploader_id faved_by_id upvoted_by_id downvoted_by_id tag_count true_uploader_id hidden_by_id deleted_by_user_id),
  float: ~W(aspect_ratio wilson_score),
  date: ~W(created_at updated_at first_seen_at),
  # exact-match (term/keyword) fields
  literal:
    ~W(faved_by orig_sha512_hash sha512_hash uploader source_url original_format fingerprint upvoted_by downvoted_by true_uploader hidden_by deleted_by_user),
  # full-text (ngram-analyzed) fields
  ngram: ~W(description deletion_reason),
  ip: ~W(ip),
  bool: ~W(deleted),
  # fields resolved by the `transforms` callbacks below
  custom: ~W(gallery_id my),
  transforms: %{
    # gallery_id:<n> -> nested query against the galleries association
    "gallery_id" => fn _ctx, value ->
      %{nested: %{path: :galleries, query: %{term: %{"galleries.id" => value}}}}
    end,
    # my:<subject> resolves against the current user carried in ctx
    "my" => fn
      %{user: %{id: id}}, "faves" -> %{term: %{favourited_by_user_ids: id}}
      %{user: %{id: id}}, "upvotes" -> %{term: %{upvoter_ids: id}}
      %{user: %{id: id}}, "downvotes" -> %{term: %{downvoter_ids: id}}
      # TODO (from source): my:watched is unimplemented and matches everything
      %{user: _u}, "watched" -> %{query: %{match_all: %{}}}
    end
  },
  # user-facing field name => indexed field name
  aliases: %{
    "faved_by" => "favourited_by_users",
    "upvoted_by" => "upvoters",
    "downvoted_by" => "downvoters",
    "faved_by_id" => "favourited_by_user_ids",
    "upvoted_by_id" => "upvoter_ids",
    "downvoted_by_id" => "downvoter_ids",
    "hidden_by" => "hidden_by_users",
    "hidden_by_id" => "hidden_by_user_ids",
    "deleted" => "hidden_from_users"
  },
  # field searched when a bare term is given with no field prefix
  default: "namespaced_tags.name"
end end

View file

@@ -125,4 +125,22 @@ defmodule Philomena.Search.Helpers do
def full_choice(combinator, choices) do def full_choice(combinator, choices) do
choice(combinator, choices) choice(combinator, choices)
end end
# True when `value` contains a '*' or '?' wildcard that is not escaped —
# i.e. one preceded by an even number (possibly zero) of backslashes.
def contains_wildcard?(value) do
  value =~ ~r/(?<!\\)(?:\\\\)*[\*\?]/
end
# Strips backslash escapes from `value` while leaving '*' and '?' alone:
# they are wildcard characters in the right context, so an escaped wildcard
# (e.g. "\\*") must stay escaped.
def unescape_wildcard(value) do
  unescape = ~r/(?<!\\)(?:\\)*([^\\\*\?])/
  Regex.replace(unescape, value, "\\1")
end
# Strips backslash escapes from `value`. Unlike unescape_wildcard/1 there is
# no wildcard context here, so every escape collapses to its bare character.
def unescape_regular(value) do
  ~r/(?<!\\)(?:\\)*(.)/
  |> Regex.replace(value, "\\1")
end
# Normalizes a raw search term: trims surrounding whitespace, then lowercases.
def process_term(term) do
  String.downcase(String.trim(term))
end
end end

View file

@@ -1,5 +1,5 @@
defmodule Philomena.Search.Lexer do defmodule Philomena.Search.Lexer do
defmacro __using__(opts) do defmacro deflexer(name, opts) do
literal_fields = Keyword.get(opts, :literal, []) |> Macro.expand(__CALLER__) literal_fields = Keyword.get(opts, :literal, []) |> Macro.expand(__CALLER__)
ngram_fields = Keyword.get(opts, :ngram, []) |> Macro.expand(__CALLER__) ngram_fields = Keyword.get(opts, :ngram, []) |> Macro.expand(__CALLER__)
bool_fields = Keyword.get(opts, :bool, []) |> Macro.expand(__CALLER__) bool_fields = Keyword.get(opts, :bool, []) |> Macro.expand(__CALLER__)
@ -8,9 +8,8 @@ defmodule Philomena.Search.Lexer do
int_fields = Keyword.get(opts, :int, []) |> Macro.expand(__CALLER__) int_fields = Keyword.get(opts, :int, []) |> Macro.expand(__CALLER__)
ip_fields = Keyword.get(opts, :ip, []) |> Macro.expand(__CALLER__) ip_fields = Keyword.get(opts, :ip, []) |> Macro.expand(__CALLER__)
custom_fields = Keyword.get(opts, :custom, []) |> Macro.expand(__CALLER__) custom_fields = Keyword.get(opts, :custom, []) |> Macro.expand(__CALLER__)
lexer_name = :"#{Keyword.fetch!(opts, :name)}_lexer"
quote do quote location: :keep do
import NimbleParsec import NimbleParsec
import Philomena.Search.Helpers import Philomena.Search.Helpers
@ -342,29 +341,30 @@ defmodule Philomena.Search.Lexer do
quoted_numeric = ignore(quot) |> concat(numeric) |> ignore(quot) quoted_numeric = ignore(quot) |> concat(numeric) |> ignore(quot)
stop_words = stop_words =
choice([ repeat(space)
|> choice([
string("\\") |> eos(), string("\\") |> eos(),
string(","), string(","),
concat(space, l_and), l_and,
concat(space, l_or), l_or,
rparen, rparen,
fuzz, fuzz,
boost boost
]) ])
defcombinatorp( defcombinatorp(
:text, unquote(:"#{name}_text"),
lookahead_not(stop_words) lookahead_not(stop_words)
|> choice([ |> choice([
string("\\") |> utf8_char([]), string("\\") |> utf8_char([]),
string("(") |> parsec(:text) |> string(")"), string("(") |> parsec(unquote(:"#{name}_text")) |> string(")"),
utf8_char([]) utf8_char([])
]) ])
|> times(min: 1) |> times(min: 1)
) )
text = text =
parsec(:text) parsec(unquote(:"#{name}_text"))
|> reduce({List, :to_string, []}) |> reduce({List, :to_string, []})
|> unwrap_and_tag(:text) |> unwrap_and_tag(:text)
@ -462,7 +462,7 @@ defmodule Philomena.Search.Lexer do
times(outer, min: 1) times(outer, min: 1)
|> eos() |> eos()
defparsec(unquote(lexer_name), search) defparsec(unquote(:"#{name}_lexer"), search)
end end
end end
end end

View file

@@ -1,49 +1,50 @@
defmodule Philomena.Search.Parser do defmodule Philomena.Search.Parser do
defmacro __using__(opts) do defmacro defparser(name, opts) do
lexer_name = :"#{Keyword.fetch!(opts, :name)}_lexer"
parser_name = :"#{Keyword.fetch!(opts, :name)}_parser"
field_transforms = Keyword.get(opts, :transforms, %{}) field_transforms = Keyword.get(opts, :transforms, %{})
field_aliases = Keyword.get(opts, :aliases, %{}) field_aliases = Keyword.get(opts, :aliases, %{})
default_field = Keyword.fetch!(opts, :default) default_field = Keyword.fetch!(opts, :default)
quote location: :keep do quote location: :keep do
use Philomena.Search.Lexer, unquote(opts) import Philomena.Search.Lexer
import Philomena.Search.Helpers
def unquote(parser_name)(ctx, input) do deflexer unquote(name), unquote(opts)
with {:ok, tree, _1, _2, _3, _4} <- unquote(lexer_name)(input) do
parse(ctx, tree) def unquote(:"#{name}_parser")(ctx, input) do
with {:ok, tree, _1, _2, _3, _4} <- unquote(:"#{name}_lexer")(input) do
unquote(:"#{name}_parse")(ctx, tree)
else else
{:error, msg, _1, _2, _3, _4} -> {:error, msg, _1, _2, _3, _4} ->
{:error, msg} {:error, msg}
end end
end end
defp parse(ctx, tokens) do defp unquote(:"#{name}_parse")(ctx, tokens) do
{tree, []} = search_top(ctx, tokens) {tree, []} = unquote(:"#{name}_top")(ctx, tokens)
{:ok, tree} {:ok, tree}
rescue #rescue
e in ArgumentError -> # e in ArgumentError ->
{:error, e.message} # {:error, e.message}
_ -> # _ ->
{:error, "Parsing error."} # {:error, "Parsing error."}
end end
# #
# Predictive LL(k) parser for search grammar # Predictive LL(k) parser for search grammar
# #
defp search_top(ctx, tokens), do: search_or(ctx, tokens) defp unquote(:"#{name}_top")(ctx, tokens), do: unquote(:"#{name}_or")(ctx, tokens)
# #
# Boolean OR # Boolean OR
# #
defp search_or(ctx, tokens) do defp unquote(:"#{name}_or")(ctx, tokens) do
case search_and(ctx, tokens) do case unquote(:"#{name}_and")(ctx, tokens) do
{left, [{:or, _} | r_tokens]} -> {left, [{:or, _} | r_tokens]} ->
{right, rest} = search_or(ctx, r_tokens) {right, rest} = unquote(:"#{name}_or")(ctx, r_tokens)
{%{bool: %{should: [left, right]}}, rest} {%{bool: %{should: [left, right]}}, rest}
{child, rest} -> {child, rest} ->
@ -55,10 +56,10 @@ defmodule Philomena.Search.Parser do
# Boolean AND # Boolean AND
# #
defp search_and(ctx, tokens) do defp unquote(:"#{name}_and")(ctx, tokens) do
case search_boost(ctx, tokens) do case unquote(:"#{name}_boost")(ctx, tokens) do
{left, [{:and, _} | r_tokens]} -> {left, [{:and, _} | r_tokens]} ->
{right, rest} = search_and(ctx, r_tokens) {right, rest} = unquote(:"#{name}_and")(ctx, r_tokens)
{%{bool: %{must: [left, right]}}, rest} {%{bool: %{must: [left, right]}}, rest}
{child, rest} -> {child, rest} ->
@ -70,8 +71,8 @@ defmodule Philomena.Search.Parser do
# Subquery score boosting # Subquery score boosting
# #
defp search_boost(ctx, tokens) do defp unquote(:"#{name}_boost")(ctx, tokens) do
case search_not(ctx, tokens) do case unquote(:"#{name}_not")(ctx, tokens) do
{child, [{:boost, _}, {:number, value} | r_tokens]} -> {child, [{:boost, _}, {:number, value} | r_tokens]} ->
{%{function_score: %{query: child, boost_factor: value}}, r_tokens} {%{function_score: %{query: child, boost_factor: value}}, r_tokens}
@ -84,20 +85,20 @@ defmodule Philomena.Search.Parser do
# Boolean NOT # Boolean NOT
# #
defp search_not(ctx, [{:not, _} | r_tokens]) do defp unquote(:"#{name}_not")(ctx, [{:not, _} | r_tokens]) do
{child, rest} = search_top(ctx, r_tokens) {child, rest} = unquote(:"#{name}_not")(ctx, r_tokens)
{%{bool: %{must_not: child}}, rest} {%{bool: %{must_not: child}}, rest}
end end
defp search_not(ctx, tokens), do: search_group(ctx, tokens) defp unquote(:"#{name}_not")(ctx, tokens), do: unquote(:"#{name}_group")(ctx, tokens)
# #
# Logical grouping # Logical grouping
# #
defp search_group(ctx, [{:lparen, _} | rest]) do defp unquote(:"#{name}_group")(ctx, [{:lparen, _} | rest]) do
case search_top(ctx, rest) do case unquote(:"#{name}_top")(ctx, rest) do
{child, [{:rparen, _} | r_tokens]} -> {child, [{:rparen, _} | r_tokens]} ->
{child, r_tokens} {child, r_tokens}
@ -106,20 +107,20 @@ defmodule Philomena.Search.Parser do
end end
end end
defp search_group(_ctx, [{:rparen, _} | _rest]), defp unquote(:"#{name}_group")(_ctx, [{:rparen, _} | _rest]),
do: raise(ArgumentError, "Imbalanced parentheses.") do: raise(ArgumentError, "Imbalanced parentheses.")
defp search_group(ctx, tokens), do: search_fuzz(ctx, tokens) defp unquote(:"#{name}_group")(ctx, tokens), do: unquote(:"#{name}_fuzz")(ctx, tokens)
# #
# Terms and term fuzzing # Terms and term fuzzing
# #
defp search_fuzz(ctx, tokens) do defp unquote(:"#{name}_fuzz")(ctx, tokens) do
case tokens do case tokens do
[{:int_field, field}, {:eq, _}, {:int, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> [{:int_field, field}, {:eq, _}, {:int, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
{%{ {%{
range: %{try_alias(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}} range: %{unquote(:"#{name}_alias")(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}}
}, r_tokens} }, r_tokens}
[ [
@ -130,7 +131,7 @@ defmodule Philomena.Search.Parser do
{:number, fuzz} | r_tokens {:number, fuzz} | r_tokens
] -> ] ->
{%{ {%{
range: %{try_alias(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}} range: %{unquote(:"#{name}_alias")(field) => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}}
}, r_tokens} }, r_tokens}
[ [
@ -140,7 +141,7 @@ defmodule Philomena.Search.Parser do
{:fuzz, _}, {:fuzz, _},
{:number, fuzz} | r_tokens {:number, fuzz} | r_tokens
] -> ] ->
{%{fuzzy: %{try_alias(field) => %{value: value, fuzziness: fuzz}}}, r_tokens} {%{fuzzy: %{unquote(:"#{name}_alias")(field) => %{value: value, fuzziness: fuzz}}}, r_tokens}
[ [
{:ngram_field, field}, {:ngram_field, field},
@ -149,13 +150,13 @@ defmodule Philomena.Search.Parser do
{:fuzz, _}, {:fuzz, _},
{:number, fuzz} | r_tokens {:number, fuzz} | r_tokens
] -> ] ->
{%{fuzzy: %{try_alias(field) => %{value: value, fuzziness: fuzz}}}, r_tokens} {%{fuzzy: %{unquote(:"#{name}_alias")(field) => %{value: value, fuzziness: fuzz}}}, r_tokens}
[{:default, [text: value]}, {:fuzz, _}, {:number, fuzz} | r_tokens] -> [{:default, [text: value]}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
{%{fuzzy: %{unquote(default_field) => %{value: value, fuzziness: fuzz}}}, r_tokens} {%{fuzzy: %{unquote(default_field) => %{value: value, fuzziness: fuzz}}}, r_tokens}
_ -> _ ->
search_range(ctx, tokens) unquote(:"#{name}_range")(ctx, tokens)
end end
end end
@ -163,60 +164,60 @@ defmodule Philomena.Search.Parser do
# Range queries # Range queries
# #
defp search_range(ctx, tokens) do defp unquote(:"#{name}_range")(ctx, tokens) do
case tokens do case tokens do
[{:int_field, field}, {range, _}, {:int, value} | r_tokens] [{:int_field, field}, {range, _}, {:int, value} | r_tokens]
when range in [:gt, :gte, :lt, :lte] -> when range in [:gt, :gte, :lt, :lte] ->
{%{range: %{try_alias(field) => %{range => value}}}, r_tokens} {%{range: %{unquote(:"#{name}_alias")(field) => %{range => value}}}, r_tokens}
[{:float_field, field}, {range, _}, {:number, value} | r_tokens] [{:float_field, field}, {range, _}, {:number, value} | r_tokens]
when range in [:gt, :gte, :lt, :lte] -> when range in [:gt, :gte, :lt, :lte] ->
{%{range: %{try_alias(field) => %{range => value}}}, r_tokens} {%{range: %{unquote(:"#{name}_alias")(field) => %{range => value}}}, r_tokens}
[{:date_field, field}, {range, _}, {:date, [lower, _higher]} | r_tokens] [{:date_field, field}, {range, _}, {:date, [lower, _higher]} | r_tokens]
when range in [:gt, :gte, :lt, :lte] -> when range in [:gt, :gte, :lt, :lte] ->
{%{range: %{try_alias(field) => %{range => lower}}}, r_tokens} {%{range: %{unquote(:"#{name}_alias")(field) => %{range => lower}}}, r_tokens}
_ -> _ ->
search_custom(ctx, tokens) unquote(:"#{name}_custom")(ctx, tokens)
end end
end end
defp search_custom(ctx, tokens) do defp unquote(:"#{name}_custom")(ctx, tokens) do
case tokens do case tokens do
[{:custom_field, field}, {:text, value} | r_tokens] -> [{:custom_field, field}, {:text, value} | r_tokens] ->
{unquote(field_transforms)[field].(ctx, value), r_tokens} {unquote(field_transforms)[field].(ctx, value), r_tokens}
_ -> _ ->
search_term(ctx, tokens) unquote(:"#{name}_term")(ctx, tokens)
end end
end end
defp search_term(_ctx, tokens) do defp unquote(:"#{name}_term")(_ctx, tokens) do
case tokens do case tokens do
[{:date_field, field}, {:eq, _}, {:date, [lower, higher]} | r_tokens] -> [{:date_field, field}, {:eq, _}, {:date, [lower, higher]} | r_tokens] ->
{%{range: %{try_alias(field) => %{gte: lower, lte: higher}}}, r_tokens} {%{range: %{unquote(:"#{name}_alias")(field) => %{gte: lower, lte: higher}}}, r_tokens}
[{:ngram_field, field}, {:eq, _}, {:text, value} | r_tokens] -> [{:ngram_field, field}, {:eq, _}, {:text, value} | r_tokens] ->
value = process_term(value) value = process_term(value)
if contains_wildcard?(value) do if contains_wildcard?(value) do
{%{wildcard: %{try_alias(field) => unescape_wildcard(value)}}, r_tokens} {%{wildcard: %{unquote(:"#{name}_alias")(field) => unescape_wildcard(value)}}, r_tokens}
else else
{%{match: %{try_alias(field) => unescape_regular(value)}}, r_tokens} {%{match: %{unquote(:"#{name}_alias")(field) => unescape_regular(value)}}, r_tokens}
end end
[{:literal_field, field}, {:eq, _}, {:text, value} | r_tokens] -> [{:literal_field, field}, {:eq, _}, {:text, value} | r_tokens] ->
value = process_term(value) value = process_term(value)
if contains_wildcard?(value) do if contains_wildcard?(value) do
{%{wildcard: %{try_alias(field) => unescape_wildcard(value)}}, r_tokens} {%{wildcard: %{unquote(:"#{name}_alias")(field) => unescape_wildcard(value)}}, r_tokens}
else else
{%{term: %{try_alias(field) => unescape_regular(value)}}, r_tokens} {%{term: %{unquote(:"#{name}_alias")(field) => unescape_regular(value)}}, r_tokens}
end end
[{_field_type, field}, {:eq, _}, {_value_type, value} | r_tokens] -> [{_field_type, field}, {:eq, _}, {_value_type, value} | r_tokens] ->
{%{term: %{try_alias(field) => value}}, r_tokens} {%{term: %{unquote(:"#{name}_alias")(field) => value}}, r_tokens}
[{:default, [text: value]} | r_tokens] -> [{:default, [text: value]} | r_tokens] ->
value = process_term(value) value = process_term(value)
@ -232,25 +233,7 @@ defmodule Philomena.Search.Parser do
end end
end end
defp contains_wildcard?(value) do defp unquote(:"#{name}_alias")(field) do
String.match?(value, ~r/(?<!\\)(?:\\\\)*[\*\?]/)
end
defp unescape_wildcard(value) do
# '*' and '?' are wildcard characters in the right context;
# don't unescape them.
Regex.replace(~r/(?<!\\)(?:\\)*([^\\\*\?])/, value, "\\1")
end
defp unescape_regular(value) do
Regex.replace(~r/(?<!\\)(?:\\)*(.)/, value, "\\1")
end
defp process_term(term) do
term |> String.trim() |> String.downcase()
end
defp try_alias(field) do
unquote(field_aliases)[field] || field unquote(field_aliases)[field] || field
end end
end end