mirror of
https://github.com/philomena-dev/philomena.git
synced 2025-01-19 22:27:59 +01:00
parser bundle
This commit is contained in:
parent
675a7de13b
commit
a0fa66628a
3 changed files with 243 additions and 202 deletions
|
@ -1,9 +1,21 @@
|
|||
defmodule Philomena.Images.Query do
|
||||
use Philomena.Search.Lexer,
|
||||
use Philomena.Search.Parser,
|
||||
int:
|
||||
~W(id width height comment_count score upvotes downvotes faves uploader_id faved_by_id tag_count),
|
||||
float: ~W(aspect_ratio wilson_score),
|
||||
date: ~W(created_at updated_at first_seen_at),
|
||||
literal: ~W(faved_by orig_sha512_hash sha512_hash uploader source_url original_format),
|
||||
ngram: ~W(description)
|
||||
ngram: ~W(description),
|
||||
custom: ~W(gallery_id),
|
||||
transforms: %{
|
||||
"gallery_id" => fn _ctx, value ->
|
||||
%{nested: %{path: :galleries, query: %{term: %{"galleries.id" => value}}}}
|
||||
end
|
||||
},
|
||||
aliases: %{
|
||||
"faved_by" => "favourited_by_users",
|
||||
"faved_by_id" => "favourited_by_user_ids"
|
||||
},
|
||||
default: "namespaced_tags.name",
|
||||
name: "anonymous"
|
||||
end
|
||||
|
|
|
@ -8,8 +8,9 @@ defmodule Philomena.Search.Lexer do
|
|||
int_fields = Keyword.get(opts, :int, []) |> Macro.expand(__CALLER__)
|
||||
ip_fields = Keyword.get(opts, :ip, []) |> Macro.expand(__CALLER__)
|
||||
custom_fields = Keyword.get(opts, :custom, []) |> Macro.expand(__CALLER__)
|
||||
lexer_name = :"#{Keyword.fetch!(opts, :name)}_lexer"
|
||||
|
||||
quote location: :keep do
|
||||
quote do
|
||||
import NimbleParsec
|
||||
import Philomena.Search.Helpers
|
||||
|
||||
|
@ -181,10 +182,10 @@ defmodule Philomena.Search.Lexer do
|
|||
|> reduce({List, :to_string, []})
|
||||
|
||||
# Combinator for an IP address with an optional CIDR prefix. The matched
# pieces are joined into one string and emitted tagged as `:ip`.
#
# Both address families are accepted again: the IPv6 alternative had been
# commented out, leaving only IPv4 while the label below still advertised
# IPv6 support.
ip_address =
  choice([
    ipv4_address |> optional(ipv4_prefix),
    ipv6_address |> optional(ipv6_prefix)
  ])
  |> reduce({Enum, :join, []})
  |> label("a valid IPv4 or IPv6 address and optional CIDR prefix")
  |> unwrap_and_tag(:ip)
|
||||
|
@ -459,7 +460,7 @@ defmodule Philomena.Search.Lexer do
|
|||
times(outer, min: 1)
|
||||
|> eos()
|
||||
|
||||
defparsec(:search, search)
|
||||
defparsec(unquote(lexer_name), search)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,210 +1,238 @@
|
|||
defmodule Philomena.Search.Parser do
|
||||
def parse(ctx, tokens) do
|
||||
{tree, []} = search_top(ctx, tokens)
|
||||
defmacro __using__(opts) do
|
||||
lexer_name = :"#{Keyword.fetch!(opts, :name)}_lexer"
|
||||
parser_name = :"#{Keyword.fetch!(opts, :name)}_parser"
|
||||
field_transforms = Keyword.get(opts, :transforms, %{})
|
||||
field_aliases = Keyword.get(opts, :aliases, %{})
|
||||
default_field = Keyword.fetch!(opts, :default)
|
||||
|
||||
{:ok, tree}
|
||||
rescue
|
||||
e in ArgumentError ->
|
||||
{:error, e.message}
|
||||
quote location: :keep do
|
||||
use Philomena.Search.Lexer, unquote(opts)
|
||||
|
||||
_ ->
|
||||
{:error, "Parsing error."}
|
||||
end
|
||||
|
||||
#
|
||||
# Predictive LL(k) parser for search grammar
|
||||
#
|
||||
defp search_top(ctx, tokens), do: search_or(ctx, tokens)
|
||||
|
||||
#
|
||||
# Boolean OR
|
||||
#
|
||||
|
||||
defp search_or(ctx, tokens) do
|
||||
case search_and(ctx, tokens) do
|
||||
{left, [{:or, _} | r_tokens]} ->
|
||||
{right, rest} = search_top(ctx, r_tokens)
|
||||
{%{bool: %{should: [left, right]}}, rest}
|
||||
|
||||
{child, rest} ->
|
||||
{child, rest}
|
||||
end
|
||||
end
|
||||
|
||||
#
|
||||
# Boolean AND
|
||||
#
|
||||
|
||||
defp search_and(ctx, tokens) do
|
||||
case search_boost(ctx, tokens) do
|
||||
{left, [{:and, _} | r_tokens]} ->
|
||||
{right, rest} = search_top(ctx, r_tokens)
|
||||
{%{bool: %{must: [left, right]}}, rest}
|
||||
|
||||
{child, rest} ->
|
||||
{child, rest}
|
||||
end
|
||||
end
|
||||
|
||||
#
|
||||
# Subquery score boosting
|
||||
#
|
||||
|
||||
defp search_boost(ctx, tokens) do
|
||||
case search_not(ctx, tokens) do
|
||||
{child, [{:boost, _}, {:number, value} | r_tokens]} ->
|
||||
{%{function_score: %{query: child, boost_factor: value}}, r_tokens}
|
||||
|
||||
{child, rest} ->
|
||||
{child, rest}
|
||||
end
|
||||
end
|
||||
|
||||
#
|
||||
# Boolean NOT
|
||||
#
|
||||
|
||||
defp search_not(ctx, [{:not, _} | r_tokens]) do
|
||||
{child, rest} = search_top(ctx, r_tokens)
|
||||
|
||||
{%{bool: %{must_not: child}}, rest}
|
||||
end
|
||||
|
||||
defp search_not(ctx, tokens), do: search_group(ctx, tokens)
|
||||
|
||||
#
|
||||
# Logical grouping
|
||||
#
|
||||
|
||||
defp search_group(ctx, [{:lparen, _} | rest]) do
|
||||
case search_top(ctx, rest) do
|
||||
{child, [{:rparen, _} | r_tokens]} ->
|
||||
{child, r_tokens}
|
||||
|
||||
_ ->
|
||||
raise ArgumentError, "Imbalanced parentheses."
|
||||
end
|
||||
end
|
||||
|
||||
defp search_group(_ctx, [{:rparen, _} | _rest]),
|
||||
do: raise(ArgumentError, "Imbalanced parentheses.")
|
||||
|
||||
defp search_group(ctx, tokens), do: search_fuzz(ctx, tokens)
|
||||
|
||||
#
|
||||
# Terms and term fuzzing
|
||||
#
|
||||
|
||||
defp search_fuzz(%{default_field: default_field} = ctx, tokens) do
|
||||
case tokens do
|
||||
[{:int_field, field}, {:eq, _}, {:int, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
|
||||
{%{range: %{field => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}}}, r_tokens}
|
||||
|
||||
[{:float_field, field}, {:eq, _}, {:float, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
|
||||
{%{range: %{field => %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}}}, r_tokens}
|
||||
|
||||
[{:literal_field, field}, {:eq, _}, {:text, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
|
||||
{%{fuzzy: %{field => %{value: value, fuzziness: fuzz}}}, r_tokens}
|
||||
|
||||
[{:ngram_field, field}, {:eq, _}, {:text, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
|
||||
{%{fuzzy: %{field => %{value: value, fuzziness: fuzz}}}, r_tokens}
|
||||
|
||||
[{:default, [text: value]}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
|
||||
{%{fuzzy: %{default_field => %{value: value, fuzziness: fuzz}}}, r_tokens}
|
||||
|
||||
_ ->
|
||||
search_range(ctx, tokens)
|
||||
end
|
||||
end
|
||||
|
||||
#
|
||||
# Range queries
|
||||
#
|
||||
|
||||
defp search_range(ctx, tokens) do
|
||||
case tokens do
|
||||
[{:int_field, field}, {range, _}, {:int, value} | r_tokens]
|
||||
when range in [:gt, :gte, :lt, :lte] ->
|
||||
{%{range: %{field => %{range => value}}}, r_tokens}
|
||||
|
||||
[{:float_field, field}, {range, _}, {:number, value} | r_tokens]
|
||||
when range in [:gt, :gte, :lt, :lte] ->
|
||||
{%{range: %{field => %{range => value}}}, r_tokens}
|
||||
|
||||
[{:date_field, field}, {range, _}, {:date, [lower, _higher]} | r_tokens]
|
||||
when range in [:gt, :gte, :lt, :lte] ->
|
||||
{%{range: %{field => %{range => lower}}}, r_tokens}
|
||||
|
||||
_ ->
|
||||
search_custom(ctx, tokens)
|
||||
end
|
||||
end
|
||||
|
||||
defp search_custom(ctx, tokens) do
|
||||
case tokens do
|
||||
[{:custom_field, field}, {:text, value} | r_tokens] ->
|
||||
{ctx[:field_transforms][field].(value), r_tokens}
|
||||
|
||||
_ ->
|
||||
search_term(ctx, tokens)
|
||||
end
|
||||
end
|
||||
|
||||
defp search_term(ctx, tokens) do
|
||||
case tokens do
|
||||
[{:date_field, field}, {:eq, _}, {:date, [lower, higher]} | r_tokens] ->
|
||||
{%{range: %{field => %{gte: lower, lte: higher}}}, r_tokens}
|
||||
|
||||
[{:ngram_field, field}, {:eq, _}, {:text, value} | r_tokens] ->
|
||||
value = process_term(value)
|
||||
|
||||
if contains_wildcard?(value) do
|
||||
{%{wildcard: %{field => unescape_wildcard(value)}}, r_tokens}
|
||||
def unquote(parser_name)(ctx, input) do
|
||||
with {:ok, tree, _1, _2, _3, _4} <- unquote(lexer_name)(input) do
|
||||
parse(ctx, tree)
|
||||
else
|
||||
{%{match: %{field => unescape_regular(value)}}, r_tokens}
|
||||
{:error, msg, _1, _2, _3, _4} ->
|
||||
{:error, msg}
|
||||
|
||||
{:error, msg} ->
|
||||
{:error, msg}
|
||||
end
|
||||
end
|
||||
|
||||
[{:literal_field, field}, {:eq, _}, {:text, value} | r_tokens] ->
|
||||
value = process_term(value)
|
||||
defp parse(ctx, tokens) do
|
||||
{tree, []} = search_top(ctx, tokens)
|
||||
|
||||
if contains_wildcard?(value) do
|
||||
{%{wildcard: %{field => unescape_wildcard(value)}}, r_tokens}
|
||||
else
|
||||
{%{term: %{field => unescape_regular(value)}}, r_tokens}
|
||||
{:ok, tree}
|
||||
rescue
|
||||
e in ArgumentError ->
|
||||
{:error, e.message}
|
||||
|
||||
_ ->
|
||||
{:error, "Parsing error."}
|
||||
end
|
||||
|
||||
#
|
||||
# Predictive LL(k) parser for search grammar
|
||||
#
|
||||
# Top-level grammar rule. Parsing always starts at the OR rule; the other
# rules call back into this one when they need to parse a full expression.
defp search_top(ctx, tokens) do
  search_or(ctx, tokens)
end
|
||||
|
||||
#
|
||||
# Boolean OR
|
||||
#
|
||||
|
||||
# Parses a disjunction.
#
# Parses one operand with search_and/2. If the next token is `:or`, the rest
# of the input is parsed as a full expression and both sides are combined
# into a `bool/should` query. Otherwise the operand is returned unchanged.
#
# Returns `{query, remaining_tokens}`.
defp search_or(ctx, tokens) do
  parsed = search_and(ctx, tokens)

  case parsed do
    {lhs, [{:or, _} | after_or]} ->
      {rhs, remaining} = search_top(ctx, after_or)
      {%{bool: %{should: [lhs, rhs]}}, remaining}

    {_query, _remaining} = unchanged ->
      unchanged
  end
end
|
||||
|
||||
[{_field_type, field}, {:eq, _}, {_value_type, value} | r_tokens] ->
|
||||
{%{term: %{field => value}}, r_tokens}
|
||||
#
|
||||
# Boolean AND
|
||||
#
|
||||
|
||||
[{:default, [text: value]} | r_tokens] ->
|
||||
value = process_term(value)
|
||||
# Parses a conjunction.
#
# Parses one operand with search_boost/2. If the next token is `:and`, the
# right-hand side is parsed and both sides are combined into a `bool/must`
# query. Otherwise the operand is returned unchanged.
#
# The right-hand side recurses into search_and/2, not search_top/2, so that
# AND binds tighter than OR: `a AND b OR c` yields `(a AND b) OR c`. Recursing
# into the top rule let the right operand swallow a trailing OR, producing
# `a AND (b OR c)`.
#
# Returns `{query, remaining_tokens}`.
defp search_and(ctx, tokens) do
  case search_boost(ctx, tokens) do
    {left, [{:and, _} | r_tokens]} ->
      {right, rest} = search_and(ctx, r_tokens)
      {%{bool: %{must: [left, right]}}, rest}

    {child, rest} ->
      {child, rest}
  end
end
|
||||
|
||||
_ ->
|
||||
raise ArgumentError, "Expected a term"
|
||||
#
|
||||
# Subquery score boosting
|
||||
#
|
||||
|
||||
# Parses an optionally boosted subquery.
#
# Parses one operand with search_not/2. If it is followed by a `:boost` token
# and a `:number` token, the operand is wrapped in a `function_score` query
# that carries the number as `boost_factor`. Otherwise the operand is
# returned unchanged.
#
# Returns `{query, remaining_tokens}`.
defp search_boost(ctx, tokens) do
  parsed = search_not(ctx, tokens)

  case parsed do
    {query, [{:boost, _}, {:number, factor} | remaining]} ->
      {%{function_score: %{query: query, boost_factor: factor}}, remaining}

    {_query, _remaining} = plain ->
      plain
  end
end
|
||||
|
||||
#
|
||||
# Boolean NOT
|
||||
#
|
||||
|
||||
# Parses an optionally negated operand.
#
# A leading `:not` token negates only the operand that immediately follows it
# (a term, a parenthesised group, or another negation) and wraps it in a
# `bool/must_not` query. Recursing into the top rule here made the negation
# swallow the rest of the expression, so `NOT a AND b` was parsed as
# `NOT (a AND b)`.
#
# Returns `{query, remaining_tokens}`.
defp search_not(ctx, [{:not, _} | r_tokens]) do
  {child, rest} = search_not(ctx, r_tokens)

  {%{bool: %{must_not: child}}, rest}
end

# No negation: fall through to grouping.
defp search_not(ctx, tokens), do: search_group(ctx, tokens)
|
||||
|
||||
#
|
||||
# Logical grouping
|
||||
#
|
||||
|
||||
# Parses a parenthesised group.
#
# * An `:lparen` token starts a group. The contents are parsed as a full
#   expression and must be followed by a matching `:rparen`.
# * An `:rparen` token where an operand is expected is an error.
# * Any other input is handed to search_fuzz/2.
#
# Raises ArgumentError when parentheses do not balance.
# Returns `{query, remaining_tokens}`.
defp search_group(ctx, tokens) do
  case tokens do
    [{:lparen, _} | inner] ->
      case search_top(ctx, inner) do
        {child, [{:rparen, _} | after_group]} ->
          {child, after_group}

        _ ->
          raise ArgumentError, "Imbalanced parentheses."
      end

    [{:rparen, _} | _] ->
      raise(ArgumentError, "Imbalanced parentheses.")

    _ ->
      search_fuzz(ctx, tokens)
  end
end
|
||||
|
||||
#
|
||||
# Terms and term fuzzing
|
||||
#
|
||||
|
||||
# Parses a fuzzy term (`field:value~fuzz`).
#
# * Integer fields become a `range` query over `value ± fuzz`, with both
#   bounds truncated to integers.
# * Float fields become a `range` query over `value ± fuzz`. The bounds are
#   kept as floats: truncating them (as for integers) collapsed e.g.
#   `1.5~0.2` to the range 1..1, which excludes 1.5 itself.
# * Literal, ngram and default-field text becomes a `fuzzy` query with the
#   given fuzziness.
#
# Field names are passed through try_alias/1. Input that is not a fuzzy term
# is handed to search_range/2.
#
# Returns `{query, remaining_tokens}`.
defp search_fuzz(ctx, tokens) do
  case tokens do
    [{:int_field, field}, {:eq, _}, {:int, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
      bounds = %{gte: trunc(value - fuzz), lte: trunc(value + fuzz)}
      {%{range: %{try_alias(field) => bounds}}, r_tokens}

    [{:float_field, field}, {:eq, _}, {:float, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
      bounds = %{gte: value - fuzz, lte: value + fuzz}
      {%{range: %{try_alias(field) => bounds}}, r_tokens}

    [{:literal_field, field}, {:eq, _}, {:text, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
      {%{fuzzy: %{try_alias(field) => %{value: value, fuzziness: fuzz}}}, r_tokens}

    [{:ngram_field, field}, {:eq, _}, {:text, value}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
      {%{fuzzy: %{try_alias(field) => %{value: value, fuzziness: fuzz}}}, r_tokens}

    [{:default, [text: value]}, {:fuzz, _}, {:number, fuzz} | r_tokens] ->
      {%{fuzzy: %{unquote(default_field) => %{value: value, fuzziness: fuzz}}}, r_tokens}

    _ ->
      search_range(ctx, tokens)
  end
end
|
||||
|
||||
#
|
||||
# Range queries
|
||||
#
|
||||
|
||||
# Parses a range comparison (`field.gt:value`, `.gte`, `.lt`, `.lte`).
#
# Integer and float fields compare directly against the given value.
#
# A date value arrives as a `[lower, higher]` pair. The bound is chosen so the
# comparison respects the whole interval: `gt` and `lte` compare against
# `higher`, `gte` and `lt` against `lower`. Always using `lower` made
# `gt` match values inside the interval and `lte` exclude most of it.
#
# Field names are passed through try_alias/1. Input that is not a range
# comparison is handed to search_custom/2.
#
# Returns `{query, remaining_tokens}`.
defp search_range(ctx, tokens) do
  case tokens do
    [{:int_field, field}, {range, _}, {:int, value} | r_tokens]
    when range in [:gt, :gte, :lt, :lte] ->
      {%{range: %{try_alias(field) => %{range => value}}}, r_tokens}

    [{:float_field, field}, {range, _}, {:number, value} | r_tokens]
    when range in [:gt, :gte, :lt, :lte] ->
      {%{range: %{try_alias(field) => %{range => value}}}, r_tokens}

    [{:date_field, field}, {range, _}, {:date, [lower, higher]} | r_tokens]
    when range in [:gt, :gte, :lt, :lte] ->
      # "after" and "up to and including" are relative to the end of the
      # interval; "from" and "before" are relative to its start.
      bound = if range in [:gt, :lte], do: higher, else: lower

      {%{range: %{try_alias(field) => %{range => bound}}}, r_tokens}

    _ ->
      search_custom(ctx, tokens)
  end
end
|
||||
|
||||
# Parses a custom field.
#
# A `:custom_field` token followed by a `:text` token is resolved by looking
# up the field's transform in the `:transforms` option and calling it with
# the context and the raw text. The transform's return value is used as the
# query.
#
# Returns `{query, remaining_tokens}`.
defp search_custom(ctx, [{:custom_field, name}, {:text, raw} | remaining]) do
  transform = unquote(field_transforms)[name]

  {transform.(ctx, raw), remaining}
end

# Not a custom field: fall through to plain terms.
defp search_custom(ctx, tokens), do: search_term(ctx, tokens)
|
||||
|
||||
# Parses a single term, the innermost rule of the grammar.
#
# * Date equality becomes a `range` query spanning the parsed date interval.
# * Ngram text becomes a `wildcard` query if it contains an unescaped `*` or
#   `?`, otherwise a `match` query.
# * Literal text becomes a `wildcard` or `term` query by the same test.
# * Any other `field:value` pair becomes a `term` query on the raw value.
# * Bare text is searched in the default field as `wildcard` or `term`.
#
# Text values are trimmed and downcased before use, and field names are
# passed through try_alias/1.
#
# Raises ArgumentError when the tokens do not start with a term.
# Returns `{query, remaining_tokens}`.
defp search_term(_ctx, tokens) do
  case tokens do
    [{:date_field, name}, {:eq, _}, {:date, [from, to]} | remaining] ->
      {%{range: %{try_alias(name) => %{gte: from, lte: to}}}, remaining}

    [{:ngram_field, name}, {:eq, _}, {:text, raw} | remaining] ->
      term = process_term(raw)

      query =
        case contains_wildcard?(term) do
          true -> %{wildcard: %{try_alias(name) => unescape_wildcard(term)}}
          false -> %{match: %{try_alias(name) => unescape_regular(term)}}
        end

      {query, remaining}

    [{:literal_field, name}, {:eq, _}, {:text, raw} | remaining] ->
      term = process_term(raw)

      query =
        case contains_wildcard?(term) do
          true -> %{wildcard: %{try_alias(name) => unescape_wildcard(term)}}
          false -> %{term: %{try_alias(name) => unescape_regular(term)}}
        end

      {query, remaining}

    [{_field_type, name}, {:eq, _}, {_value_type, value} | remaining] ->
      {%{term: %{try_alias(name) => value}}, remaining}

    [{:default, [text: raw]} | remaining] ->
      term = process_term(raw)

      query =
        case contains_wildcard?(term) do
          true -> %{wildcard: %{unquote(default_field) => unescape_wildcard(term)}}
          false -> %{term: %{unquote(default_field) => unescape_regular(term)}}
        end

      {query, remaining}

    _ ->
      raise ArgumentError, "Expected a term"
  end
end
|
||||
|
||||
# Returns true when `value` contains a `*` or `?` that is not escaped, i.e.
# one preceded by an even number (possibly zero) of backslashes.
defp contains_wildcard?(value) do
  Regex.match?(~r/(?<!\\)(?:\\\\)*[\*\?]/, value)
end
|
||||
|
||||
# Strips escaping backslashes in front of ordinary characters for use in a
# wildcard query. Backslashes in front of `*`, `?` and `\` are left in place,
# because those characters are significant to the wildcard syntax.
defp unescape_wildcard(value) do
  String.replace(value, ~r/(?<!\\)(?:\\)*([^\\\*\?])/, "\\1")
end
|
||||
|
||||
# Strips escaping backslashes for use in a non-wildcard query: each run of
# backslashes followed by a character is replaced by that character alone.
defp unescape_regular(value) do
  String.replace(value, ~r/(?<!\\)(?:\\)*(.)/, "\\1")
end
|
||||
|
||||
# Normalises a raw search term: surrounding whitespace is removed and the
# result is lowercased.
defp process_term(term) do
  trimmed = String.trim(term)
  String.downcase(trimmed)
end
|
||||
|
||||
# Resolves a field name through the `:aliases` option. Returns the aliased
# name when the lookup yields a truthy value, otherwise the field name as
# given.
defp try_alias(field) do
  case unquote(field_aliases)[field] do
    missing when missing in [nil, false] -> field
    target -> target
  end
end
|
||||
end
|
||||
end
|
||||
|
||||
defp contains_wildcard?(value) do
|
||||
String.match?(value, ~r/(?<!\\)(?:\\\\)*[\*\?]/)
|
||||
end
|
||||
|
||||
defp unescape_wildcard(value) do
|
||||
# '*' and '?' are wildcard characters in the right context;
|
||||
# don't unescape them.
|
||||
Regex.replace(~r/(?<!\\)(?:\\)*([^\\\*\?])/, value, "\\1")
|
||||
end
|
||||
|
||||
defp unescape_regular(value) do
|
||||
Regex.replace(~r/(?<!\\)(?:\\)*(.)/, value, "\\1")
|
||||
end
|
||||
|
||||
defp process_term(term) do
|
||||
term |> String.trim() |> String.downcase()
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue