mirror of
https://github.com/philomena-dev/philomena.git
synced 2025-02-15 01:54:24 +01:00
468 lines
14 KiB
Elixir
468 lines
14 KiB
Elixir
defmodule Philomena.Search.Lexer do
  @moduledoc """
  Compile-time generator for search-query lexers built on NimbleParsec.

  `deflexer/2` expands, inside the calling module, into a set of NimbleParsec
  combinators and a parser function that tokenizes a search string into tagged
  tokens (boolean operators, parentheses, boost/fuzz modifiers and field terms).
  """

  @doc """
  Defines a lexer named `<name>_lexer` in the calling module.

  `name` is interpolated into atoms at expansion time: the macro defines a
  private recursive combinator `<name>_text` and, through
  `NimbleParsec.defparsec/2`, the parser `<name>_lexer`.

  `opts` is a keyword list; every key is optional and defaults to `[]`. Each
  value is expanded in the caller's environment and is then used as a list of
  field names to match literally:

    * `:literal` - fields followed by `:`/`.eq:` and free text
    * `:ngram`   - fields followed by `:`/`.eq:` and free text
    * `:bool`    - fields followed by `:`/`.eq:` and `true`/`false`
    * `:date`    - fields followed by a range relation and a date
    * `:float`   - fields followed by a range relation and a real number
    * `:int`     - fields followed by a range relation and an integer
    * `:ip`      - fields followed by `:`/`.eq:` and an IPv4 address/CIDR
    * `:custom`  - fields followed by `:` and free text

  The generated code imports `NimbleParsec` and `Philomena.Search.Helpers`.
  It calls `full_choice`, `to_number`, `absolute_datetime` and
  `relative_datetime`, none of which are defined here; presumably they are
  provided by `Philomena.Search.Helpers` (confirm against that module).
  """
  defmacro deflexer(name, opts) do
    literal_fields = expand_fields(opts, :literal, __CALLER__)
    ngram_fields = expand_fields(opts, :ngram, __CALLER__)
    bool_fields = expand_fields(opts, :bool, __CALLER__)
    date_fields = expand_fields(opts, :date, __CALLER__)
    float_fields = expand_fields(opts, :float, __CALLER__)
    int_fields = expand_fields(opts, :int, __CALLER__)
    ip_fields = expand_fields(opts, :ip, __CALLER__)
    custom_fields = expand_fields(opts, :custom, __CALLER__)

    quote location: :keep do
      import NimbleParsec
      import Philomena.Search.Helpers

      # ---------------------------------------------------------------------
      # Operators and structural tokens
      # ---------------------------------------------------------------------

      l_and =
        choice([string("AND"), string("&&"), string(",")])
        |> unwrap_and_tag(:and)

      l_or =
        choice([string("OR"), string("||")])
        |> unwrap_and_tag(:or)

      l_not =
        choice([string("NOT"), string("!"), string("-")])
        |> unwrap_and_tag(:not)

      lparen = string("(") |> unwrap_and_tag(:lparen)
      rparen = string(")") |> unwrap_and_tag(:rparen)

      # Whitespace is matched but never emitted as a token.
      space =
        choice([string(" "), string("\t"), string("\n"), string("\r"), string("\v"), string("\f")])
        |> ignore()

      # ---------------------------------------------------------------------
      # Scalar values
      # ---------------------------------------------------------------------

      int =
        optional(ascii_char([?-, ?+]))
        |> ascii_string([?0..?9], min: 1)
        |> reduce({List, :to_string, []})
        |> reduce(:to_number)
        |> unwrap_and_tag(:int)
        |> label("an integer, such as `-100' or `5'")

      number =
        optional(ascii_char([?-, ?+]))
        |> ascii_string([?0..?9], min: 1)
        |> optional(ascii_char([?.]) |> ascii_string([?0..?9], min: 1))
        |> reduce({List, :to_string, []})
        |> reduce(:to_number)
        |> unwrap_and_tag(:number)
        |> label("a real number, such as `-2.71828' or `10'")

      # The matched literal is handed to Jason.decode!/1, which turns it into
      # the corresponding Elixir boolean.
      bool =
        choice([
          string("true"),
          string("false")
        ])
        |> label("a boolean, such as `false'")
        |> reduce({Jason, :decode!, []})

      # ---------------------------------------------------------------------
      # IP addresses
      # ---------------------------------------------------------------------

      # Alternatives are ordered from the longest/most specific (250-255) down
      # to a single digit so that the choice consumes as many digits as it can.
      ipv4_octet =
        choice([
          ascii_char([?2]) |> ascii_char([?5]) |> ascii_char([?0..?5]),
          ascii_char([?2]) |> ascii_char([?0..?4]) |> ascii_char([?0..?9]),
          ascii_char([?1]) |> ascii_char([?0..?9]) |> ascii_char([?0..?9]),
          ascii_char([?1..?9]) |> ascii_char([?0..?9]),
          ascii_char([?0..?9])
        ])
        |> reduce({List, :to_string, []})

      ipv4_address =
        times(ipv4_octet |> string("."), 3)
        |> concat(ipv4_octet)

      # "/0" through "/32"; the slash is kept in the resulting string.
      ipv4_prefix =
        ascii_char([?/])
        |> choice([
          ascii_char([?3]) |> ascii_char([?0..?2]),
          ascii_char([?1..?2]) |> ascii_char([?0..?9]),
          ascii_char([?0..?9])
        ])
        |> reduce({List, :to_string, []})

      ipv6_hexadectet = ascii_string([?0..?9, ?a..?f, ?A..?F], min: 1, max: 4)

      ipv6_ls32 =
        choice([
          ipv6_hexadectet |> string(":") |> concat(ipv6_hexadectet),
          ipv4_address
        ])

      ipv6_fragment = ipv6_hexadectet |> string(":")

      # NOTE: `ipv6_address` and `ipv6_prefix` are built but not referenced by
      # `ip_address` below; IPv6 matching is currently disabled.
      ipv6_address =
        choice([
          times(ipv6_fragment, 6) |> concat(ipv6_ls32),
          string("::") |> times(ipv6_fragment, 5) |> concat(ipv6_ls32),
          ipv6_hexadectet |> string("::") |> times(ipv6_fragment, 4) |> concat(ipv6_ls32),
          string("::") |> times(ipv6_fragment, 4) |> concat(ipv6_ls32),
          times(ipv6_fragment, 1)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> times(ipv6_fragment, 3)
          |> concat(ipv6_ls32),
          ipv6_hexadectet |> string("::") |> times(ipv6_fragment, 3) |> concat(ipv6_ls32),
          string("::") |> times(ipv6_fragment, 3) |> concat(ipv6_ls32),
          times(ipv6_fragment, 2)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> times(ipv6_fragment, 2)
          |> concat(ipv6_ls32),
          times(ipv6_fragment, 1)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> times(ipv6_fragment, 2)
          |> concat(ipv6_ls32),
          ipv6_hexadectet |> string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),
          string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),
          times(ipv6_fragment, 3)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> concat(ipv6_fragment)
          |> concat(ipv6_ls32),
          times(ipv6_fragment, 2)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> concat(ipv6_fragment)
          |> concat(ipv6_ls32),
          times(ipv6_fragment, 1)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> concat(ipv6_fragment)
          |> concat(ipv6_ls32),
          ipv6_hexadectet |> string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
          string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
          times(ipv6_fragment, 4) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
          times(ipv6_fragment, 3) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
          times(ipv6_fragment, 2) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
          times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
          ipv6_hexadectet |> string("::") |> concat(ipv6_ls32),
          string("::") |> concat(ipv6_ls32),
          times(ipv6_fragment, 5)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> concat(ipv6_hexadectet),
          times(ipv6_fragment, 4)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> concat(ipv6_hexadectet),
          times(ipv6_fragment, 3)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> concat(ipv6_hexadectet),
          times(ipv6_fragment, 2)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> concat(ipv6_hexadectet),
          times(ipv6_fragment, 1)
          |> concat(ipv6_hexadectet)
          |> string("::")
          |> concat(ipv6_hexadectet),
          ipv6_hexadectet |> string("::") |> concat(ipv6_hexadectet),
          string("::") |> concat(ipv6_hexadectet),
          times(ipv6_fragment, 6) |> concat(ipv6_hexadectet) |> string("::"),
          times(ipv6_fragment, 5) |> concat(ipv6_hexadectet) |> string("::"),
          times(ipv6_fragment, 4) |> concat(ipv6_hexadectet) |> string("::"),
          times(ipv6_fragment, 3) |> concat(ipv6_hexadectet) |> string("::"),
          times(ipv6_fragment, 2) |> concat(ipv6_hexadectet) |> string("::"),
          times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::"),
          ipv6_hexadectet |> string("::"),
          string("::")
        ])

      # "/0" through "/128"; the slash is kept in the resulting string.
      ipv6_prefix =
        ascii_char([?/])
        |> choice([
          ascii_char([?1]) |> ascii_char([?2]) |> ascii_char([?0..?8]),
          ascii_char([?1]) |> ascii_char([?0..?1]) |> ascii_char([?0..?9]),
          ascii_char([?1..?9]) |> ascii_char([?0..?9]),
          ascii_char([?0..?9])
        ])
        |> reduce({List, :to_string, []})

      # Only the IPv4 form is matched. The intended full form, left disabled
      # in the original, was:
      #
      #     choice([
      #       ipv4_address |> optional(ipv4_prefix),
      #       ipv6_address |> optional(ipv6_prefix)
      #     ])
      #
      # The label text still mentions IPv6 and is kept unchanged.
      ip_address =
        ipv4_address
        |> optional(ipv4_prefix)
        |> reduce({Enum, :join, []})
        |> label("a valid IPv4 or IPv6 address and optional CIDR prefix")
        |> unwrap_and_tag(:ip)

      # ---------------------------------------------------------------------
      # Dates
      # ---------------------------------------------------------------------

      year = integer(4)
      month = integer(2)
      day = integer(2)

      hour = integer(2)
      minute = integer(2)
      second = integer(2)
      tz_hour = integer(2)
      tz_minute = integer(2)

      ymd_sep = ignore(string("-"))
      hms_sep = ignore(string(":"))
      iso8601_sep = ignore(choice([string("T"), string("t"), space]))
      iso8601_tzsep = choice([string("+"), string("-")])
      zulu = ignore(choice([string("Z"), string("z")]))

      # Every component after the year is optional, and each one is only
      # attempted when the component before it was present.
      date_part =
        year
        |> optional(
          ymd_sep
          |> concat(month)
          |> optional(
            ymd_sep
            |> concat(day)
            |> optional(
              iso8601_sep
              |> optional(
                hour
                |> optional(
                  hms_sep
                  |> concat(minute)
                  |> optional(concat(hms_sep, second))
                )
              )
            )
          )
        )
        |> tag(:date)

      timezone_part =
        choice([
          iso8601_tzsep
          |> concat(tz_hour)
          |> optional(
            hms_sep
            |> concat(tz_minute)
          )
          |> tag(:timezone),
          zulu
        ])

      absolute_date =
        date_part
        |> optional(timezone_part)
        |> reduce(:absolute_datetime)
        |> unwrap_and_tag(:date)
        |> label("an RFC3339 date and optional time, such as `2019-08-01'")

      # "<count> <unit>[s] ago"; each unit is replaced by its length in seconds.
      relative_date =
        integer(min: 1)
        |> ignore(concat(space, empty()))
        |> choice([
          string("second") |> optional(string("s")) |> replace(1),
          string("minute") |> optional(string("s")) |> replace(60),
          string("hour") |> optional(string("s")) |> replace(3600),
          string("day") |> optional(string("s")) |> replace(86400),
          string("week") |> optional(string("s")) |> replace(604_800),
          string("month") |> optional(string("s")) |> replace(2_592_000),
          string("year") |> optional(string("s")) |> replace(31_536_000)
        ])
        |> ignore(string(" ago"))
        |> reduce(:relative_datetime)
        |> unwrap_and_tag(:date)
        |> label("a relative date, such as `3 days ago'")

      # `relative_date` must be tried first. `absolute_date` accepts a bare
      # four-digit year, so with the opposite order an input such as
      # `1000 days ago' matched as the year 1000 and left ` days ago' behind.
      # `relative_date` requires whitespace and a unit right after the digits,
      # so it fails (without consuming input) on every absolute date.
      date =
        choice([
          relative_date,
          absolute_date
        ])

      # ---------------------------------------------------------------------
      # Relations and modifiers
      # ---------------------------------------------------------------------

      eq = choice([string(":"), string(".eq:")]) |> unwrap_and_tag(:eq)
      lt = string(".lt:") |> unwrap_and_tag(:lt)
      lte = string(".lte:") |> unwrap_and_tag(:lte)
      gt = string(".gt:") |> unwrap_and_tag(:gt)
      gte = string(".gte:") |> unwrap_and_tag(:gte)

      range_relation =
        choice([
          eq,
          lt,
          lte,
          gt,
          gte
        ])

      boost =
        string("^")
        |> unwrap_and_tag(:boost)
        |> concat(number)

      fuzz =
        string("~")
        |> unwrap_and_tag(:fuzz)
        |> concat(number)

      quot = string("\"")

      # ---------------------------------------------------------------------
      # Typed field terms
      # ---------------------------------------------------------------------

      bool_value =
        full_choice(unquote(for f <- bool_fields, do: [string: f]))
        |> unwrap_and_tag(:bool_field)
        |> concat(eq)
        |> concat(bool)

      date_value =
        full_choice(unquote(for f <- date_fields, do: [string: f]))
        |> unwrap_and_tag(:date_field)
        |> concat(range_relation)
        |> concat(date)

      float_value =
        full_choice(unquote(for f <- float_fields, do: [string: f]))
        |> unwrap_and_tag(:float_field)
        |> concat(range_relation)
        |> concat(number)

      int_value =
        full_choice(unquote(for f <- int_fields, do: [string: f]))
        |> unwrap_and_tag(:int_field)
        |> concat(range_relation)
        |> concat(int)

      ip_value =
        full_choice(unquote(for f <- ip_fields, do: [string: f]))
        |> unwrap_and_tag(:ip_field)
        |> ignore(eq)
        |> concat(ip_address)

      numeric =
        choice([
          bool_value,
          date_value,
          float_value,
          int_value,
          ip_value
        ])

      quoted_numeric = ignore(quot) |> concat(numeric) |> ignore(quot)

      # ---------------------------------------------------------------------
      # Free text
      # ---------------------------------------------------------------------

      # Anything that terminates an unquoted text run, optionally preceded by
      # whitespace.
      stop_words =
        repeat(space)
        |> choice([
          string("\\") |> eos(),
          string(","),
          l_and,
          l_or,
          rparen,
          fuzz,
          boost
        ])

      # Unquoted text: one or more characters, none of which begins a stop
      # word. Backslash escapes the next character, and a parenthesised group
      # recurses into this same combinator.
      defcombinatorp(
        unquote(:"#{name}_text"),
        lookahead_not(stop_words)
        |> choice([
          string("\\") |> utf8_char([]),
          string("(") |> parsec(unquote(:"#{name}_text")) |> string(")"),
          utf8_char([])
        ])
        |> times(min: 1)
      )

      text =
        parsec(unquote(:"#{name}_text"))
        |> reduce({List, :to_string, []})
        |> unwrap_and_tag(:text)

      # Quoted text: `\"` and `\\` drop the backslash, any other escape keeps
      # it, and an unescaped double quote ends the run.
      quoted_text =
        choice([
          ignore(string("\\")) |> string("\""),
          ignore(string("\\")) |> string("\\"),
          string("\\") |> utf8_char([]),
          utf8_char(not: ?")
        ])
        |> times(min: 1)
        |> reduce({List, :to_string, []})
        |> unwrap_and_tag(:text)

      # ---------------------------------------------------------------------
      # Text field terms
      # ---------------------------------------------------------------------

      literal =
        full_choice(unquote(for f <- literal_fields, do: [string: f]))
        |> unwrap_and_tag(:literal_field)
        |> concat(eq)
        |> concat(text)

      ngram =
        full_choice(unquote(for f <- ngram_fields, do: [string: f]))
        |> unwrap_and_tag(:ngram_field)
        |> concat(eq)
        |> concat(text)

      custom =
        full_choice(unquote(for f <- custom_fields, do: [string: f]))
        |> unwrap_and_tag(:custom_field)
        |> ignore(string(":"))
        |> concat(text)

      quoted_literal =
        ignore(quot)
        |> full_choice(unquote(for f <- literal_fields, do: [string: f]))
        |> unwrap_and_tag(:literal_field)
        |> concat(eq)
        |> concat(quoted_text)
        |> ignore(quot)

      quoted_ngram =
        ignore(quot)
        |> full_choice(unquote(for f <- ngram_fields, do: [string: f]))
        |> unwrap_and_tag(:ngram_field)
        |> concat(eq)
        |> concat(quoted_text)
        |> ignore(quot)

      quoted_custom =
        ignore(quot)
        |> full_choice(unquote(for f <- custom_fields, do: [string: f]))
        |> unwrap_and_tag(:custom_field)
        |> ignore(string(":"))
        |> concat(quoted_text)
        |> ignore(quot)

      default =
        text
        |> tag(:default)

      quoted_default =
        ignore(quot)
        |> concat(quoted_text)
        |> ignore(quot)
        |> tag(:default)

      # ---------------------------------------------------------------------
      # Top level
      # ---------------------------------------------------------------------

      # Quoted forms are tried before unquoted ones, and typed fields before
      # the catch-all `default`.
      term =
        choice([
          quoted_numeric,
          quoted_literal,
          quoted_ngram,
          quoted_custom,
          quoted_default,
          numeric,
          literal,
          ngram,
          custom,
          default
        ])

      outer =
        choice([
          l_and,
          l_or,
          l_not,
          lparen,
          rparen,
          boost,
          fuzz,
          space,
          term
        ])

      # At least one token is required, and the whole input must be consumed.
      search =
        times(outer, min: 1)
        |> eos()

      defparsec(unquote(:"#{name}_lexer"), search)
    end
  end

  # Fetches the option `key` from `opts` (defaulting to `[]`) and expands it in
  # the caller's environment `env`.
  defp expand_fields(opts, key, env) do
    opts
    |> Keyword.get(key, [])
    |> Macro.expand(env)
  end
end
|