philomena/lib/philomena/search/lexer.ex
2019-08-27 19:37:26 -04:00

468 lines
14 KiB
Elixir

# Compile-time generator for search-query lexers built on NimbleParsec.
#
# The module exposes a single macro, `deflexer/2`. Invoking it injects a set of
# NimbleParsec combinators into the calling module and defines:
#
#   * a private combinator named `<name>_text` (via `defcombinatorp`), and
#   * a parser named `<name>_lexer` (via `defparsec`).
#
# The generated code refers to several names that are NOT defined in this file:
# `full_choice` and the reducers `:to_number`, `:absolute_datetime` and
# `:relative_datetime`. Presumably they come from `Philomena.Search.Helpers`
# (imported below) or from the calling module -- TODO confirm.
defmodule Philomena.Search.Lexer do
# Defines a lexer in the calling module.
#
# Parameters:
#   * `name` - prefix used to build the generated `<name>_text` combinator and
#     `<name>_lexer` parser names.
#   * `opts` - keyword list of field-name lists, one per field kind. Recognised
#     keys are `:literal`, `:ngram`, `:bool`, `:date`, `:float`, `:int`, `:ip`
#     and `:custom`; each defaults to `[]` when absent.
defmacro deflexer(name, opts) do
# Each option is read with a default of [] and passed through Macro.expand/2
# using the caller's environment before being used in the comprehensions below.
literal_fields = Keyword.get(opts, :literal, []) |> Macro.expand(__CALLER__)
ngram_fields = Keyword.get(opts, :ngram, []) |> Macro.expand(__CALLER__)
bool_fields = Keyword.get(opts, :bool, []) |> Macro.expand(__CALLER__)
date_fields = Keyword.get(opts, :date, []) |> Macro.expand(__CALLER__)
float_fields = Keyword.get(opts, :float, []) |> Macro.expand(__CALLER__)
int_fields = Keyword.get(opts, :int, []) |> Macro.expand(__CALLER__)
ip_fields = Keyword.get(opts, :ip, []) |> Macro.expand(__CALLER__)
custom_fields = Keyword.get(opts, :custom, []) |> Macro.expand(__CALLER__)
# `location: :keep` keeps file/line information of this quoted block in the
# generated code instead of the macro call site.
quote location: :keep do
import NimbleParsec
import Philomena.Search.Helpers
# --- Operators and punctuation -------------------------------------------
# Boolean operators; each accepts several spellings and is tagged with the
# operator name (:and, :or, :not).
l_and =
choice([string("AND"), string("&&"), string(",")])
|> unwrap_and_tag(:and)
l_or =
choice([string("OR"), string("||")])
|> unwrap_and_tag(:or)
l_not =
choice([string("NOT"), string("!"), string("-")])
|> unwrap_and_tag(:not)
lparen = string("(") |> unwrap_and_tag(:lparen)
rparen = string(")") |> unwrap_and_tag(:rparen)
# A single whitespace character (space, tab, LF, CR, VT, FF); matched but
# dropped from the output.
space =
choice([string(" "), string("\t"), string("\n"), string("\r"), string("\v"), string("\f")])
|> ignore()
# --- Numeric literals ----------------------------------------------------
# Optional sign followed by one or more digits. The pieces are joined into a
# single string and then handed to the `:to_number` reducer (defined outside
# this file).
int =
optional(ascii_char('-+'))
|> ascii_string([?0..?9], min: 1)
|> reduce({List, :to_string, []})
|> reduce(:to_number)
|> unwrap_and_tag(:int)
|> label("an integer, such as `-100' or `5'")
# Same as `int` but additionally allows a fractional part (`.` + digits).
number =
optional(ascii_char('-+'))
|> ascii_string([?0..?9], min: 1)
|> optional(ascii_char('.') |> ascii_string([?0..?9], min: 1))
|> reduce({List, :to_string, []})
|> reduce(:to_number)
|> unwrap_and_tag(:number)
|> label("a real number, such as `-2.71828' or `10'")
# The literal words `true` / `false`, passed to Jason.decode!.
bool =
choice([
string("true"),
string("false")
])
|> label("a boolean, such as `false'")
|> reduce({Jason, :decode!, []})
# --- IP addresses --------------------------------------------------------
# One IPv4 octet. Alternatives go from the most specific pattern (250-255)
# down to a single digit; order matters because `choice` takes the first
# alternative that matches.
ipv4_octet =
choice([
ascii_char('2') |> ascii_char('5') |> ascii_char([?0..?5]),
ascii_char('2') |> ascii_char([?0..?4]) |> ascii_char([?0..?9]),
ascii_char('1') |> ascii_char([?0..?9]) |> ascii_char([?0..?9]),
ascii_char([?1..?9]) |> ascii_char([?0..?9]),
ascii_char([?0..?9])
])
|> reduce({List, :to_string, []})
# Three "octet." groups followed by a final octet.
ipv4_address =
times(ipv4_octet |> string("."), 3)
|> concat(ipv4_octet)
# "/" followed by a prefix length matching 30-32, 10-29, or a single digit.
ipv4_prefix =
ascii_char('/')
|> choice([
ascii_char('3') |> ascii_char([?0..?2]),
ascii_char([?1..?2]) |> ascii_char([?0..?9]),
ascii_char([?0..?9])
])
|> reduce({List, :to_string, []})
# One to four hexadecimal digits.
ipv6_hexadectet = ascii_string('0123456789abcdefABCDEF', min: 1, max: 4)
# Trailing part of an IPv6 address: either two hex groups separated by ":"
# or an embedded IPv4 address.
ipv6_ls32 =
choice([
ipv6_hexadectet |> string(":") |> concat(ipv6_hexadectet),
ipv4_address
])
ipv6_fragment = ipv6_hexadectet |> string(":")
# Alternatives for an IPv6 address: the first is the full form (six
# "group:" fragments + ls32); the remaining ones contain a "::" with varying
# numbers of groups before and after it.
# NOTE(review): `ipv6_address` and `ipv6_prefix` are only referenced from the
# commented-out code near `ip_address` below, so they are currently unused.
ipv6_address =
choice([
times(ipv6_fragment, 6) |> concat(ipv6_ls32),
string("::") |> times(ipv6_fragment, 5) |> concat(ipv6_ls32),
ipv6_hexadectet |> string("::") |> times(ipv6_fragment, 4) |> concat(ipv6_ls32),
string("::") |> times(ipv6_fragment, 4) |> concat(ipv6_ls32),
times(ipv6_fragment, 1)
|> concat(ipv6_hexadectet)
|> string("::")
|> times(ipv6_fragment, 3)
|> concat(ipv6_ls32),
ipv6_hexadectet |> string("::") |> times(ipv6_fragment, 3) |> concat(ipv6_ls32),
string("::") |> times(ipv6_fragment, 3) |> concat(ipv6_ls32),
times(ipv6_fragment, 2)
|> concat(ipv6_hexadectet)
|> string("::")
|> times(ipv6_fragment, 2)
|> concat(ipv6_ls32),
times(ipv6_fragment, 1)
|> concat(ipv6_hexadectet)
|> string("::")
|> times(ipv6_fragment, 2)
|> concat(ipv6_ls32),
ipv6_hexadectet |> string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),
string("::") |> times(ipv6_fragment, 2) |> concat(ipv6_ls32),
times(ipv6_fragment, 3)
|> concat(ipv6_hexadectet)
|> string("::")
|> concat(ipv6_fragment)
|> concat(ipv6_ls32),
times(ipv6_fragment, 2)
|> concat(ipv6_hexadectet)
|> string("::")
|> concat(ipv6_fragment)
|> concat(ipv6_ls32),
times(ipv6_fragment, 1)
|> concat(ipv6_hexadectet)
|> string("::")
|> concat(ipv6_fragment)
|> concat(ipv6_ls32),
ipv6_hexadectet |> string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
string("::") |> concat(ipv6_fragment) |> concat(ipv6_ls32),
times(ipv6_fragment, 4) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
times(ipv6_fragment, 3) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
times(ipv6_fragment, 2) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::") |> concat(ipv6_ls32),
ipv6_hexadectet |> string("::") |> concat(ipv6_ls32),
string("::") |> concat(ipv6_ls32),
times(ipv6_fragment, 5)
|> concat(ipv6_hexadectet)
|> string("::")
|> concat(ipv6_hexadectet),
times(ipv6_fragment, 4)
|> concat(ipv6_hexadectet)
|> string("::")
|> concat(ipv6_hexadectet),
times(ipv6_fragment, 3)
|> concat(ipv6_hexadectet)
|> string("::")
|> concat(ipv6_hexadectet),
times(ipv6_fragment, 2)
|> concat(ipv6_hexadectet)
|> string("::")
|> concat(ipv6_hexadectet),
times(ipv6_fragment, 1)
|> concat(ipv6_hexadectet)
|> string("::")
|> concat(ipv6_hexadectet),
ipv6_hexadectet |> string("::") |> concat(ipv6_hexadectet),
string("::") |> concat(ipv6_hexadectet),
times(ipv6_fragment, 6) |> concat(ipv6_hexadectet) |> string("::"),
times(ipv6_fragment, 5) |> concat(ipv6_hexadectet) |> string("::"),
times(ipv6_fragment, 4) |> concat(ipv6_hexadectet) |> string("::"),
times(ipv6_fragment, 3) |> concat(ipv6_hexadectet) |> string("::"),
times(ipv6_fragment, 2) |> concat(ipv6_hexadectet) |> string("::"),
times(ipv6_fragment, 1) |> concat(ipv6_hexadectet) |> string("::"),
ipv6_hexadectet |> string("::"),
string("::")
])
# "/" followed by a prefix length matching 120-128, 100-119, 10-99, or a
# single digit.
ipv6_prefix =
ascii_char('/')
|> choice([
ascii_char('1') |> ascii_char('2') |> ascii_char([?0..?8]),
ascii_char('1') |> ascii_char([?0..?1]) |> ascii_char([?0..?9]),
ascii_char([?1..?9]) |> ascii_char([?0..?9]),
ascii_char([?0..?9])
])
|> reduce({List, :to_string, []})
# An IP address with optional CIDR prefix, joined into one string and tagged
# :ip. Only the IPv4 branch is active: the commented-out lines interleaved
# below are the remains of a `choice([...])` that also offered
# `ipv6_address |> optional(ipv6_prefix)`.
# NOTE(review): the label text still mentions IPv6 although IPv6 input is not
# accepted by the active code -- confirm whether this is intentional.
# choice([
# ,
ip_address =
ipv4_address
|> optional(ipv4_prefix)
# ipv6_address |> optional(ipv6_prefix)
# ])
|> reduce({Enum, :join, []})
|> label("a valid IPv4 or IPv6 address and optional CIDR prefix")
|> unwrap_and_tag(:ip)
# --- Dates ---------------------------------------------------------------
# Fixed-width integer components of a timestamp.
year = integer(4)
month = integer(2)
day = integer(2)
hour = integer(2)
minute = integer(2)
second = integer(2)
tz_hour = integer(2)
tz_minute = integer(2)
# Separators. All are dropped from the output except `iso8601_tzsep`, whose
# "+" / "-" sign is kept.
ymd_sep = ignore(string("-"))
hms_sep = ignore(string(":"))
iso8601_sep = ignore(choice([string("T"), string("t"), space]))
iso8601_tzsep = choice([string("+"), string("-")])
zulu = ignore(choice([string("Z"), string("z")]))
# A year with progressively optional finer components: -month, -day, a
# date/time separator, hour, :minute, :second. Each component can only appear
# if the coarser one before it is present. The collected integers are tagged
# :date.
date_part =
year
|> optional(
ymd_sep
|> concat(month)
|> optional(
ymd_sep
|> concat(day)
|> optional(
iso8601_sep
|> optional(
hour
|> optional(
hms_sep
|> concat(minute)
|> optional(concat(hms_sep, second))
)
)
)
)
)
|> tag(:date)
# Either a signed offset (sign, hours, optional ":minutes") tagged :timezone,
# or a "Z"/"z" marker which produces no output.
timezone_part =
choice([
iso8601_tzsep
|> concat(tz_hour)
|> optional(
hms_sep
|> concat(tz_minute)
)
|> tag(:timezone),
zulu
])
# Date part plus optional timezone, handed to the `:absolute_datetime`
# reducer (defined outside this file); the result is tagged :date.
absolute_date =
date_part
|> optional(timezone_part)
|> reduce(:absolute_datetime)
|> unwrap_and_tag(:date)
|> label("an RFC3339 date and optional time, such as `2019-08-01'")
# "<integer> <unit>[s] ago". Exactly one whitespace character is consumed
# between the number and the unit. The unit word is replaced by its length in
# seconds (a month counts as 30 days, a year as 365 days), so the
# `:relative_datetime` reducer (defined outside this file) receives the
# count followed by the seconds-per-unit value.
relative_date =
integer(min: 1)
|> ignore(concat(space, empty()))
|> choice([
string("second") |> optional(string("s")) |> replace(1),
string("minute") |> optional(string("s")) |> replace(60),
string("hour") |> optional(string("s")) |> replace(3600),
string("day") |> optional(string("s")) |> replace(86400),
string("week") |> optional(string("s")) |> replace(604_800),
string("month") |> optional(string("s")) |> replace(2_592_000),
string("year") |> optional(string("s")) |> replace(31_536_000)
])
|> ignore(string(" ago"))
|> reduce(:relative_datetime)
|> unwrap_and_tag(:date)
|> label("a relative date, such as `3 days ago'")
# Absolute form is tried first, then the relative form.
date =
choice([
absolute_date,
relative_date
])
# --- Field/value relations -----------------------------------------------
# Separator between a field name and its value. A bare ":" and ".eq:" both
# mean equality; the others select a range comparison.
eq = choice([string(":"), string(".eq:")]) |> unwrap_and_tag(:eq)
lt = string(".lt:") |> unwrap_and_tag(:lt)
lte = string(".lte:") |> unwrap_and_tag(:lte)
gt = string(".gt:") |> unwrap_and_tag(:gt)
gte = string(".gte:") |> unwrap_and_tag(:gte)
range_relation =
choice([
eq,
lt,
lte,
gt,
gte
])
# "^<number>" and "~<number>" suffixes; the marker is tagged :boost / :fuzz
# and is followed by a :number token.
boost =
string("^")
|> unwrap_and_tag(:boost)
|> concat(number)
fuzz =
string("~")
|> unwrap_and_tag(:fuzz)
|> concat(number)
quot = string("\"")
# --- Typed field terms ---------------------------------------------------
# Each typed term is "<field><relation><value>". The field alternatives are
# built at macro-expansion time from the corresponding option list and passed
# to `full_choice` (not defined in this file).
bool_value =
full_choice(unquote(for f <- bool_fields, do: [string: f]))
|> unwrap_and_tag(:bool_field)
|> concat(eq)
|> concat(bool)
date_value =
full_choice(unquote(for f <- date_fields, do: [string: f]))
|> unwrap_and_tag(:date_field)
|> concat(range_relation)
|> concat(date)
float_value =
full_choice(unquote(for f <- float_fields, do: [string: f]))
|> unwrap_and_tag(:float_field)
|> concat(range_relation)
|> concat(number)
int_value =
full_choice(unquote(for f <- int_fields, do: [string: f]))
|> unwrap_and_tag(:int_field)
|> concat(range_relation)
|> concat(int)
# Unlike the other typed terms, the relation is matched but dropped here, so
# no :eq token is emitted for IP fields.
ip_value =
full_choice(unquote(for f <- ip_fields, do: [string: f]))
|> unwrap_and_tag(:ip_field)
|> ignore(eq)
|> concat(ip_address)
numeric =
choice([
bool_value,
date_value,
float_value,
int_value,
ip_value
])
# A typed term wrapped in double quotes; the quotes are dropped.
quoted_numeric = ignore(quot) |> concat(numeric) |> ignore(quot)
# --- Free text -----------------------------------------------------------
# Sequences that terminate an unquoted text run: optional whitespace followed
# by a trailing backslash at end of input, a comma, AND/OR, ")", or a
# fuzz/boost suffix. Note that "," is also one of the `l_and` alternatives.
stop_words =
repeat(space)
|> choice([
string("\\") |> eos(),
string(","),
l_and,
l_or,
rparen,
fuzz,
boost
])
# Recursive private combinator for unquoted text. Repeated one or more times:
# as long as a stop sequence does not follow, consume either
#   * a backslash plus the next character (both are kept),
#   * a parenthesised group containing nested text, or
#   * any single character.
defcombinatorp(
unquote(:"#{name}_text"),
lookahead_not(stop_words)
|> choice([
string("\\") |> utf8_char([]),
string("(") |> parsec(unquote(:"#{name}_text")) |> string(")"),
utf8_char([])
])
|> times(min: 1)
)
# Unquoted text collapsed into a single string and tagged :text.
text =
parsec(unquote(:"#{name}_text"))
|> reduce({List, :to_string, []})
|> unwrap_and_tag(:text)
# Text inside double quotes. `\"` and `\\` are unescaped (the leading
# backslash is dropped); any other backslash sequence keeps the backslash;
# every other character except `"` is taken as-is.
quoted_text =
choice([
ignore(string("\\")) |> string("\""),
ignore(string("\\")) |> string("\\"),
string("\\") |> utf8_char([]),
utf8_char(not: ?")
])
|> times(min: 1)
|> reduce({List, :to_string, []})
|> unwrap_and_tag(:text)
# --- Text field terms ----------------------------------------------------
# "<field><eq><text>" for literal and ngram fields.
literal =
full_choice(unquote(for f <- literal_fields, do: [string: f]))
|> unwrap_and_tag(:literal_field)
|> concat(eq)
|> concat(text)
ngram =
full_choice(unquote(for f <- ngram_fields, do: [string: f]))
|> unwrap_and_tag(:ngram_field)
|> concat(eq)
|> concat(text)
# Custom fields accept only a bare ":" separator, which is dropped (no :eq
# token is emitted).
custom =
full_choice(unquote(for f <- custom_fields, do: [string: f]))
|> unwrap_and_tag(:custom_field)
|> ignore(string(":"))
|> concat(text)
# Quoted variants: the whole "<field>:<value>" term is wrapped in double
# quotes (which are dropped) and the value is parsed with `quoted_text`.
quoted_literal =
ignore(quot)
|> full_choice(unquote(for f <- literal_fields, do: [string: f]))
|> unwrap_and_tag(:literal_field)
|> concat(eq)
|> concat(quoted_text)
|> ignore(quot)
quoted_ngram =
ignore(quot)
|> full_choice(unquote(for f <- ngram_fields, do: [string: f]))
|> unwrap_and_tag(:ngram_field)
|> concat(eq)
|> concat(quoted_text)
|> ignore(quot)
quoted_custom =
ignore(quot)
|> full_choice(unquote(for f <- custom_fields, do: [string: f]))
|> unwrap_and_tag(:custom_field)
|> ignore(string(":"))
|> concat(quoted_text)
|> ignore(quot)
# Text with no recognised field prefix, tagged :default.
default =
text
|> tag(:default)
quoted_default =
ignore(quot)
|> concat(quoted_text)
|> ignore(quot)
|> tag(:default)
# --- Top level -----------------------------------------------------------
# A single search term. `choice` takes the first alternative that matches, so
# quoted forms are tried before unquoted ones, and field-specific forms
# before the catch-all `default`.
term =
choice([
quoted_numeric,
quoted_literal,
quoted_ngram,
quoted_custom,
quoted_default,
numeric,
literal,
ngram,
custom,
default
])
# Any top-level token. Operators, parentheses, boost/fuzz suffixes and
# whitespace are tried before falling back to a term.
outer =
choice([
l_and,
l_or,
l_not,
lparen,
rparen,
boost,
fuzz,
space,
term
])
# The whole input must consist of one or more tokens followed by end of
# input.
search =
times(outer, min: 1)
|> eos()
defparsec(unquote(:"#{name}_lexer"), search)
end
end
end