From 9d6103d77c5629c17feefcea01054b520dc3a1bd Mon Sep 17 00:00:00 2001
From: "byte[]"
Date: Sun, 1 Dec 2019 11:42:38 -0500
Subject: [PATCH] add search evaluator

---
 lib/search/evaluator.ex | 149 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 lib/search/evaluator.ex

diff --git a/lib/search/evaluator.ex b/lib/search/evaluator.ex
new file mode 100644
index 00000000..15fccea6
--- /dev/null
+++ b/lib/search/evaluator.ex
@@ -0,0 +1,149 @@
+defmodule Search.Evaluator do
+  # TODO: rethink the necessity of this module.
+  # Can we do this in elasticsearch instead?
+
+  def hits?(doc, %{bool: bool_query}) do
+    # An absent :should list must not veto the match; the nil clause of
+    # should/2 is reserved for :must_not, where nil means "excludes nothing".
+    must(doc, bool_query[:must]) and
+      must(doc, bool_query[:filter]) and
+      (is_nil(bool_query[:should]) or should(doc, bool_query[:should])) and
+      not should(doc, bool_query[:must_not])
+  end
+
+  def hits?(doc, %{range: range_query}) do
+    [term] = Map.keys(range_query)
+    doc_values = wrap(doc[atomify(term)])
+
+    range_query[term]
+    |> Enum.all?(fn
+      {:gt, query_val} ->
+        Enum.any?(doc_values, &(&1 > query_val))
+      {:gte, query_val} ->
+        Enum.any?(doc_values, &(&1 >= query_val))
+      {:lt, query_val} ->
+        Enum.any?(doc_values, &(&1 < query_val))
+      {:lte, query_val} ->
+        Enum.any?(doc_values, &(&1 <= query_val))
+    end)
+  end
+
+  def hits?(doc, %{fuzzy: fuzzy_query}) do
+    [{term, %{value: query_val, fuzziness: fuzziness}}] = Enum.to_list(fuzzy_query)
+
+    wrap(doc[atomify(term)])
+    |> Enum.any?(fn doc_val ->
+      cond do
+        # fuzziness >= 1 is an absolute edit distance
+        fuzziness >= 1 ->
+          levenshtein(query_val, doc_val) <= fuzziness
+
+        # 0 <= fuzziness < 1 allows (1 - fuzziness) of the query's length in edits
+        fuzziness >= 0 ->
+          levenshtein(query_val, doc_val) <= trunc((1 - fuzziness) * byte_size(query_val))
+
+        true ->
+          false
+      end
+    end)
+  end
+
+  def hits?(doc, %{wildcard: wildcard_query}) do
+    [{term, query_val}] = Enum.to_list(wildcard_query)
+    query_re = wildcard_to_regex(query_val)
+
+    wrap(doc[atomify(term)])
+    |> Enum.any?(&Regex.match?(query_re, &1))
+  end
+
+  def hits?(doc, %{match_phrase: phrase_query}) do
+    # This is wildly inaccurate but practically unavoidable, as
+    # there is no good reason to import a term stemmer
+    [{term, query_val}] = Enum.to_list(phrase_query)
+
+    wrap(doc[atomify(term)])
+    |> Enum.any?(&String.contains?(&1, query_val))
+  end
+
+  def hits?(doc, %{term: term_query}) do
+    [{term, query_val}] = Enum.to_list(term_query)
+
+    wrap(doc[atomify(term)])
+    |> Enum.member?(query_val)
+  end
+
+  def hits?(_doc, %{match_all: %{}}), do: true
+  def hits?(_doc, %{match_none: %{}}), do: false
+  def hits?(doc, %{function_score: %{query: query}}), do: hits?(doc, query)
+
+  defp must(_doc, nil), do: true
+  defp must(doc, queries) when is_list(queries), do: Enum.all?(queries, &hits?(doc, &1))
+  defp must(doc, query), do: hits?(doc, query)
+
+  defp should(_doc, nil), do: false
+  defp should(doc, queries) when is_list(queries), do: Enum.any?(queries, &hits?(doc, &1))
+  defp should(doc, query), do: hits?(doc, query)
+
+  # Normalize scalar-or-list field values to a list.
+  defp wrap(list) when is_list(list), do: list
+  defp wrap(object), do: [object]
+
+  # String.to_existing_atom/1 keeps user input from growing the atom table.
+  defp atomify(atom) when is_atom(atom), do: atom
+  defp atomify(string) when is_binary(string), do: String.to_existing_atom(string)
+
+  def levenshtein(s1, s2) do
+    {dist, _lookup} = levenshtein_lookup(s1, s2, %{}, 0)
+
+    dist
+  end
+
+  defp levenshtein_lookup(s1, s2, lookup, times) do
+    case lookup[{s1, s2}] do
+      nil ->
+        levenshtein_execute(s1, s2, lookup, times)
+
+      val ->
+        {val, lookup}
+    end
+  end
+
+  # Avoid pursuing excessively time-consuming substrings; past this depth,
+  # settle for the upper bound max(byte_size(s1), byte_size(s2)).
+  defp levenshtein_execute(s1, s2, lookup, times) when times > 2, do: {max(byte_size(s1), byte_size(s2)), lookup}
+  defp levenshtein_execute("", s2, lookup, _times), do: {byte_size(s2), lookup}
+  defp levenshtein_execute(s1, "", lookup, _times), do: {byte_size(s1), lookup}
+  defp levenshtein_execute(s1, s1, lookup, _times), do: {0, lookup}
+  defp levenshtein_execute(s1, s2, lookup, times) do
+    {deletion, lookup} = levenshtein_lookup(chop(s1), s2, lookup, times + 1)
+    {insertion, lookup} = levenshtein_lookup(s1, chop(s2), lookup, times + 1)
+    {substitution, lookup} = levenshtein_lookup(chop(s1), chop(s2), lookup, times + 1)
+
+    min =
+      Enum.min([
+        deletion + 1,
+        insertion + 1,
+        substitution + last_bytes_different?(s1, s2)
+      ])
+
+    lookup = Map.put(lookup, {s1, s2}, min)
+
+    {min, lookup}
+  end
+
+  defp chop(str) when is_binary(str), do: binary_part(str, 0, byte_size(str) - 1)
+  defp last_bytes_different?(s1, s2) when binary_part(s1, byte_size(s1) - 1, 1) == binary_part(s2, byte_size(s2) - 1, 1), do: 0
+  defp last_bytes_different?(_s1, _s2), do: 1
+
+  defp wildcard_to_regex(input) do
+    re =
+      input
+      |> String.replace(~r/([.+^$\[\]\\\(\){}|-])/, "\\\\\\1") # escape regex metacharacters
+      |> String.replace(~r/([^\\]|[^\\](?:\\\\)+)\*/, "\\1.*") # * -> .* (Kleene star)
+      |> String.replace(~r/\A(?:\\\\)*\*/, ".*") # * -> .* (Kleene star at start of string)
+      |> String.replace(~r/([^\\]|[^\\](?:\\\\)+)\?/, "\\1.?") # ? -> .? (zero or one character)
+      |> String.replace(~r/\A(?:\\\\)*\?/, ".?") # ? -> .? (zero or one character at start)
+
+    Regex.compile!("\\A#{re}\\z", "im")
+  end
+end
\ No newline at end of file
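
Reviewer note: a quick sketch of how hits?/2 behaves against an in-memory
document. The document map and its fields (:title, :tags, :width) are
invented for illustration and are not part of this patch; the query shapes
are the ones handled above. levenshtein/2 works on bytes, so the fuzzy
example sticks to ASCII.

    doc = %{title: "hello world", tags: ["safe", "cute"], width: 1280}

    # term: exact membership against the (wrapped) field value
    true = Search.Evaluator.hits?(doc, %{term: %{tags: "cute"}})

    # range: every bound must be satisfied by some value of the field
    true = Search.Evaluator.hits?(doc, %{range: %{width: %{gte: 1024, lt: 1920}}})

    # fuzzy: fuzziness >= 1 is an absolute Levenshtein distance
    true = Search.Evaluator.hits?(doc, %{fuzzy: %{title: %{value: "hello worlds", fuzziness: 1}}})

    # wildcard: * and ? compile to an anchored, case-insensitive regex
    true = Search.Evaluator.hits?(doc, %{wildcard: %{title: "hell* w?rld"}})

    # bool: a :must-only query matches (via the nil-:should handling),
    # and :must_not can then veto it
    true = Search.Evaluator.hits?(doc, %{bool: %{must: [%{term: %{tags: "safe"}}]}})

    false =
      Search.Evaluator.hits?(doc, %{
        bool: %{
          must: [%{term: %{tags: "safe"}}],
          must_not: %{term: %{tags: "cute"}}
        }
      })

One behavioral caveat worth knowing while reviewing: the times > 2 cut-off in
levenshtein_execute/4 only explores a few trailing edits, so distances are
exact roughly when the strings differ only near their ends (as in the fuzzy
example above); otherwise the upper bound max(byte_size(s1), byte_size(s2))
takes over and fuzzy matching errs toward fewer hits.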