Improve performance of fuzzy queries using regex

This commit is contained in:
Joakim Soderlund 2020-03-07 21:04:12 +01:00
parent 94d8bf9fe2
commit 96b7773e8f
3 changed files with 55 additions and 6 deletions

37
Cargo.lock generated
View file

@ -5,6 +5,14 @@ name = "adler32"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "aho-corasick"
version = "0.7.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "arrayvec"
version = "0.4.12"
@ -119,6 +127,7 @@ dependencies = [
"hex 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
"nom 5.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.104 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.48 (registry+https://github.com/rust-lang/crates.io-index)",
"zip 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
@ -319,6 +328,22 @@ name = "redox_syscall"
version = "0.1.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "regex"
version = "1.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"aho-corasick 0.7.9 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.6.16 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-syntax"
version = "0.6.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rustc_version"
version = "0.2.3"
@ -414,6 +439,14 @@ dependencies = [
"unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "thread_local"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "time"
version = "0.1.42"
@ -485,6 +518,7 @@ dependencies = [
[metadata]
"checksum adler32 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "5d2e7343e7fc9de883d1b0341e0b13970f764c14101234857d2ddafa1cb1cac2"
"checksum aho-corasick 0.7.9 (registry+https://github.com/rust-lang/crates.io-index)" = "d5e63fd144e18ba274ae7095c0197a870a7b9468abc801dd62f190d80817d2ec"
"checksum arrayvec 0.4.12 (registry+https://github.com/rust-lang/crates.io-index)" = "cd9fd44efafa8690358b7408d253adf110036b88f55672a933f01d616ad9b1b9"
"checksum autocfg 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d"
"checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
@ -522,6 +556,8 @@ dependencies = [
"checksum rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098"
"checksum rayon-core 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9"
"checksum redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)" = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
"checksum regex 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "322cf97724bea3ee221b78fe25ac9c46114ebb51747ad5babd51a2fc6a8235a8"
"checksum regex-syntax 0.6.16 (registry+https://github.com/rust-lang/crates.io-index)" = "1132f845907680735a84409c3bebc64d1364a5683ffbce899550cd09d5eaefc1"
"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
"checksum rustyline 6.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "de64be8eecbe428b6924f1d8430369a01719fbb182c26fa431ddbb0a95f5315d"
"checksum ryu 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "bfa8506c1de11c9c4e4c38863ccbe02a305c8188e85a05a784c9e11e1c3910c8"
@ -534,6 +570,7 @@ dependencies = [
"checksum serde_json 1.0.48 (registry+https://github.com/rust-lang/crates.io-index)" = "9371ade75d4c2d6cb154141b9752cf3781ec9c05e0e5cf35060e1e70ee7b9c25"
"checksum static_assertions 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7f3eb36b47e512f8f1c9e3d10c2c1965bc992bd9cdb024fa581e2194501c83d3"
"checksum syn 1.0.16 (registry+https://github.com/rust-lang/crates.io-index)" = "123bd9499cfb380418d509322d7a6d52e5315f064fe4b3ad18a53d6b92c07859"
"checksum thread_local 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
"checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f"
"checksum unicode-segmentation 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
"checksum unicode-width 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479"

View file

@ -32,6 +32,9 @@ version = "5"
[dependencies.rayon]
version = "*"
[dependencies.regex]
version = "*"
[dependencies.serde]
version = "*"
features = ["derive"]

View file

@ -4,6 +4,9 @@ use chrono::prelude::*;
use chrono_english::{parse_date_string, Dialect};
use regex::RegexBuilder;
use regex::escape;
use super::parser::{Operator, Source};
use crate::archive::Story;
use crate::error::{Error, Result};
@ -32,14 +35,20 @@ pub fn optimize(src: Source, op: Operator, value: &str) -> Result<Filter> {
}
fn strfn(f: StrFn, op: Operator, value: &str) -> Result<Filter> {
let value: String = match op {
Fuzzy => value.to_lowercase(),
_ => value.to_owned(),
};
let exact: String = value.into();
let result = RegexBuilder::new(&escape(value))
.case_insensitive(true)
.size_limit(1_048_576)
.build();
let regex = result.map_err(|e| match e {
_ => Error::query("Invalid value for fuzzy match"),
})?;
match op {
Exact => ok!(move |s| f(s) == value),
Fuzzy => ok!(move |s| f(s).to_lowercase().contains(&value)),
Exact => ok!(move |s| f(s) == exact),
Fuzzy => ok!(move |s| regex.is_match(f(s))),
_ => Err(Error::query("Invalid operation for text type")),
}
}