From 2832d50b2fad853691d287ef04da2184b4dc9609 Mon Sep 17 00:00:00 2001
From: Joakim Soderlund
Date: Sat, 6 Apr 2024 22:58:54 +0200
Subject: [PATCH] Simplify search while improving performance

---
 Cargo.lock             |   7 +--
 cli/Cargo.toml         |   5 --
 cli/src/main.rs        |  12 +---
 search/Cargo.toml      |   6 --
 search/src/lib.rs      | 122 +++++++++++++++++++----------------------
 src/archive/fetcher.rs |  12 ++--
 6 files changed, 65 insertions(+), 99 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 22aafaf..f0f9b36 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -301,7 +301,6 @@ dependencies = [
  "fimfareader",
  "fimfareader-search",
  "rustyline",
- "zip",
 ]
 
 [[package]]
@@ -321,9 +320,7 @@ name = "fimfareader-search"
 version = "0.1.0"
 dependencies = [
  "fimfareader",
- "rayon",
  "tantivy",
- "thread_local",
  "zip",
 ]
 
 [[package]]
@@ -863,9 +860,9 @@ dependencies = [
 
 [[package]]
 name = "rustversion"
-version = "1.0.14"
+version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4"
+checksum = "80af6f9131f277a45a3fba6ce8e2258037bb0477a67e610d3c1fe046ab31de47"
 
 [[package]]
 name = "rustyline"
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index beebfcf..1ea0c92 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -13,8 +13,3 @@ path = "../search"
 [dependencies.rustyline]
 default-features = false
 version = "14"
-
-[dependencies.zip]
-version = "*"
-features = ["deflate"]
-default-features = false
diff --git a/cli/src/main.rs b/cli/src/main.rs
index 809bb47..850e34e 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -1,13 +1,10 @@
 //! Main module.
 
 use std::env::args;
-use std::fs::File;
-use std::io::BufReader;
 use std::time::Instant;
 
 use rustyline::DefaultEditor;
 use rustyline::Result;
-use zip::ZipArchive;
 
 use fimfareader::prelude::*;
 use fimfareader_search::Searcher;
@@ -37,14 +34,7 @@ fn main() -> Result<()> {
     println!("Finished loading in {} milliseconds.", finish);
     println!("The archive contains {} stories.", count);
 
-    let opener = || {
-        let file = File::open(&argv[1]).unwrap();
-        let reader = BufReader::new(file);
-
-        ZipArchive::new(reader).unwrap()
-    };
-
-    let searcher = Searcher::new(&fetcher, &opener);
+    let searcher = Searcher::new(&fetcher);
 
     while let Ok(line) = editor.readline(">>> ") {
         editor.add_history_entry(&line)?;
diff --git a/search/Cargo.toml b/search/Cargo.toml
index 3c38acb..0838e54 100644
--- a/search/Cargo.toml
+++ b/search/Cargo.toml
@@ -7,15 +7,9 @@ edition = "2021"
 [dependencies.fimfareader]
 path = ".."
 
-[dependencies.rayon]
-version = "*"
-
 [dependencies.tantivy]
 git = "https://github.com/quickwit-oss/tantivy.git"
 
-[dependencies.thread_local]
-version = "*"
-
 [dependencies.zip]
 version = "*"
 features = ["deflate"]
diff --git a/search/src/lib.rs b/search/src/lib.rs
index d60bbb6..2fa882e 100644
--- a/search/src/lib.rs
+++ b/search/src/lib.rs
@@ -1,108 +1,90 @@
 //! Main module.
 
-use std::cell::RefCell;
+use std::fs::create_dir_all;
+use std::io::stdout;
 use std::io::Cursor;
 use std::io::Read;
 use std::io::Seek;
+use std::io::Write;
 use std::path::Path;
-use std::sync::atomic::AtomicUsize;
-use std::sync::atomic::Ordering;
 use std::time::Instant;
 
-use rayon::iter::ParallelIterator;
-use thread_local::ThreadLocal;
-use zip::read::ZipArchive;
-
 use tantivy::collector::TopDocs;
-use tantivy::directory::MmapDirectory;
 use tantivy::query::QueryParser;
 use tantivy::schema;
 use tantivy::schema::Schema;
 use tantivy::schema::Value;
 use tantivy::Index;
-use tantivy::IndexSettings;
 use tantivy::ReloadPolicy;
 use tantivy::TantivyDocument;
+use zip::read::ZipArchive;
 
 use fimfareader::prelude::*;
 
 pub struct Searcher {
-    schema: Schema,
     index: Index,
 }
 
 impl Searcher {
-    pub fn new<T, F>(fetcher: &Fetcher<T>, f: &F) -> Self
+    pub fn new<T>(fetcher: &Fetcher<T>) -> Self
     where
         T: Read + Seek + Send,
-        F: Fn() -> ZipArchive<T> + Sync,
     {
+        Searcher {
+            index: Self::load_index(fetcher),
+        }
+    }
+
+    fn schema() -> Schema {
         let mut builder = Schema::builder();
+
         builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
         builder.add_text_field("content", schema::TEXT);
 
-        let schema = builder.build();
-        let index = Self::load_index(schema.clone(), fetcher, f);
-
-        Searcher { schema, index }
+        builder.build()
     }
 
-    fn load_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F) -> Index
+    fn load_index<T>(fetcher: &Fetcher<T>) -> Index
     where
         T: Read + Seek + Send,
-        F: Fn() -> ZipArchive<T> + Sync,
    {
         let identity = fetcher.identity().unwrap();
-        let directory = Path::new("cache").join(identity);
+        let path = Path::new("cache").join(identity);
 
-        if !directory.exists() {
-            Self::make_index(schema.clone(), fetcher, f);
+        if path.exists() {
+            Index::open_in_dir(path).unwrap()
+        } else {
+            Self::make_index(&path, fetcher)
         }
-
-        let store = MmapDirectory::open(&directory).unwrap();
-        Index::open_or_create(store, schema).unwrap()
     }
 
-    fn make_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F)
+    fn make_index<T>(path: &Path, fetcher: &Fetcher<T>) -> Index
     where
         T: Read + Seek + Send,
-        F: Fn() -> ZipArchive<T> + Sync,
     {
-        let identity = fetcher.identity().unwrap();
-        let directory = Path::new("cache").join(identity);
+        let start = Instant::now();
+        print!("\r\rIndexing archive...\r\r");
+        create_dir_all(path).unwrap();
 
-        std::fs::create_dir_all(&directory).unwrap();
-        let store = MmapDirectory::open(&directory).unwrap();
-        let settings = IndexSettings::default();
-        let index = Index::create(store, schema, settings).unwrap();
+        let schema = Self::schema();
+        let index = Index::create_in_dir(path, schema).unwrap();
+        let mut writer = index.writer(1_073_741_824).unwrap();
+        let mut buffer = String::with_capacity(1_048_576);
 
         let schema = index.schema();
-        let sid = schema.get_field("sid").unwrap();
+        let identifier = schema.get_field("sid").unwrap();
         let content = schema.get_field("content").unwrap();
-        let mut writer = index.writer(536_870_912).unwrap();
+        let story_count = fetcher.iter().count() as f64;
 
-        let counter = AtomicUsize::new(0);
-        let total = fetcher.iter().count();
-        let start = Instant::now();
+        for (i, story) in fetcher.iter().enumerate() {
+            let progress = (i * 100) as f64 / story_count;
+            print!("\r\rIndexing archive... {progress:.2}%\r\r");
 
-        let local = ThreadLocal::new();
+            let cursor = Cursor::new(fetcher.read(story).unwrap());
+            let mut epub = ZipArchive::new(cursor).unwrap();
+            let mut document = TantivyDocument::default();
 
-        fetcher.par_iter().for_each(|story| {
-            let mut doc = TantivyDocument::default();
-
-            let mut arch = local.get_or(|| RefCell::new(f())).borrow_mut();
-            let mut file = arch.by_name(&story.archive.path).unwrap();
-            let mut data = Vec::with_capacity(file.size() as usize);
-            let mut text = String::with_capacity(1_048_576);
-
-            file.read_to_end(&mut data).unwrap();
-            let mut epub = ZipArchive::new(Cursor::new(data)).unwrap();
-
-            let count = counter.fetch_add(1, Ordering::SeqCst);
-            let percentage = (count as f64 / total as f64) * 100f64;
-            print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
-
-            doc.add_i64(sid, story.id);
+            document.add_i64(identifier, story.id);
 
             for i in 0..epub.len() {
                 let mut file = epub.by_index(i).unwrap();
@@ -111,18 +93,24 @@ impl Searcher {
                     continue;
                 }
 
-                file.read_to_string(&mut text).unwrap();
-                doc.add_text(content, &text);
-                text.clear();
+                file.read_to_string(&mut buffer).unwrap();
+                document.add_text(content, &buffer);
+                buffer.clear();
             }
 
-            writer.add_document(doc).unwrap();
-        });
+            writer.add_document(document).unwrap();
+        }
+
+        print!("\r\rCommitting archive index...\r\r");
+        stdout().flush().unwrap();
 
         writer.commit().unwrap();
+        writer.wait_merging_threads().unwrap();
 
         let finish = (Instant::now() - start).as_secs();
-        println!("Index generated in {} seconds.", finish);
+        println!("Index generated in {finish} seconds.");
+
+        index
     }
 
     pub fn search(&self, text: &str) -> Vec<(i64, f32)> {
@@ -133,20 +121,22 @@ impl Searcher {
             .try_into()
             .unwrap();
 
-        let searcher = reader.searcher();
-        let identitfier = self.schema.get_field("sid").unwrap();
-        let content = self.schema.get_field("content").unwrap();
-        let parser = QueryParser::for_index(&self.index, vec![content]);
+        let schema = self.index.schema();
+        let identifier = schema.get_field("sid").unwrap();
+        let content = schema.get_field("content").unwrap();
+        let parser = QueryParser::for_index(&self.index, vec![content]);
+        let query = parser.parse_query(text).unwrap();
+
+        let searcher = reader.searcher();
 
         let limit = TopDocs::with_limit(32);
-        let query = parser.parse_query(&text).unwrap();
         let docs = searcher.search(&query, &limit).unwrap();
 
         docs.into_iter()
             .map(|(score, address)| {
                 let doc: TantivyDocument = searcher.doc(address).unwrap();
 
-                match doc.get_first(identitfier).map(|v| v.as_i64()) {
+                match doc.get_first(identifier).map(|v| v.as_i64()) {
                     Some(Some(value)) => (value, score),
                     _ => panic!("Invalid story key type!"),
                 }
diff --git a/src/archive/fetcher.rs b/src/archive/fetcher.rs
index 4223b65..0adf45e 100644
--- a/src/archive/fetcher.rs
+++ b/src/archive/fetcher.rs
@@ -98,13 +98,13 @@ impl Fetcher {
     }
 
     pub fn identity(&self) -> Result<String> {
-        let mut archive = self.archive.lock().map_err(|e| match e {
-            _ => Error::archive("Could not acquire fetcher lock"),
-        })?;
+        let Ok(mut archive) = self.archive.lock() else {
+            return Err(Error::archive("Could not acquire fetcher lock"));
+        };
 
-        let index = archive.by_name("index.json").map_err(|e| match e {
-            _ => Error::archive("Could not open archive index"),
-        })?;
+        let Ok(index) = archive.by_name("index.json") else {
+            return Err(Error::archive("Could not open archive index"));
+        };
 
         Ok(format!("{}", index.crc32()))
     }
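
After this change the CLI no longer constructs its own ZipArchive opener; it hands the Fetcher straight to Searcher::new, which opens the cached Tantivy index under cache/<identity> or builds it on first use. A minimal usage sketch follows, assuming only the API visible in the diff above; the print_hits helper and its signature are hypothetical, and constructing the Fetcher itself is not shown here.

use std::io::{Read, Seek};

use fimfareader::prelude::*;
use fimfareader_search::Searcher;

// Hypothetical helper: takes any already-opened Fetcher<T>, such as the one the
// CLI builds from its first argument, and prints the best matches for a query.
fn print_hits<T>(fetcher: &Fetcher<T>, query: &str)
where
    T: Read + Seek + Send,
{
    // Opens the cached index for this archive, or indexes the archive first.
    let searcher = Searcher::new(fetcher);

    // search() returns up to 32 (story id, relevance score) pairs.
    for (sid, score) in searcher.search(query) {
        println!("{score:.3} {sid}");
    }
}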