Simplify search while improving performance

2025-04-01 08:45:30 +02:00 · 2024-04-06 22:58:54 +02:00 · 2024-04-06 22:58:54 +02:00 · 2832d50b2f
commit 2832d50b2f
parent 01acb7c842
6 changed files with 65 additions and 99 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -301,7 +301,6 @@ dependencies = [
 "fimfareader",
 "fimfareader-search",
 "rustyline",
 "zip",
 ]
 [[package]]
@ -321,9 +320,7 @@ name = "fimfareader-search"
 version = "0.1.0"
 dependencies = [
 "fimfareader",
 "rayon",
 "tantivy",
 "thread_local",
 "zip",
 ]
@ -863,9 +860,9 @@ dependencies = [
 [[package]]
 name = "rustversion"
-version = "1.0.14"
+version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4"
+checksum = "80af6f9131f277a45a3fba6ce8e2258037bb0477a67e610d3c1fe046ab31de47"
 [[package]]
 name = "rustyline"
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@ -13,8 +13,3 @@ path = "../search"
 [dependencies.rustyline]
 default-features = false
 version = "14"
 [dependencies.zip]
 version = "*"
 features = ["deflate"]
 default-features = false
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@ -1,13 +1,10 @@
 //! Main module.
 use std::env::args;
 use std::fs::File;
 use std::io::BufReader;
 use std::time::Instant;
 use rustyline::DefaultEditor;
 use rustyline::Result;
 use zip::ZipArchive;
 use fimfareader::prelude::*;
 use fimfareader_search::Searcher;
@ -37,14 +34,7 @@ fn main() -> Result<()> {
    println!("Finished loading in {} milliseconds.", finish);
    println!("The archive contains {} stories.", count);
-    let opener = || {
+    let searcher = Searcher::new(&fetcher);
        let file = File::open(&argv[1]).unwrap();
        let reader = BufReader::new(file);
        ZipArchive::new(reader).unwrap()
    };
    let searcher = Searcher::new(&fetcher, &opener);
    while let Ok(line) = editor.readline(">>> ") {
        editor.add_history_entry(&line)?;
--- a/search/Cargo.toml
+++ b/search/Cargo.toml
@ -7,15 +7,9 @@ edition = "2021"
 [dependencies.fimfareader]
 path = ".."
 [dependencies.rayon]
 version = "*"
 [dependencies.tantivy]
 git = "https://github.com/quickwit-oss/tantivy.git"
 [dependencies.thread_local]
 version = "*"
 [dependencies.zip]
 version = "*"
 features = ["deflate"]
--- a/search/src/lib.rs
+++ b/search/src/lib.rs
@ -1,108 +1,90 @@
 //! Main module.
-use std::cell::RefCell;
+use std::fs::create_dir_all;
 use std::io::stdout;
 use std::io::Cursor;
 use std::io::Read;
 use std::io::Seek;
 use std::io::Write;
 use std::path::Path;
 use std::sync::atomic::AtomicUsize;
 use std::sync::atomic::Ordering;
 use std::time::Instant;
 use rayon::iter::ParallelIterator;
 use thread_local::ThreadLocal;
 use zip::read::ZipArchive;
 use tantivy::collector::TopDocs;
 use tantivy::directory::MmapDirectory;
 use tantivy::query::QueryParser;
 use tantivy::schema;
 use tantivy::schema::Schema;
 use tantivy::schema::Value;
 use tantivy::Index;
 use tantivy::IndexSettings;
 use tantivy::ReloadPolicy;
 use tantivy::TantivyDocument;
 use zip::read::ZipArchive;
 use fimfareader::prelude::*;
 pub struct Searcher {
    schema: Schema,
    index: Index,
 }
 impl Searcher {
-    pub fn new<T, F>(fetcher: &Fetcher<T>, f: &F) -> Self
+    pub fn new<T>(fetcher: &Fetcher<T>) -> Self
    where
        T: Read + Seek + Send,
        F: Fn() -> ZipArchive<T> + Sync,
    {
        Searcher {
            index: Self::load_index(fetcher),
        }
    }
    fn schema() -> Schema {
        let mut builder = Schema::builder();
        builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
        builder.add_text_field("content", schema::TEXT);
        let schema = builder.build();
-        let index = Self::load_index(schema.clone(), fetcher, f);
+        builder.build()
        Searcher { schema, index }
    }
-    fn load_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F) -> Index
+    fn load_index<T>(fetcher: &Fetcher<T>) -> Index
    where
        T: Read + Seek + Send,
        F: Fn() -> ZipArchive<T> + Sync,
    {
        let identity = fetcher.identity().unwrap();
-        let directory = Path::new("cache").join(identity);
+        let path = Path::new("cache").join(identity);
-        if !directory.exists() {
+        if path.exists() {
-            Self::make_index(schema.clone(), fetcher, f);
+            Index::open_in_dir(path).unwrap()
        } else {
            Self::make_index(&path, fetcher)
        }
        let store = MmapDirectory::open(&directory).unwrap();
        Index::open_or_create(store, schema).unwrap()
    }
-    fn make_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F)
+    fn make_index<T>(path: &Path, fetcher: &Fetcher<T>) -> Index
    where
        T: Read + Seek + Send,
        F: Fn() -> ZipArchive<T> + Sync,
    {
-        let identity = fetcher.identity().unwrap();
+        let start = Instant::now();
-        let directory = Path::new("cache").join(identity);
+        print!("\r\rIndexing archive...\r\r");
        create_dir_all(path).unwrap();
-        std::fs::create_dir_all(&directory).unwrap();
+        let schema = Self::schema();
-        let store = MmapDirectory::open(&directory).unwrap();
+        let index = Index::create_in_dir(path, schema).unwrap();
-        let settings = IndexSettings::default();
+        let mut writer = index.writer(1_073_741_824).unwrap();
-        let index = Index::create(store, schema, settings).unwrap();
+        let mut buffer = String::with_capacity(1_048_576);
        let schema = index.schema();
-        let sid = schema.get_field("sid").unwrap();
+        let identifier = schema.get_field("sid").unwrap();
        let content = schema.get_field("content").unwrap();
-        let mut writer = index.writer(536_870_912).unwrap();
+        let story_count = fetcher.iter().count() as f64;
-        let counter = AtomicUsize::new(0);
+        for (i, story) in fetcher.iter().enumerate() {
-        let total = fetcher.iter().count();
+            let progress = (i * 100) as f64 / story_count;
-        let start = Instant::now();
+            print!("\r\rIndexing archive... {progress:.2}%\r\r");
-        let local = ThreadLocal::new();
+            let cursor = Cursor::new(fetcher.read(story).unwrap());
            let mut epub = ZipArchive::new(cursor).unwrap();
            let mut document = TantivyDocument::default();
-        fetcher.par_iter().for_each(|story| {
+            document.add_i64(identifier, story.id);
            let mut doc = TantivyDocument::default();
            let mut arch = local.get_or(|| RefCell::new(f())).borrow_mut();
            let mut file = arch.by_name(&story.archive.path).unwrap();
            let mut data = Vec::with_capacity(file.size() as usize);
            let mut text = String::with_capacity(1_048_576);
            file.read_to_end(&mut data).unwrap();
            let mut epub = ZipArchive::new(Cursor::new(data)).unwrap();
            let count = counter.fetch_add(1, Ordering::SeqCst);
            let percentage = (count as f64 / total as f64) * 100f64;
            print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
            doc.add_i64(sid, story.id);
            for i in 0..epub.len() {
                let mut file = epub.by_index(i).unwrap();
@ -111,18 +93,24 @@ impl Searcher {
                    continue;
                }
-                file.read_to_string(&mut text).unwrap();
+                file.read_to_string(&mut buffer).unwrap();
-                doc.add_text(content, &text);
+                document.add_text(content, &buffer);
-                text.clear();
+                buffer.clear();
            }
-            writer.add_document(doc).unwrap();
+            writer.add_document(document).unwrap();
-        });
+        }
        print!("\r\rCommitting archive index...\r\r");
        stdout().flush().unwrap();
        writer.commit().unwrap();
        writer.wait_merging_threads().unwrap();
        let finish = (Instant::now() - start).as_secs();
-        println!("Index generated in {} seconds.", finish);
+        println!("Index generated in {finish} seconds.");
        index
    }
    pub fn search(&self, text: &str) -> Vec<(i64, f32)> {
@ -133,20 +121,22 @@ impl Searcher {
            .try_into()
            .unwrap();
-        let searcher = reader.searcher();
+        let schema = self.index.schema();
-        let identitfier = self.schema.get_field("sid").unwrap();
+        let identifier = schema.get_field("sid").unwrap();
-        let content = self.schema.get_field("content").unwrap();
+        let content = schema.get_field("content").unwrap();
        let parser = QueryParser::for_index(&self.index, vec![content]);
        let parser = QueryParser::for_index(&self.index, vec![content]);
        let query = parser.parse_query(text).unwrap();
        let searcher = reader.searcher();
        let limit = TopDocs::with_limit(32);
        let query = parser.parse_query(&text).unwrap();
        let docs = searcher.search(&query, &limit).unwrap();
        docs.into_iter()
            .map(|(score, address)| {
                let doc: TantivyDocument = searcher.doc(address).unwrap();
-                match doc.get_first(identitfier).map(|v| v.as_i64()) {
+                match doc.get_first(identifier).map(|v| v.as_i64()) {
                    Some(Some(value)) => (value, score),
                    _ => panic!("Invalid story key type!"),
                }
--- a/src/archive/fetcher.rs
+++ b/src/archive/fetcher.rs
@ -98,13 +98,13 @@ impl<T: Read + Seek> Fetcher<T> {
    }
    pub fn identity(&self) -> Result<String> {
-        let mut archive = self.archive.lock().map_err(|e| match e {
+        let Ok(mut archive) = self.archive.lock() else {
-            _ => Error::archive("Could not acquire fetcher lock"),
+            return Err(Error::archive("Could not acquire fetcher lock"));
-        })?;
+        };
-        let index = archive.by_name("index.json").map_err(|e| match e {
+        let Ok(index) = archive.by_name("index.json") else {
-            _ => Error::archive("Could not open archive index"),
+            return Err(Error::archive("Could not open archive index"));
-        })?;
+        };
        Ok(format!("{}", index.crc32()))
    }