diff --git a/Cargo.lock b/Cargo.lock index 488a880..e7c436c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -44,15 +44,6 @@ version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b964d184e89d9b6b67dd2715bc8e74cf3107fb2b529990c90cf517326150bf4" -[[package]] -name = "arbitrary" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" -dependencies = [ - "derive_arbitrary", -] - [[package]] name = "arc-swap" version = "1.7.1" @@ -299,17 +290,6 @@ dependencies = [ "serde", ] -[[package]] -name = "derive_arbitrary" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "derive_more" version = "1.0.0" @@ -330,17 +310,6 @@ dependencies = [ "syn", ] -[[package]] -name = "displaydoc" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "downcast-rs" version = "2.0.1" @@ -396,7 +365,7 @@ dependencies = [ "rayon", "serde", "serde_json", - "zip 0.6.6", + "zip", ] [[package]] @@ -406,7 +375,6 @@ dependencies = [ "fimfareader", "fimfareader-search", "rustyline", - "zip 2.2.2", ] [[package]] @@ -427,10 +395,8 @@ name = "fimfareader-search" version = "0.1.0" dependencies = [ "fimfareader", - "rayon", "tantivy", - "thread_local", - "zip 2.2.2", + "zip", ] [[package]] @@ -586,16 +552,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" -[[package]] -name = "indexmap" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" -dependencies = [ - "equivalent", - "hashbrown", -] - [[package]] name = "itertools" version = "0.14.0" @@ -660,12 +616,6 @@ version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" -[[package]] -name = "lockfree-object-pool" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e" - [[package]] name = "log" version = "0.4.25" @@ -1038,12 +988,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" -[[package]] -name = "simd-adler32" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" - [[package]] name = "sketches-ddsketch" version = "0.3.0" @@ -1261,16 +1205,6 @@ dependencies = [ "syn", ] -[[package]] -name = "thread_local" -version = "1.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" -dependencies = [ - "cfg-if", - "once_cell", -] - [[package]] name = "time" version = "0.3.37" @@ -1570,37 +1504,6 @@ dependencies = [ "flate2", ] -[[package]] -name = "zip" -version = "2.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae9c1ea7b3a5e1f4b922ff856a129881167511563dc219869afe3787fc0c1a45" -dependencies = [ - "arbitrary", - "crc32fast", - "crossbeam-utils", - "displaydoc", - "flate2", - "indexmap", - "memchr", - "thiserror", - "zopfli", -] - -[[package]] -name = "zopfli" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5019f391bac5cf252e93bbcc53d039ffd62c7bfb7c150414d61369afe57e946" -dependencies = [ - "bumpalo", - "crc32fast", - "lockfree-object-pool", - "log", - "once_cell", - "simd-adler32", -] - [[package]] name = "zstd" version = "0.13.2" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index beebfcf..1ea0c92 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -13,8 +13,3 @@ path = "../search" [dependencies.rustyline] default-features = false version = "14" - -[dependencies.zip] -version = "*" -features = ["deflate"] -default-features = false diff --git a/cli/src/main.rs b/cli/src/main.rs index 41cc611..2f9c35b 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -2,13 +2,10 @@ use std::env::args; use std::error::Error; -use std::fs::File; -use std::io::BufReader; use std::result::Result; use std::time::Instant; use rustyline::DefaultEditor; -use zip::ZipArchive; use fimfareader::archive::Fetcher; use fimfareader_search::Searcher; @@ -32,14 +29,7 @@ fn main() -> Result<(), Box> { println!("Finished loading in {finish:?}."); println!("The archive contains {count} stories."); - let opener = || { - let file = File::open(&argv[1]).unwrap(); - let reader = BufReader::new(file); - - ZipArchive::new(reader).unwrap() - }; - - let searcher = Searcher::new(&fetcher, &opener); + let searcher = Searcher::new(&fetcher); while let Ok(line) = editor.readline(">>> ") { editor.add_history_entry(&line)?; diff --git a/search/Cargo.toml b/search/Cargo.toml index 3c38acb..2011e63 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -7,16 +7,10 @@ edition = "2021" [dependencies.fimfareader] path = ".." -[dependencies.rayon] -version = "*" - [dependencies.tantivy] git = "https://github.com/quickwit-oss/tantivy.git" -[dependencies.thread_local] -version = "*" - [dependencies.zip] -version = "*" +version = "=0.6.6" features = ["deflate"] default-features = false diff --git a/search/src/lib.rs b/search/src/lib.rs index e17b49f..5009f2a 100644 --- a/search/src/lib.rs +++ b/search/src/lib.rs @@ -1,108 +1,90 @@ //! Main module. -use std::cell::RefCell; +use std::fs::create_dir_all; +use std::io::stdout; use std::io::Cursor; use std::io::Read; use std::io::Seek; +use std::io::Write; use std::path::Path; -use std::sync::atomic::AtomicUsize; -use std::sync::atomic::Ordering; use std::time::Instant; -use rayon::iter::ParallelIterator; -use thread_local::ThreadLocal; -use zip::read::ZipArchive; - use tantivy::collector::TopDocs; -use tantivy::directory::MmapDirectory; use tantivy::query::QueryParser; use tantivy::schema; use tantivy::schema::Schema; use tantivy::schema::Value; use tantivy::Index; -use tantivy::IndexSettings; use tantivy::ReloadPolicy; use tantivy::TantivyDocument; +use zip::read::ZipArchive; use fimfareader::prelude::*; pub struct Searcher { - schema: Schema, index: Index, } impl Searcher { - pub fn new(fetcher: &Fetcher, f: &F) -> Self + pub fn new(fetcher: &Fetcher) -> Self where T: Read + Seek + Send, - F: Fn() -> ZipArchive + Sync, { + Searcher { + index: Self::load_index(fetcher), + } + } + + fn schema() -> Schema { let mut builder = Schema::builder(); + builder.add_i64_field("sid", schema::INDEXED | schema::STORED); builder.add_text_field("content", schema::TEXT); - let schema = builder.build(); - let index = Self::load_index(schema.clone(), fetcher, f); - - Searcher { schema, index } + builder.build() } - fn load_index(schema: Schema, fetcher: &Fetcher, f: &F) -> Index + fn load_index(fetcher: &Fetcher) -> Index where T: Read + Seek + Send, - F: Fn() -> ZipArchive + Sync, { let identity = fetcher.identity().unwrap(); - let directory = Path::new("cache").join(identity); + let path = Path::new("cache").join(identity); - if !directory.exists() { - Self::make_index(schema.clone(), fetcher, f); + if path.exists() { + Index::open_in_dir(path).unwrap() + } else { + Self::make_index(&path, fetcher) } - - let store = MmapDirectory::open(&directory).unwrap(); - Index::open_or_create(store, schema).unwrap() } - fn make_index(schema: Schema, fetcher: &Fetcher, f: &F) + fn make_index(path: &Path, fetcher: &Fetcher) -> Index where T: Read + Seek + Send, - F: Fn() -> ZipArchive + Sync, { - let identity = fetcher.identity().unwrap(); - let directory = Path::new("cache").join(identity); + let start = Instant::now(); + print!("\r\rIndexing archive...\r\r"); + create_dir_all(path).unwrap(); - std::fs::create_dir_all(&directory).unwrap(); - let store = MmapDirectory::open(&directory).unwrap(); - let settings = IndexSettings::default(); - let index = Index::create(store, schema, settings).unwrap(); + let schema = Self::schema(); + let index = Index::create_in_dir(path, schema).unwrap(); + let mut writer = index.writer(1_073_741_824).unwrap(); + let mut buffer = String::with_capacity(1_048_576); let schema = index.schema(); - let sid = schema.get_field("sid").unwrap(); + let identifier = schema.get_field("sid").unwrap(); let content = schema.get_field("content").unwrap(); - let mut writer = index.writer(536_870_912).unwrap(); + let story_count = fetcher.iter().count() as f64; - let counter = AtomicUsize::new(0); - let total = fetcher.iter().count(); - let start = Instant::now(); + for (i, story) in fetcher.iter().enumerate() { + let progress = (i * 100) as f64 / story_count; + print!("\r\rIndexing archive... {progress:.2}%\r\r"); - let local = ThreadLocal::new(); + let cursor = Cursor::new(fetcher.read(story).unwrap()); + let mut epub = ZipArchive::new(cursor).unwrap(); + let mut document = TantivyDocument::default(); - fetcher.par_iter().for_each(|story| { - let mut doc = TantivyDocument::default(); - - let mut arch = local.get_or(|| RefCell::new(f())).borrow_mut(); - let mut file = arch.by_name(&story.archive.path).unwrap(); - let mut data = Vec::with_capacity(file.size() as usize); - let mut text = String::with_capacity(1_048_576); - - file.read_to_end(&mut data).unwrap(); - let mut epub = ZipArchive::new(Cursor::new(data)).unwrap(); - - let count = counter.fetch_add(1, Ordering::SeqCst); - let percentage = (count as f64 / total as f64) * 100f64; - print!("\r\rIndexing archive... {:.2}%\r\r", percentage); - - doc.add_i64(sid, story.id.into()); + document.add_i64(identifier, story.id.into()); for i in 0..epub.len() { let mut file = epub.by_index(i).unwrap(); @@ -111,18 +93,24 @@ impl Searcher { continue; } - file.read_to_string(&mut text).unwrap(); - doc.add_text(content, &text); - text.clear(); + file.read_to_string(&mut buffer).unwrap(); + document.add_text(content, &buffer); + buffer.clear(); } - writer.add_document(doc).unwrap(); - }); + writer.add_document(document).unwrap(); + } + + print!("\r\rCommitting archive index...\r\r"); + stdout().flush().unwrap(); writer.commit().unwrap(); + writer.wait_merging_threads().unwrap(); let finish = (Instant::now() - start).as_secs(); - println!("Index generated in {} seconds.", finish); + println!("Index generated in {finish} seconds."); + + index } pub fn search(&self, text: &str) -> Vec<(i64, f32)> { @@ -133,20 +121,22 @@ impl Searcher { .try_into() .unwrap(); - let searcher = reader.searcher(); - let identitfier = self.schema.get_field("sid").unwrap(); - let content = self.schema.get_field("content").unwrap(); - let parser = QueryParser::for_index(&self.index, vec![content]); + let schema = self.index.schema(); + let identifier = schema.get_field("sid").unwrap(); + let content = schema.get_field("content").unwrap(); - let limit = TopDocs::with_limit(32); + let parser = QueryParser::for_index(&self.index, vec![content]); let query = parser.parse_query(text).unwrap(); + + let searcher = reader.searcher(); + let limit = TopDocs::with_limit(32); let docs = searcher.search(&query, &limit).unwrap(); docs.into_iter() .map(|(score, address)| { let doc: TantivyDocument = searcher.doc(address).unwrap(); - match doc.get_first(identitfier).map(|v| v.as_i64()) { + match doc.get_first(identifier).map(|v| v.as_i64()) { Some(Some(value)) => (value, score), _ => panic!("Invalid story key type!"), }