From aae361767b2b1ba87416271fde2e77d6b034c498 Mon Sep 17 00:00:00 2001 From: Joakim Soderlund Date: Wed, 11 Mar 2020 17:56:39 +0100 Subject: [PATCH] Index using multiple archive instances --- Cargo.lock | 1 + cli/Cargo.toml | 3 +++ cli/src/main.rs | 39 ++++++++++++++++++++++++++------------- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5b4222..c376374 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -349,6 +349,7 @@ dependencies = [ "rayon", "rustyline", "tantivy", + "thread_local", "zip", ] diff --git a/cli/Cargo.toml b/cli/Cargo.toml index bb725a9..421321e 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -20,6 +20,9 @@ version = "*" [dependencies.tantivy] version = "*" +[dependencies.thread_local] +version = "*" + [dependencies.zip] version = "*" features = ["deflate"] diff --git a/cli/src/main.rs b/cli/src/main.rs index 6d49c2b..80d5809 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,6 +1,8 @@ //! Main module. +use std::cell::RefCell; use std::env::args; +use std::fs::File; use std::io::BufReader; use std::io::Cursor; use std::io::Read; @@ -24,6 +26,8 @@ use tantivy::schema::Value; use tantivy::Index; use tantivy::ReloadPolicy; +use thread_local::ThreadLocal; + use fimfareader::prelude::*; fn exit(error: Error) -> ! { @@ -32,7 +36,7 @@ fn exit(error: Error) -> ! { std::process::exit(1) } -fn load_index(schema: Schema, fetcher: &Fetcher) -> Index +fn load_index(schema: Schema, fetcher: &Fetcher, path: &str) -> Index where T: Read + Seek + Send, { @@ -51,34 +55,43 @@ where let schema = index.schema(); let sid = schema.get_field("sid").unwrap(); let content = schema.get_field("content").unwrap(); - let mut writer = index.writer(250_000_000).unwrap(); + let mut writer = index.writer(536_870_912).unwrap(); let counter = AtomicUsize::new(0); let total = fetcher.iter().count(); let start = Instant::now(); + let local = ThreadLocal::new(); + fetcher.par_iter().for_each(|story| { let mut doc = Document::default(); doc.add_i64(sid, story.id); - let data = fetcher.read(story).unwrap(); - let count = counter.fetch_add(1, Ordering::SeqCst); - let mut arch = ZipArchive::new(Cursor::new(data)).unwrap(); + let archive = local.get_or(|| { + let reader = BufReader::new(File::open(&path).unwrap()); + RefCell::new(ZipArchive::new(reader).unwrap()) + }); + + let mut archive = archive.borrow_mut(); + let mut file = archive.by_name(&story.archive.path).unwrap(); + let mut data = Vec::with_capacity(file.size() as usize); let mut text = String::with_capacity(1_048_576); - if (count % 16) == 0 { - let percentage = (count as f64 / total as f64) * 100f64; - print!("\rIndexing archive... {:.2}%", percentage); - } + file.read_to_end(&mut data).unwrap(); + let mut arch = ZipArchive::new(Cursor::new(data)).unwrap(); + let count = counter.fetch_add(1, Ordering::SeqCst); + + let percentage = (count as f64 / total as f64) * 100f64; + print!("\r\rIndexing archive... {:.2}%\r\r", percentage); for i in 0..arch.len() { - let file = arch.by_index(i).unwrap(); + let mut file = arch.by_index(i).unwrap(); if !file.name().ends_with(".html") { continue; } - BufReader::new(file).read_to_string(&mut text).unwrap(); + file.read_to_string(&mut text).unwrap(); doc.add_text(content, &text); text.clear(); } @@ -89,7 +102,7 @@ where writer.commit().unwrap(); let finish = (Instant::now() - start).as_secs(); - println!("\rIndex generated in {} seconds.", finish); + println!("Index generated in {} seconds.", finish); index } @@ -117,7 +130,7 @@ fn main() { let mut builder = Schema::builder(); let sid = builder.add_i64_field("sid", schema::INDEXED | schema::STORED); let content = builder.add_text_field("content", schema::TEXT); - let index = load_index(builder.build(), &fetcher); + let index = load_index(builder.build(), &fetcher, &argv[1]); let reader = index .reader_builder()