Index using multiple archive instances

This commit is contained in:
Joakim Soderlund 2020-03-11 17:56:39 +01:00
parent 83e5fdb0a8
commit aae361767b
3 changed files with 30 additions and 13 deletions

1
Cargo.lock generated
View file

@ -349,6 +349,7 @@ dependencies = [
"rayon", "rayon",
"rustyline", "rustyline",
"tantivy", "tantivy",
"thread_local",
"zip", "zip",
] ]

View file

@ -20,6 +20,9 @@ version = "*"
[dependencies.tantivy] [dependencies.tantivy]
version = "*" version = "*"
[dependencies.thread_local]
version = "*"
[dependencies.zip] [dependencies.zip]
version = "*" version = "*"
features = ["deflate"] features = ["deflate"]

View file

@ -1,6 +1,8 @@
//! Main module. //! Main module.
use std::cell::RefCell;
use std::env::args; use std::env::args;
use std::fs::File;
use std::io::BufReader; use std::io::BufReader;
use std::io::Cursor; use std::io::Cursor;
use std::io::Read; use std::io::Read;
@ -24,6 +26,8 @@ use tantivy::schema::Value;
use tantivy::Index; use tantivy::Index;
use tantivy::ReloadPolicy; use tantivy::ReloadPolicy;
use thread_local::ThreadLocal;
use fimfareader::prelude::*; use fimfareader::prelude::*;
fn exit(error: Error) -> ! { fn exit(error: Error) -> ! {
@ -32,7 +36,7 @@ fn exit(error: Error) -> ! {
std::process::exit(1) std::process::exit(1)
} }
fn load_index<T>(schema: Schema, fetcher: &Fetcher<T>) -> Index fn load_index<T>(schema: Schema, fetcher: &Fetcher<T>, path: &str) -> Index
where where
T: Read + Seek + Send, T: Read + Seek + Send,
{ {
@ -51,34 +55,43 @@ where
let schema = index.schema(); let schema = index.schema();
let sid = schema.get_field("sid").unwrap(); let sid = schema.get_field("sid").unwrap();
let content = schema.get_field("content").unwrap(); let content = schema.get_field("content").unwrap();
let mut writer = index.writer(250_000_000).unwrap(); let mut writer = index.writer(536_870_912).unwrap();
let counter = AtomicUsize::new(0); let counter = AtomicUsize::new(0);
let total = fetcher.iter().count(); let total = fetcher.iter().count();
let start = Instant::now(); let start = Instant::now();
let local = ThreadLocal::new();
fetcher.par_iter().for_each(|story| { fetcher.par_iter().for_each(|story| {
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_i64(sid, story.id); doc.add_i64(sid, story.id);
let data = fetcher.read(story).unwrap(); let archive = local.get_or(|| {
let count = counter.fetch_add(1, Ordering::SeqCst); let reader = BufReader::new(File::open(&path).unwrap());
let mut arch = ZipArchive::new(Cursor::new(data)).unwrap(); RefCell::new(ZipArchive::new(reader).unwrap())
});
let mut archive = archive.borrow_mut();
let mut file = archive.by_name(&story.archive.path).unwrap();
let mut data = Vec::with_capacity(file.size() as usize);
let mut text = String::with_capacity(1_048_576); let mut text = String::with_capacity(1_048_576);
if (count % 16) == 0 { file.read_to_end(&mut data).unwrap();
let mut arch = ZipArchive::new(Cursor::new(data)).unwrap();
let count = counter.fetch_add(1, Ordering::SeqCst);
let percentage = (count as f64 / total as f64) * 100f64; let percentage = (count as f64 / total as f64) * 100f64;
print!("\rIndexing archive... {:.2}%", percentage); print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
}
for i in 0..arch.len() { for i in 0..arch.len() {
let file = arch.by_index(i).unwrap(); let mut file = arch.by_index(i).unwrap();
if !file.name().ends_with(".html") { if !file.name().ends_with(".html") {
continue; continue;
} }
BufReader::new(file).read_to_string(&mut text).unwrap(); file.read_to_string(&mut text).unwrap();
doc.add_text(content, &text); doc.add_text(content, &text);
text.clear(); text.clear();
} }
@ -89,7 +102,7 @@ where
writer.commit().unwrap(); writer.commit().unwrap();
let finish = (Instant::now() - start).as_secs(); let finish = (Instant::now() - start).as_secs();
println!("\rIndex generated in {} seconds.", finish); println!("Index generated in {} seconds.", finish);
index index
} }
@ -117,7 +130,7 @@ fn main() {
let mut builder = Schema::builder(); let mut builder = Schema::builder();
let sid = builder.add_i64_field("sid", schema::INDEXED | schema::STORED); let sid = builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
let content = builder.add_text_field("content", schema::TEXT); let content = builder.add_text_field("content", schema::TEXT);
let index = load_index(builder.build(), &fetcher); let index = load_index(builder.build(), &fetcher, &argv[1]);
let reader = index let reader = index
.reader_builder() .reader_builder()