mirror of
https://github.com/JockeTF/fimfareader.git
synced 2024-11-23 22:07:59 +01:00
Index using multiple archive instances
This commit is contained in:
parent
83e5fdb0a8
commit
aae361767b
3 changed files with 30 additions and 13 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -349,6 +349,7 @@ dependencies = [
|
||||||
"rayon",
|
"rayon",
|
||||||
"rustyline",
|
"rustyline",
|
||||||
"tantivy",
|
"tantivy",
|
||||||
|
"thread_local",
|
||||||
"zip",
|
"zip",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,9 @@ version = "*"
|
||||||
[dependencies.tantivy]
|
[dependencies.tantivy]
|
||||||
version = "*"
|
version = "*"
|
||||||
|
|
||||||
|
[dependencies.thread_local]
|
||||||
|
version = "*"
|
||||||
|
|
||||||
[dependencies.zip]
|
[dependencies.zip]
|
||||||
version = "*"
|
version = "*"
|
||||||
features = ["deflate"]
|
features = ["deflate"]
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
//! Main module.
|
//! Main module.
|
||||||
|
|
||||||
|
use std::cell::RefCell;
|
||||||
use std::env::args;
|
use std::env::args;
|
||||||
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
|
@ -24,6 +26,8 @@ use tantivy::schema::Value;
|
||||||
use tantivy::Index;
|
use tantivy::Index;
|
||||||
use tantivy::ReloadPolicy;
|
use tantivy::ReloadPolicy;
|
||||||
|
|
||||||
|
use thread_local::ThreadLocal;
|
||||||
|
|
||||||
use fimfareader::prelude::*;
|
use fimfareader::prelude::*;
|
||||||
|
|
||||||
fn exit(error: Error) -> ! {
|
fn exit(error: Error) -> ! {
|
||||||
|
@ -32,7 +36,7 @@ fn exit(error: Error) -> ! {
|
||||||
std::process::exit(1)
|
std::process::exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn load_index<T>(schema: Schema, fetcher: &Fetcher<T>) -> Index
|
fn load_index<T>(schema: Schema, fetcher: &Fetcher<T>, path: &str) -> Index
|
||||||
where
|
where
|
||||||
T: Read + Seek + Send,
|
T: Read + Seek + Send,
|
||||||
{
|
{
|
||||||
|
@ -51,34 +55,43 @@ where
|
||||||
let schema = index.schema();
|
let schema = index.schema();
|
||||||
let sid = schema.get_field("sid").unwrap();
|
let sid = schema.get_field("sid").unwrap();
|
||||||
let content = schema.get_field("content").unwrap();
|
let content = schema.get_field("content").unwrap();
|
||||||
let mut writer = index.writer(250_000_000).unwrap();
|
let mut writer = index.writer(536_870_912).unwrap();
|
||||||
|
|
||||||
let counter = AtomicUsize::new(0);
|
let counter = AtomicUsize::new(0);
|
||||||
let total = fetcher.iter().count();
|
let total = fetcher.iter().count();
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
|
let local = ThreadLocal::new();
|
||||||
|
|
||||||
fetcher.par_iter().for_each(|story| {
|
fetcher.par_iter().for_each(|story| {
|
||||||
let mut doc = Document::default();
|
let mut doc = Document::default();
|
||||||
doc.add_i64(sid, story.id);
|
doc.add_i64(sid, story.id);
|
||||||
|
|
||||||
let data = fetcher.read(story).unwrap();
|
let archive = local.get_or(|| {
|
||||||
let count = counter.fetch_add(1, Ordering::SeqCst);
|
let reader = BufReader::new(File::open(&path).unwrap());
|
||||||
let mut arch = ZipArchive::new(Cursor::new(data)).unwrap();
|
RefCell::new(ZipArchive::new(reader).unwrap())
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut archive = archive.borrow_mut();
|
||||||
|
let mut file = archive.by_name(&story.archive.path).unwrap();
|
||||||
|
let mut data = Vec::with_capacity(file.size() as usize);
|
||||||
let mut text = String::with_capacity(1_048_576);
|
let mut text = String::with_capacity(1_048_576);
|
||||||
|
|
||||||
if (count % 16) == 0 {
|
file.read_to_end(&mut data).unwrap();
|
||||||
|
let mut arch = ZipArchive::new(Cursor::new(data)).unwrap();
|
||||||
|
let count = counter.fetch_add(1, Ordering::SeqCst);
|
||||||
|
|
||||||
let percentage = (count as f64 / total as f64) * 100f64;
|
let percentage = (count as f64 / total as f64) * 100f64;
|
||||||
print!("\rIndexing archive... {:.2}%", percentage);
|
print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
|
||||||
}
|
|
||||||
|
|
||||||
for i in 0..arch.len() {
|
for i in 0..arch.len() {
|
||||||
let file = arch.by_index(i).unwrap();
|
let mut file = arch.by_index(i).unwrap();
|
||||||
|
|
||||||
if !file.name().ends_with(".html") {
|
if !file.name().ends_with(".html") {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
BufReader::new(file).read_to_string(&mut text).unwrap();
|
file.read_to_string(&mut text).unwrap();
|
||||||
doc.add_text(content, &text);
|
doc.add_text(content, &text);
|
||||||
text.clear();
|
text.clear();
|
||||||
}
|
}
|
||||||
|
@ -89,7 +102,7 @@ where
|
||||||
writer.commit().unwrap();
|
writer.commit().unwrap();
|
||||||
|
|
||||||
let finish = (Instant::now() - start).as_secs();
|
let finish = (Instant::now() - start).as_secs();
|
||||||
println!("\rIndex generated in {} seconds.", finish);
|
println!("Index generated in {} seconds.", finish);
|
||||||
|
|
||||||
index
|
index
|
||||||
}
|
}
|
||||||
|
@ -117,7 +130,7 @@ fn main() {
|
||||||
let mut builder = Schema::builder();
|
let mut builder = Schema::builder();
|
||||||
let sid = builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
|
let sid = builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
|
||||||
let content = builder.add_text_field("content", schema::TEXT);
|
let content = builder.add_text_field("content", schema::TEXT);
|
||||||
let index = load_index(builder.build(), &fetcher);
|
let index = load_index(builder.build(), &fetcher, &argv[1]);
|
||||||
|
|
||||||
let reader = index
|
let reader = index
|
||||||
.reader_builder()
|
.reader_builder()
|
||||||
|
|
Loading…
Reference in a new issue