Simplify search while improving performance

This commit is contained in:
Joakim Soderlund 2024-04-06 22:58:54 +02:00
parent 01acb7c842
commit 2832d50b2f
6 changed files with 65 additions and 99 deletions

7
Cargo.lock generated
View file

@ -301,7 +301,6 @@ dependencies = [
"fimfareader", "fimfareader",
"fimfareader-search", "fimfareader-search",
"rustyline", "rustyline",
"zip",
] ]
[[package]] [[package]]
@ -321,9 +320,7 @@ name = "fimfareader-search"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"fimfareader", "fimfareader",
"rayon",
"tantivy", "tantivy",
"thread_local",
"zip", "zip",
] ]
@ -863,9 +860,9 @@ dependencies = [
[[package]] [[package]]
name = "rustversion" name = "rustversion"
version = "1.0.14" version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" checksum = "80af6f9131f277a45a3fba6ce8e2258037bb0477a67e610d3c1fe046ab31de47"
[[package]] [[package]]
name = "rustyline" name = "rustyline"

View file

@ -13,8 +13,3 @@ path = "../search"
[dependencies.rustyline] [dependencies.rustyline]
default-features = false default-features = false
version = "14" version = "14"
[dependencies.zip]
version = "*"
features = ["deflate"]
default-features = false

View file

@ -1,13 +1,10 @@
//! Main module. //! Main module.
use std::env::args; use std::env::args;
use std::fs::File;
use std::io::BufReader;
use std::time::Instant; use std::time::Instant;
use rustyline::DefaultEditor; use rustyline::DefaultEditor;
use rustyline::Result; use rustyline::Result;
use zip::ZipArchive;
use fimfareader::prelude::*; use fimfareader::prelude::*;
use fimfareader_search::Searcher; use fimfareader_search::Searcher;
@ -37,14 +34,7 @@ fn main() -> Result<()> {
println!("Finished loading in {} milliseconds.", finish); println!("Finished loading in {} milliseconds.", finish);
println!("The archive contains {} stories.", count); println!("The archive contains {} stories.", count);
let opener = || { let searcher = Searcher::new(&fetcher);
let file = File::open(&argv[1]).unwrap();
let reader = BufReader::new(file);
ZipArchive::new(reader).unwrap()
};
let searcher = Searcher::new(&fetcher, &opener);
while let Ok(line) = editor.readline(">>> ") { while let Ok(line) = editor.readline(">>> ") {
editor.add_history_entry(&line)?; editor.add_history_entry(&line)?;

View file

@ -7,15 +7,9 @@ edition = "2021"
[dependencies.fimfareader] [dependencies.fimfareader]
path = ".." path = ".."
[dependencies.rayon]
version = "*"
[dependencies.tantivy] [dependencies.tantivy]
git = "https://github.com/quickwit-oss/tantivy.git" git = "https://github.com/quickwit-oss/tantivy.git"
[dependencies.thread_local]
version = "*"
[dependencies.zip] [dependencies.zip]
version = "*" version = "*"
features = ["deflate"] features = ["deflate"]

View file

@ -1,108 +1,90 @@
//! Main module. //! Main module.
use std::cell::RefCell; use std::fs::create_dir_all;
use std::io::stdout;
use std::io::Cursor; use std::io::Cursor;
use std::io::Read; use std::io::Read;
use std::io::Seek; use std::io::Seek;
use std::io::Write;
use std::path::Path; use std::path::Path;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::time::Instant; use std::time::Instant;
use rayon::iter::ParallelIterator;
use thread_local::ThreadLocal;
use zip::read::ZipArchive;
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::directory::MmapDirectory;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema; use tantivy::schema;
use tantivy::schema::Schema; use tantivy::schema::Schema;
use tantivy::schema::Value; use tantivy::schema::Value;
use tantivy::Index; use tantivy::Index;
use tantivy::IndexSettings;
use tantivy::ReloadPolicy; use tantivy::ReloadPolicy;
use tantivy::TantivyDocument; use tantivy::TantivyDocument;
use zip::read::ZipArchive;
use fimfareader::prelude::*; use fimfareader::prelude::*;
pub struct Searcher { pub struct Searcher {
schema: Schema,
index: Index, index: Index,
} }
impl Searcher { impl Searcher {
pub fn new<T, F>(fetcher: &Fetcher<T>, f: &F) -> Self pub fn new<T>(fetcher: &Fetcher<T>) -> Self
where where
T: Read + Seek + Send, T: Read + Seek + Send,
F: Fn() -> ZipArchive<T> + Sync,
{ {
Searcher {
index: Self::load_index(fetcher),
}
}
fn schema() -> Schema {
let mut builder = Schema::builder(); let mut builder = Schema::builder();
builder.add_i64_field("sid", schema::INDEXED | schema::STORED); builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
builder.add_text_field("content", schema::TEXT); builder.add_text_field("content", schema::TEXT);
let schema = builder.build();
let index = Self::load_index(schema.clone(), fetcher, f); builder.build()
Searcher { schema, index }
} }
fn load_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F) -> Index fn load_index<T>(fetcher: &Fetcher<T>) -> Index
where where
T: Read + Seek + Send, T: Read + Seek + Send,
F: Fn() -> ZipArchive<T> + Sync,
{ {
let identity = fetcher.identity().unwrap(); let identity = fetcher.identity().unwrap();
let directory = Path::new("cache").join(identity); let path = Path::new("cache").join(identity);
if !directory.exists() { if path.exists() {
Self::make_index(schema.clone(), fetcher, f); Index::open_in_dir(path).unwrap()
} else {
Self::make_index(&path, fetcher)
} }
let store = MmapDirectory::open(&directory).unwrap();
Index::open_or_create(store, schema).unwrap()
} }
fn make_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F) fn make_index<T>(path: &Path, fetcher: &Fetcher<T>) -> Index
where where
T: Read + Seek + Send, T: Read + Seek + Send,
F: Fn() -> ZipArchive<T> + Sync,
{ {
let identity = fetcher.identity().unwrap(); let start = Instant::now();
let directory = Path::new("cache").join(identity); print!("\r\rIndexing archive...\r\r");
create_dir_all(path).unwrap();
std::fs::create_dir_all(&directory).unwrap(); let schema = Self::schema();
let store = MmapDirectory::open(&directory).unwrap(); let index = Index::create_in_dir(path, schema).unwrap();
let settings = IndexSettings::default(); let mut writer = index.writer(1_073_741_824).unwrap();
let index = Index::create(store, schema, settings).unwrap(); let mut buffer = String::with_capacity(1_048_576);
let schema = index.schema(); let schema = index.schema();
let sid = schema.get_field("sid").unwrap(); let identifier = schema.get_field("sid").unwrap();
let content = schema.get_field("content").unwrap(); let content = schema.get_field("content").unwrap();
let mut writer = index.writer(536_870_912).unwrap(); let story_count = fetcher.iter().count() as f64;
let counter = AtomicUsize::new(0); for (i, story) in fetcher.iter().enumerate() {
let total = fetcher.iter().count(); let progress = (i * 100) as f64 / story_count;
let start = Instant::now(); print!("\r\rIndexing archive... {progress:.2}%\r\r");
let local = ThreadLocal::new(); let cursor = Cursor::new(fetcher.read(story).unwrap());
let mut epub = ZipArchive::new(cursor).unwrap();
let mut document = TantivyDocument::default();
fetcher.par_iter().for_each(|story| { document.add_i64(identifier, story.id);
let mut doc = TantivyDocument::default();
let mut arch = local.get_or(|| RefCell::new(f())).borrow_mut();
let mut file = arch.by_name(&story.archive.path).unwrap();
let mut data = Vec::with_capacity(file.size() as usize);
let mut text = String::with_capacity(1_048_576);
file.read_to_end(&mut data).unwrap();
let mut epub = ZipArchive::new(Cursor::new(data)).unwrap();
let count = counter.fetch_add(1, Ordering::SeqCst);
let percentage = (count as f64 / total as f64) * 100f64;
print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
doc.add_i64(sid, story.id);
for i in 0..epub.len() { for i in 0..epub.len() {
let mut file = epub.by_index(i).unwrap(); let mut file = epub.by_index(i).unwrap();
@ -111,18 +93,24 @@ impl Searcher {
continue; continue;
} }
file.read_to_string(&mut text).unwrap(); file.read_to_string(&mut buffer).unwrap();
doc.add_text(content, &text); document.add_text(content, &buffer);
text.clear(); buffer.clear();
} }
writer.add_document(doc).unwrap(); writer.add_document(document).unwrap();
}); }
print!("\r\rCommitting archive index...\r\r");
stdout().flush().unwrap();
writer.commit().unwrap(); writer.commit().unwrap();
writer.wait_merging_threads().unwrap();
let finish = (Instant::now() - start).as_secs(); let finish = (Instant::now() - start).as_secs();
println!("Index generated in {} seconds.", finish); println!("Index generated in {finish} seconds.");
index
} }
pub fn search(&self, text: &str) -> Vec<(i64, f32)> { pub fn search(&self, text: &str) -> Vec<(i64, f32)> {
@ -133,20 +121,22 @@ impl Searcher {
.try_into() .try_into()
.unwrap(); .unwrap();
let searcher = reader.searcher(); let schema = self.index.schema();
let identitfier = self.schema.get_field("sid").unwrap(); let identifier = schema.get_field("sid").unwrap();
let content = self.schema.get_field("content").unwrap(); let content = schema.get_field("content").unwrap();
let parser = QueryParser::for_index(&self.index, vec![content]);
let parser = QueryParser::for_index(&self.index, vec![content]);
let query = parser.parse_query(text).unwrap();
let searcher = reader.searcher();
let limit = TopDocs::with_limit(32); let limit = TopDocs::with_limit(32);
let query = parser.parse_query(&text).unwrap();
let docs = searcher.search(&query, &limit).unwrap(); let docs = searcher.search(&query, &limit).unwrap();
docs.into_iter() docs.into_iter()
.map(|(score, address)| { .map(|(score, address)| {
let doc: TantivyDocument = searcher.doc(address).unwrap(); let doc: TantivyDocument = searcher.doc(address).unwrap();
match doc.get_first(identitfier).map(|v| v.as_i64()) { match doc.get_first(identifier).map(|v| v.as_i64()) {
Some(Some(value)) => (value, score), Some(Some(value)) => (value, score),
_ => panic!("Invalid story key type!"), _ => panic!("Invalid story key type!"),
} }

View file

@ -98,13 +98,13 @@ impl<T: Read + Seek> Fetcher<T> {
} }
pub fn identity(&self) -> Result<String> { pub fn identity(&self) -> Result<String> {
let mut archive = self.archive.lock().map_err(|e| match e { let Ok(mut archive) = self.archive.lock() else {
_ => Error::archive("Could not acquire fetcher lock"), return Err(Error::archive("Could not acquire fetcher lock"));
})?; };
let index = archive.by_name("index.json").map_err(|e| match e { let Ok(index) = archive.by_name("index.json") else {
_ => Error::archive("Could not open archive index"), return Err(Error::archive("Could not open archive index"));
})?; };
Ok(format!("{}", index.crc32())) Ok(format!("{}", index.crc32()))
} }