mirror of
https://github.com/JockeTF/fimfareader.git
synced 2025-02-22 21:03:11 +01:00
Simplify search while improving performance
This commit is contained in:
parent
d440aeab9e
commit
4ef6bc533d
5 changed files with 60 additions and 188 deletions
101
Cargo.lock
generated
101
Cargo.lock
generated
|
@ -44,15 +44,6 @@ version = "1.0.96"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b964d184e89d9b6b67dd2715bc8e74cf3107fb2b529990c90cf517326150bf4"
|
||||
|
||||
[[package]]
|
||||
name = "arbitrary"
|
||||
version = "1.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223"
|
||||
dependencies = [
|
||||
"derive_arbitrary",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arc-swap"
|
||||
version = "1.7.1"
|
||||
|
@ -299,17 +290,6 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_arbitrary"
|
||||
version = "1.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "1.0.0"
|
||||
|
@ -330,17 +310,6 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "displaydoc"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "downcast-rs"
|
||||
version = "2.0.1"
|
||||
|
@ -396,7 +365,7 @@ dependencies = [
|
|||
"rayon",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"zip 0.6.6",
|
||||
"zip",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -406,7 +375,6 @@ dependencies = [
|
|||
"fimfareader",
|
||||
"fimfareader-search",
|
||||
"rustyline",
|
||||
"zip 2.2.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -427,10 +395,8 @@ name = "fimfareader-search"
|
|||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"fimfareader",
|
||||
"rayon",
|
||||
"tantivy",
|
||||
"thread_local",
|
||||
"zip 2.2.2",
|
||||
"zip",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -586,16 +552,6 @@ version = "1.0.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.14.0"
|
||||
|
@ -660,12 +616,6 @@ version = "0.4.15"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
|
||||
|
||||
[[package]]
|
||||
name = "lockfree-object-pool"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.25"
|
||||
|
@ -1038,12 +988,6 @@ version = "1.3.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||
|
||||
[[package]]
|
||||
name = "simd-adler32"
|
||||
version = "0.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
|
||||
|
||||
[[package]]
|
||||
name = "sketches-ddsketch"
|
||||
version = "0.3.0"
|
||||
|
@ -1261,16 +1205,6 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.37"
|
||||
|
@ -1570,37 +1504,6 @@ dependencies = [
|
|||
"flate2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "2.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae9c1ea7b3a5e1f4b922ff856a129881167511563dc219869afe3787fc0c1a45"
|
||||
dependencies = [
|
||||
"arbitrary",
|
||||
"crc32fast",
|
||||
"crossbeam-utils",
|
||||
"displaydoc",
|
||||
"flate2",
|
||||
"indexmap",
|
||||
"memchr",
|
||||
"thiserror",
|
||||
"zopfli",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zopfli"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5019f391bac5cf252e93bbcc53d039ffd62c7bfb7c150414d61369afe57e946"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"crc32fast",
|
||||
"lockfree-object-pool",
|
||||
"log",
|
||||
"once_cell",
|
||||
"simd-adler32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.13.2"
|
||||
|
|
|
@ -13,8 +13,3 @@ path = "../search"
|
|||
[dependencies.rustyline]
|
||||
default-features = false
|
||||
version = "14"
|
||||
|
||||
[dependencies.zip]
|
||||
version = "*"
|
||||
features = ["deflate"]
|
||||
default-features = false
|
||||
|
|
|
@ -2,13 +2,10 @@
|
|||
|
||||
use std::env::args;
|
||||
use std::error::Error;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::result::Result;
|
||||
use std::time::Instant;
|
||||
|
||||
use rustyline::DefaultEditor;
|
||||
use zip::ZipArchive;
|
||||
|
||||
use fimfareader::archive::Fetcher;
|
||||
use fimfareader_search::Searcher;
|
||||
|
@ -32,14 +29,7 @@ fn main() -> Result<(), Box<dyn Error>> {
|
|||
println!("Finished loading in {finish:?}.");
|
||||
println!("The archive contains {count} stories.");
|
||||
|
||||
let opener = || {
|
||||
let file = File::open(&argv[1]).unwrap();
|
||||
let reader = BufReader::new(file);
|
||||
|
||||
ZipArchive::new(reader).unwrap()
|
||||
};
|
||||
|
||||
let searcher = Searcher::new(&fetcher, &opener);
|
||||
let searcher = Searcher::new(&fetcher);
|
||||
|
||||
while let Ok(line) = editor.readline(">>> ") {
|
||||
editor.add_history_entry(&line)?;
|
||||
|
|
|
@ -7,16 +7,10 @@ edition = "2021"
|
|||
[dependencies.fimfareader]
|
||||
path = ".."
|
||||
|
||||
[dependencies.rayon]
|
||||
version = "*"
|
||||
|
||||
[dependencies.tantivy]
|
||||
git = "https://github.com/quickwit-oss/tantivy.git"
|
||||
|
||||
[dependencies.thread_local]
|
||||
version = "*"
|
||||
|
||||
[dependencies.zip]
|
||||
version = "*"
|
||||
version = "=0.6.6"
|
||||
features = ["deflate"]
|
||||
default-features = false
|
||||
|
|
|
@ -1,108 +1,90 @@
|
|||
//! Main module.
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::fs::create_dir_all;
|
||||
use std::io::stdout;
|
||||
use std::io::Cursor;
|
||||
use std::io::Read;
|
||||
use std::io::Seek;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::time::Instant;
|
||||
|
||||
use rayon::iter::ParallelIterator;
|
||||
use thread_local::ThreadLocal;
|
||||
use zip::read::ZipArchive;
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::directory::MmapDirectory;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema;
|
||||
use tantivy::schema::Schema;
|
||||
use tantivy::schema::Value;
|
||||
use tantivy::Index;
|
||||
use tantivy::IndexSettings;
|
||||
use tantivy::ReloadPolicy;
|
||||
use tantivy::TantivyDocument;
|
||||
use zip::read::ZipArchive;
|
||||
|
||||
use fimfareader::prelude::*;
|
||||
|
||||
pub struct Searcher {
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
pub fn new<T, F>(fetcher: &Fetcher<T>, f: &F) -> Self
|
||||
pub fn new<T>(fetcher: &Fetcher<T>) -> Self
|
||||
where
|
||||
T: Read + Seek + Send,
|
||||
F: Fn() -> ZipArchive<T> + Sync,
|
||||
{
|
||||
Searcher {
|
||||
index: Self::load_index(fetcher),
|
||||
}
|
||||
}
|
||||
|
||||
fn schema() -> Schema {
|
||||
let mut builder = Schema::builder();
|
||||
|
||||
builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
|
||||
builder.add_text_field("content", schema::TEXT);
|
||||
let schema = builder.build();
|
||||
|
||||
let index = Self::load_index(schema.clone(), fetcher, f);
|
||||
|
||||
Searcher { schema, index }
|
||||
builder.build()
|
||||
}
|
||||
|
||||
fn load_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F) -> Index
|
||||
fn load_index<T>(fetcher: &Fetcher<T>) -> Index
|
||||
where
|
||||
T: Read + Seek + Send,
|
||||
F: Fn() -> ZipArchive<T> + Sync,
|
||||
{
|
||||
let identity = fetcher.identity().unwrap();
|
||||
let directory = Path::new("cache").join(identity);
|
||||
let path = Path::new("cache").join(identity);
|
||||
|
||||
if !directory.exists() {
|
||||
Self::make_index(schema.clone(), fetcher, f);
|
||||
if path.exists() {
|
||||
Index::open_in_dir(path).unwrap()
|
||||
} else {
|
||||
Self::make_index(&path, fetcher)
|
||||
}
|
||||
}
|
||||
|
||||
let store = MmapDirectory::open(&directory).unwrap();
|
||||
Index::open_or_create(store, schema).unwrap()
|
||||
}
|
||||
|
||||
fn make_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F)
|
||||
fn make_index<T>(path: &Path, fetcher: &Fetcher<T>) -> Index
|
||||
where
|
||||
T: Read + Seek + Send,
|
||||
F: Fn() -> ZipArchive<T> + Sync,
|
||||
{
|
||||
let identity = fetcher.identity().unwrap();
|
||||
let directory = Path::new("cache").join(identity);
|
||||
let start = Instant::now();
|
||||
print!("\r\rIndexing archive...\r\r");
|
||||
create_dir_all(path).unwrap();
|
||||
|
||||
std::fs::create_dir_all(&directory).unwrap();
|
||||
let store = MmapDirectory::open(&directory).unwrap();
|
||||
let settings = IndexSettings::default();
|
||||
let index = Index::create(store, schema, settings).unwrap();
|
||||
let schema = Self::schema();
|
||||
let index = Index::create_in_dir(path, schema).unwrap();
|
||||
let mut writer = index.writer(1_073_741_824).unwrap();
|
||||
let mut buffer = String::with_capacity(1_048_576);
|
||||
|
||||
let schema = index.schema();
|
||||
let sid = schema.get_field("sid").unwrap();
|
||||
let identifier = schema.get_field("sid").unwrap();
|
||||
let content = schema.get_field("content").unwrap();
|
||||
let mut writer = index.writer(536_870_912).unwrap();
|
||||
let story_count = fetcher.iter().count() as f64;
|
||||
|
||||
let counter = AtomicUsize::new(0);
|
||||
let total = fetcher.iter().count();
|
||||
let start = Instant::now();
|
||||
for (i, story) in fetcher.iter().enumerate() {
|
||||
let progress = (i * 100) as f64 / story_count;
|
||||
print!("\r\rIndexing archive... {progress:.2}%\r\r");
|
||||
|
||||
let local = ThreadLocal::new();
|
||||
let cursor = Cursor::new(fetcher.read(story).unwrap());
|
||||
let mut epub = ZipArchive::new(cursor).unwrap();
|
||||
let mut document = TantivyDocument::default();
|
||||
|
||||
fetcher.par_iter().for_each(|story| {
|
||||
let mut doc = TantivyDocument::default();
|
||||
|
||||
let mut arch = local.get_or(|| RefCell::new(f())).borrow_mut();
|
||||
let mut file = arch.by_name(&story.archive.path).unwrap();
|
||||
let mut data = Vec::with_capacity(file.size() as usize);
|
||||
let mut text = String::with_capacity(1_048_576);
|
||||
|
||||
file.read_to_end(&mut data).unwrap();
|
||||
let mut epub = ZipArchive::new(Cursor::new(data)).unwrap();
|
||||
|
||||
let count = counter.fetch_add(1, Ordering::SeqCst);
|
||||
let percentage = (count as f64 / total as f64) * 100f64;
|
||||
print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
|
||||
|
||||
doc.add_i64(sid, story.id.into());
|
||||
document.add_i64(identifier, story.id.into());
|
||||
|
||||
for i in 0..epub.len() {
|
||||
let mut file = epub.by_index(i).unwrap();
|
||||
|
@ -111,18 +93,24 @@ impl Searcher {
|
|||
continue;
|
||||
}
|
||||
|
||||
file.read_to_string(&mut text).unwrap();
|
||||
doc.add_text(content, &text);
|
||||
text.clear();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
document.add_text(content, &buffer);
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
writer.add_document(doc).unwrap();
|
||||
});
|
||||
writer.add_document(document).unwrap();
|
||||
}
|
||||
|
||||
print!("\r\rCommitting archive index...\r\r");
|
||||
stdout().flush().unwrap();
|
||||
|
||||
writer.commit().unwrap();
|
||||
writer.wait_merging_threads().unwrap();
|
||||
|
||||
let finish = (Instant::now() - start).as_secs();
|
||||
println!("Index generated in {} seconds.", finish);
|
||||
println!("Index generated in {finish} seconds.");
|
||||
|
||||
index
|
||||
}
|
||||
|
||||
pub fn search(&self, text: &str) -> Vec<(i64, f32)> {
|
||||
|
@ -133,20 +121,22 @@ impl Searcher {
|
|||
.try_into()
|
||||
.unwrap();
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let identitfier = self.schema.get_field("sid").unwrap();
|
||||
let content = self.schema.get_field("content").unwrap();
|
||||
let parser = QueryParser::for_index(&self.index, vec![content]);
|
||||
let schema = self.index.schema();
|
||||
let identifier = schema.get_field("sid").unwrap();
|
||||
let content = schema.get_field("content").unwrap();
|
||||
|
||||
let limit = TopDocs::with_limit(32);
|
||||
let parser = QueryParser::for_index(&self.index, vec![content]);
|
||||
let query = parser.parse_query(text).unwrap();
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let limit = TopDocs::with_limit(32);
|
||||
let docs = searcher.search(&query, &limit).unwrap();
|
||||
|
||||
docs.into_iter()
|
||||
.map(|(score, address)| {
|
||||
let doc: TantivyDocument = searcher.doc(address).unwrap();
|
||||
|
||||
match doc.get_first(identitfier).map(|v| v.as_i64()) {
|
||||
match doc.get_first(identifier).map(|v| v.as_i64()) {
|
||||
Some(Some(value)) => (value, score),
|
||||
_ => panic!("Invalid story key type!"),
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue