mirror of
https://github.com/JockeTF/fimfareader.git
synced 2025-03-12 06:30:02 +01:00
Use Tantivy for querying story contents
This commit is contained in:
parent
36ade958bb
commit
723b19b035
8 changed files with 1079 additions and 23 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
|
/cache
|
||||||
/target
|
/target
|
||||||
**/*.rs.bk
|
**/*.rs.bk
|
||||||
|
|
857
Cargo.lock
generated
857
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -8,6 +8,7 @@ edition = "2024"
|
||||||
members = [
|
members = [
|
||||||
"cli",
|
"cli",
|
||||||
"query",
|
"query",
|
||||||
|
"search",
|
||||||
]
|
]
|
||||||
|
|
||||||
default-members = [
|
default-members = [
|
||||||
|
|
|
@ -7,8 +7,8 @@ edition = "2024"
|
||||||
[dependencies.fimfareader]
|
[dependencies.fimfareader]
|
||||||
path = ".."
|
path = ".."
|
||||||
|
|
||||||
[dependencies.fimfareader-query]
|
[dependencies.fimfareader-search]
|
||||||
path = "../query"
|
path = "../search"
|
||||||
|
|
||||||
[dependencies.rustyline]
|
[dependencies.rustyline]
|
||||||
default-features = false
|
default-features = false
|
||||||
|
|
|
@ -5,10 +5,11 @@ use std::error::Error;
|
||||||
use std::result::Result;
|
use std::result::Result;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use fimfareader::archive::Fetcher;
|
|
||||||
use fimfareader_query::parse;
|
|
||||||
use rustyline::DefaultEditor;
|
use rustyline::DefaultEditor;
|
||||||
|
|
||||||
|
use fimfareader::archive::Fetcher;
|
||||||
|
use fimfareader_search::Searcher;
|
||||||
|
|
||||||
fn main() -> Result<(), Box<dyn Error>> {
|
fn main() -> Result<(), Box<dyn Error>> {
|
||||||
let argv = args().collect::<Vec<String>>();
|
let argv = args().collect::<Vec<String>>();
|
||||||
let mut editor = DefaultEditor::new()?;
|
let mut editor = DefaultEditor::new()?;
|
||||||
|
@ -28,33 +29,41 @@ fn main() -> Result<(), Box<dyn Error>> {
|
||||||
println!("Finished loading in {finish:?}.");
|
println!("Finished loading in {finish:?}.");
|
||||||
println!("The archive contains {count} stories.");
|
println!("The archive contains {count} stories.");
|
||||||
|
|
||||||
|
let searcher = Searcher::new(&fetcher);
|
||||||
|
|
||||||
while let Ok(line) = editor.readline(">>> ") {
|
while let Ok(line) = editor.readline(">>> ") {
|
||||||
editor.add_history_entry(&line)?;
|
editor.add_history_entry(&line)?;
|
||||||
|
|
||||||
let filter = match parse(&line) {
|
|
||||||
Ok(filter) => filter,
|
|
||||||
Err(error) => {
|
|
||||||
println!("{}", error);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
let stories = fetcher.filter(&filter);
|
|
||||||
|
let result = searcher
|
||||||
|
.search(&line)
|
||||||
|
.into_iter()
|
||||||
|
.filter(|(_sid, score)| *score > 10f32)
|
||||||
|
.map(|(sid, score)| (i32::try_from(sid).unwrap(), score))
|
||||||
|
.filter_map(|(sid, score)| Some((fetcher.fetch(sid)?, score)))
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
let finish = (Instant::now() - start).as_millis();
|
let finish = (Instant::now() - start).as_millis();
|
||||||
let count = stories.len();
|
let count = result.len();
|
||||||
|
|
||||||
println!("Found {} stories in {} milliseconds!", count, finish);
|
println!("Found {} stories in {} milliseconds!", count, finish);
|
||||||
|
|
||||||
if count > 32 {
|
for (story, score) in result {
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
for story in stories.iter() {
|
|
||||||
let key = &story.id;
|
let key = &story.id;
|
||||||
let title = &story.title;
|
let title = &story.title;
|
||||||
|
|
||||||
println!("[{}] {}", key, title);
|
let tags = story
|
||||||
|
.tags
|
||||||
|
.iter()
|
||||||
|
.map(|tag| tag.name.to_string())
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(", ");
|
||||||
|
|
||||||
|
println!("{:02.02}% [{:>6}] {}", score, key, title);
|
||||||
|
println!("{}", tags);
|
||||||
|
println!("{}", story.short_description);
|
||||||
|
println!("{}", story.url);
|
||||||
|
println!();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
16
search/Cargo.toml
Normal file
16
search/Cargo.toml
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
[package]
|
||||||
|
name = "fimfareader-search"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Joakim Soderlund <joakim.soderlund@gmail.com>"]
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies.fimfareader]
|
||||||
|
path = ".."
|
||||||
|
|
||||||
|
[dependencies.tantivy]
|
||||||
|
git = "https://github.com/quickwit-oss/tantivy.git"
|
||||||
|
|
||||||
|
[dependencies.zip]
|
||||||
|
version = "=0.6.6"
|
||||||
|
features = ["deflate"]
|
||||||
|
default-features = false
|
160
search/src/lib.rs
Normal file
160
search/src/lib.rs
Normal file
|
@ -0,0 +1,160 @@
|
||||||
|
//! Main module.
|
||||||
|
|
||||||
|
use std::fs::create_dir_all;
|
||||||
|
use std::io::Cursor;
|
||||||
|
use std::io::Read;
|
||||||
|
use std::io::Seek;
|
||||||
|
use std::io::Write;
|
||||||
|
use std::io::stdout;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
use tantivy::Index;
|
||||||
|
use tantivy::ReloadPolicy;
|
||||||
|
use tantivy::TantivyDocument;
|
||||||
|
use tantivy::collector::TopDocs;
|
||||||
|
use tantivy::query::QueryParser;
|
||||||
|
use tantivy::schema;
|
||||||
|
use tantivy::schema::Schema;
|
||||||
|
use tantivy::schema::Value;
|
||||||
|
use zip::read::ZipArchive;
|
||||||
|
|
||||||
|
use fimfareader::archive::Fetcher;
|
||||||
|
use fimfareader::archive::Story;
|
||||||
|
|
||||||
|
pub struct Searcher {
|
||||||
|
index: Index,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Searcher {
|
||||||
|
pub fn new<T>(fetcher: &Fetcher<T>) -> Self
|
||||||
|
where
|
||||||
|
T: Read + Seek + Send,
|
||||||
|
{
|
||||||
|
Searcher {
|
||||||
|
index: Self::load_index(fetcher),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn schema() -> Schema {
|
||||||
|
let mut builder = Schema::builder();
|
||||||
|
|
||||||
|
builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
|
||||||
|
builder.add_text_field("content", schema::TEXT);
|
||||||
|
|
||||||
|
builder.build()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn load_index<T>(fetcher: &Fetcher<T>) -> Index
|
||||||
|
where
|
||||||
|
T: Read + Seek + Send,
|
||||||
|
{
|
||||||
|
let identity = fetcher.identity().unwrap();
|
||||||
|
let path = Path::new("cache").join(identity);
|
||||||
|
|
||||||
|
if path.exists() {
|
||||||
|
Index::open_in_dir(path).unwrap()
|
||||||
|
} else {
|
||||||
|
Self::make_index(&path, fetcher)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_index<T>(path: &Path, fetcher: &Fetcher<T>) -> Index
|
||||||
|
where
|
||||||
|
T: Read + Seek + Send,
|
||||||
|
{
|
||||||
|
let start = Instant::now();
|
||||||
|
print!("\r\rIndexing archive...\r\r");
|
||||||
|
create_dir_all(path).unwrap();
|
||||||
|
|
||||||
|
let schema = Self::schema();
|
||||||
|
let index = Index::create_in_dir(path, schema).unwrap();
|
||||||
|
let mut writer = index.writer(1_073_741_824).unwrap();
|
||||||
|
let mut buffer = String::with_capacity(1_048_576);
|
||||||
|
|
||||||
|
let schema = index.schema();
|
||||||
|
let identifier = schema.get_field("sid").unwrap();
|
||||||
|
let content = schema.get_field("content").unwrap();
|
||||||
|
let story_count = fetcher.iter().count() as f64;
|
||||||
|
|
||||||
|
for (i, story) in fetcher.iter().enumerate() {
|
||||||
|
let progress = (i * 100) as f64 / story_count;
|
||||||
|
print!("\r\rIndexing archive... {progress:.2}%\r\r");
|
||||||
|
|
||||||
|
let cursor = Cursor::new(fetcher.read(story).unwrap());
|
||||||
|
let mut epub = ZipArchive::new(cursor).unwrap();
|
||||||
|
let mut document = TantivyDocument::default();
|
||||||
|
|
||||||
|
document.add_i64(identifier, story.id.into());
|
||||||
|
|
||||||
|
for i in 0..epub.len() {
|
||||||
|
let mut file = epub.by_index(i).unwrap();
|
||||||
|
|
||||||
|
if !file.name().ends_with(".html") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
file.read_to_string(&mut buffer).unwrap();
|
||||||
|
document.add_text(content, &buffer);
|
||||||
|
buffer.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.add_document(document).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
print!("\r\rCommitting archive index...\r\r");
|
||||||
|
stdout().flush().unwrap();
|
||||||
|
|
||||||
|
writer.commit().unwrap();
|
||||||
|
writer.wait_merging_threads().unwrap();
|
||||||
|
|
||||||
|
let finish = (Instant::now() - start).as_secs();
|
||||||
|
println!("Index generated in {finish} seconds.");
|
||||||
|
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn search(&self, text: &str) -> Vec<(i64, f32)> {
|
||||||
|
let reader = self
|
||||||
|
.index
|
||||||
|
.reader_builder()
|
||||||
|
.reload_policy(ReloadPolicy::OnCommitWithDelay)
|
||||||
|
.try_into()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let schema = self.index.schema();
|
||||||
|
let identifier = schema.get_field("sid").unwrap();
|
||||||
|
let content = schema.get_field("content").unwrap();
|
||||||
|
|
||||||
|
let parser = QueryParser::for_index(&self.index, vec![content]);
|
||||||
|
let query = parser.parse_query(text).unwrap();
|
||||||
|
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
let limit = TopDocs::with_limit(32);
|
||||||
|
let docs = searcher.search(&query, &limit).unwrap();
|
||||||
|
|
||||||
|
docs.into_iter()
|
||||||
|
.map(|(score, address)| {
|
||||||
|
let doc: TantivyDocument = searcher.doc(address).unwrap();
|
||||||
|
|
||||||
|
match doc.get_first(identifier).map(|v| v.as_i64()) {
|
||||||
|
Some(Some(value)) => (value, score),
|
||||||
|
_ => panic!("Invalid story key type!"),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse(&self, text: &str) -> impl Fn(&Story) -> bool + Sync {
|
||||||
|
let mut sids: Vec<_> = self
|
||||||
|
.search(text)
|
||||||
|
.into_iter()
|
||||||
|
.filter(|(_, score)| *score > 10f32)
|
||||||
|
.map(|(sid, _)| sid)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
sids.sort();
|
||||||
|
|
||||||
|
move |story| sids.binary_search(&story.id.into()).is_ok()
|
||||||
|
}
|
||||||
|
}
|
|
@ -104,10 +104,26 @@ impl<T: Read + Seek> Fetcher<T> {
|
||||||
Ok(buf)
|
Ok(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn identity(&self) -> Result<String> {
|
||||||
|
let Ok(mut archive) = self.archive.lock() else {
|
||||||
|
return Err(Error::archive("Could not acquire fetcher lock"));
|
||||||
|
};
|
||||||
|
|
||||||
|
let Ok(index) = archive.by_name("index.json") else {
|
||||||
|
return Err(Error::archive("Could not open archive index"));
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(format!("{}", index.crc32()))
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns a sequential iterator over every story in the archive index.
pub fn iter(&self) -> impl Iterator<Item = &Story> {
    self.index.iter()
}
|
||||||
|
|
||||||
|
/// Returns a parallel iterator over every story in the archive index.
pub fn par_iter(&self) -> impl ParallelIterator<Item = &Story> {
    self.index.par_iter()
}
|
||||||
|
|
||||||
pub fn filter<F>(&self, function: &F) -> Vec<&Story>
|
pub fn filter<F>(&self, function: &F) -> Vec<&Story>
|
||||||
where
|
where
|
||||||
F: Sync + Fn(&Story) -> bool,
|
F: Sync + Fn(&Story) -> bool,
|
||||||
|
|
Loading…
Add table
Reference in a new issue