mirror of
https://github.com/JockeTF/fimfareader.git
synced 2024-11-23 13:58:00 +01:00
Use Tantivy to query story content
This commit is contained in:
parent
01215c4178
commit
01acb7c842
8 changed files with 1219 additions and 37 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,2 +1,3 @@
|
|||
/cache
|
||||
/target
|
||||
**/*.rs.bk
|
||||
|
|
985
Cargo.lock
generated
985
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -8,6 +8,7 @@ edition = "2021"
|
|||
members = [
|
||||
"cli",
|
||||
"query",
|
||||
"search",
|
||||
]
|
||||
|
||||
default-members = [
|
||||
|
|
|
@ -7,9 +7,14 @@ edition = "2021"
|
|||
[dependencies.fimfareader]
|
||||
path = ".."
|
||||
|
||||
[dependencies.fimfareader-query]
|
||||
path = "../query"
|
||||
[dependencies.fimfareader-search]
|
||||
path = "../search"
|
||||
|
||||
[dependencies.rustyline]
|
||||
default-features = false
|
||||
version = "14"
|
||||
|
||||
[dependencies.zip]
|
||||
version = "*"
|
||||
features = ["deflate"]
|
||||
default-features = false
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
//! Main module.
|
||||
|
||||
use std::env::args;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::time::Instant;
|
||||
|
||||
use rustyline::DefaultEditor;
|
||||
use rustyline::Result;
|
||||
use zip::ZipArchive;
|
||||
|
||||
use fimfareader::prelude::*;
|
||||
use fimfareader_query::parse;
|
||||
use fimfareader_search::Searcher;
|
||||
|
||||
fn exit(error: Error) -> ! {
|
||||
eprintln!("{}", error);
|
||||
|
@ -34,33 +37,47 @@ fn main() -> Result<()> {
|
|||
println!("Finished loading in {} milliseconds.", finish);
|
||||
println!("The archive contains {} stories.", count);
|
||||
|
||||
while let Ok(line) = editor.readline(">>> ") {
|
||||
editor.add_history_entry(&line)?;
|
||||
let opener = || {
|
||||
let file = File::open(&argv[1]).unwrap();
|
||||
let reader = BufReader::new(file);
|
||||
|
||||
let filter = match parse(&line) {
|
||||
Ok(filter) => filter,
|
||||
Err(error) => {
|
||||
println!("{}", error);
|
||||
continue;
|
||||
}
|
||||
ZipArchive::new(reader).unwrap()
|
||||
};
|
||||
|
||||
let searcher = Searcher::new(&fetcher, &opener);
|
||||
|
||||
while let Ok(line) = editor.readline(">>> ") {
|
||||
editor.add_history_entry(&line)?;
|
||||
let start = Instant::now();
|
||||
let stories = fetcher.filter(&filter);
|
||||
|
||||
let result = searcher
|
||||
.search(&line)
|
||||
.into_iter()
|
||||
.filter(|(_sid, score)| *score > 10f32)
|
||||
.filter_map(|(sid, score)| Some((fetcher.fetch(sid)?, score)))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let finish = (Instant::now() - start).as_millis();
|
||||
let count = stories.len();
|
||||
let count = result.len();
|
||||
|
||||
println!("Found {} stories in {} milliseconds!", count, finish);
|
||||
|
||||
if count > 32 {
|
||||
continue;
|
||||
}
|
||||
|
||||
for story in stories.iter() {
|
||||
for (story, score) in result {
|
||||
let key = &story.id;
|
||||
let title = &story.title;
|
||||
|
||||
println!("[{}] {}", key, title);
|
||||
let tags = story
|
||||
.tags
|
||||
.iter()
|
||||
.map(|tag| String::from(&tag.name))
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ");
|
||||
|
||||
println!("{:02.02}% [{:>6}] {}", score, key, title);
|
||||
println!("{}", tags);
|
||||
println!("{}", story.short_description);
|
||||
println!("{}", story.url);
|
||||
println!();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
22
search/Cargo.toml
Normal file
22
search/Cargo.toml
Normal file
|
@ -0,0 +1,22 @@
|
|||
[package]
|
||||
name = "fimfareader-search"
|
||||
version = "0.1.0"
|
||||
authors = ["Joakim Soderlund <joakim.soderlund@gmail.com>"]
|
||||
edition = "2021"
|
||||
|
||||
[dependencies.fimfareader]
|
||||
path = ".."
|
||||
|
||||
[dependencies.rayon]
|
||||
version = "*"
|
||||
|
||||
[dependencies.tantivy]
|
||||
git = "https://github.com/quickwit-oss/tantivy.git"
|
||||
|
||||
[dependencies.thread_local]
|
||||
version = "*"
|
||||
|
||||
[dependencies.zip]
|
||||
version = "*"
|
||||
features = ["deflate"]
|
||||
default-features = false
|
169
search/src/lib.rs
Normal file
169
search/src/lib.rs
Normal file
|
@ -0,0 +1,169 @@
|
|||
//! Main module.
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::io::Cursor;
|
||||
use std::io::Read;
|
||||
use std::io::Seek;
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::time::Instant;
|
||||
|
||||
use rayon::iter::ParallelIterator;
|
||||
use thread_local::ThreadLocal;
|
||||
use zip::read::ZipArchive;
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::directory::MmapDirectory;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema;
|
||||
use tantivy::schema::Schema;
|
||||
use tantivy::schema::Value;
|
||||
use tantivy::Index;
|
||||
use tantivy::IndexSettings;
|
||||
use tantivy::ReloadPolicy;
|
||||
use tantivy::TantivyDocument;
|
||||
|
||||
use fimfareader::prelude::*;
|
||||
|
||||
/// Full-text searcher over story content, backed by a Tantivy index.
pub struct Searcher {
|
||||
/// Schema of the index; `search` looks up the `sid` and `content` fields in it.
schema: Schema,
|
||||
/// Tantivy index that queries are executed against.
index: Index,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
pub fn new<T, F>(fetcher: &Fetcher<T>, f: &F) -> Self
|
||||
where
|
||||
T: Read + Seek + Send,
|
||||
F: Fn() -> ZipArchive<T> + Sync,
|
||||
{
|
||||
let mut builder = Schema::builder();
|
||||
builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
|
||||
builder.add_text_field("content", schema::TEXT);
|
||||
let schema = builder.build();
|
||||
|
||||
let index = Self::load_index(schema.clone(), fetcher, f);
|
||||
|
||||
Searcher { schema, index }
|
||||
}
|
||||
|
||||
fn load_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F) -> Index
|
||||
where
|
||||
T: Read + Seek + Send,
|
||||
F: Fn() -> ZipArchive<T> + Sync,
|
||||
{
|
||||
let identity = fetcher.identity().unwrap();
|
||||
let directory = Path::new("cache").join(identity);
|
||||
|
||||
if !directory.exists() {
|
||||
Self::make_index(schema.clone(), fetcher, f);
|
||||
}
|
||||
|
||||
let store = MmapDirectory::open(&directory).unwrap();
|
||||
Index::open_or_create(store, schema).unwrap()
|
||||
}
|
||||
|
||||
fn make_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F)
|
||||
where
|
||||
T: Read + Seek + Send,
|
||||
F: Fn() -> ZipArchive<T> + Sync,
|
||||
{
|
||||
let identity = fetcher.identity().unwrap();
|
||||
let directory = Path::new("cache").join(identity);
|
||||
|
||||
std::fs::create_dir_all(&directory).unwrap();
|
||||
let store = MmapDirectory::open(&directory).unwrap();
|
||||
let settings = IndexSettings::default();
|
||||
let index = Index::create(store, schema, settings).unwrap();
|
||||
|
||||
let schema = index.schema();
|
||||
let sid = schema.get_field("sid").unwrap();
|
||||
let content = schema.get_field("content").unwrap();
|
||||
let mut writer = index.writer(536_870_912).unwrap();
|
||||
|
||||
let counter = AtomicUsize::new(0);
|
||||
let total = fetcher.iter().count();
|
||||
let start = Instant::now();
|
||||
|
||||
let local = ThreadLocal::new();
|
||||
|
||||
fetcher.par_iter().for_each(|story| {
|
||||
let mut doc = TantivyDocument::default();
|
||||
|
||||
let mut arch = local.get_or(|| RefCell::new(f())).borrow_mut();
|
||||
let mut file = arch.by_name(&story.archive.path).unwrap();
|
||||
let mut data = Vec::with_capacity(file.size() as usize);
|
||||
let mut text = String::with_capacity(1_048_576);
|
||||
|
||||
file.read_to_end(&mut data).unwrap();
|
||||
let mut epub = ZipArchive::new(Cursor::new(data)).unwrap();
|
||||
|
||||
let count = counter.fetch_add(1, Ordering::SeqCst);
|
||||
let percentage = (count as f64 / total as f64) * 100f64;
|
||||
print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
|
||||
|
||||
doc.add_i64(sid, story.id);
|
||||
|
||||
for i in 0..epub.len() {
|
||||
let mut file = epub.by_index(i).unwrap();
|
||||
|
||||
if !file.name().ends_with(".html") {
|
||||
continue;
|
||||
}
|
||||
|
||||
file.read_to_string(&mut text).unwrap();
|
||||
doc.add_text(content, &text);
|
||||
text.clear();
|
||||
}
|
||||
|
||||
writer.add_document(doc).unwrap();
|
||||
});
|
||||
|
||||
writer.commit().unwrap();
|
||||
|
||||
let finish = (Instant::now() - start).as_secs();
|
||||
println!("Index generated in {} seconds.", finish);
|
||||
}
|
||||
|
||||
pub fn search(&self, text: &str) -> Vec<(i64, f32)> {
|
||||
let reader = self
|
||||
.index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommitWithDelay)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let identitfier = self.schema.get_field("sid").unwrap();
|
||||
let content = self.schema.get_field("content").unwrap();
|
||||
let parser = QueryParser::for_index(&self.index, vec![content]);
|
||||
|
||||
let limit = TopDocs::with_limit(32);
|
||||
let query = parser.parse_query(&text).unwrap();
|
||||
let docs = searcher.search(&query, &limit).unwrap();
|
||||
|
||||
docs.into_iter()
|
||||
.map(|(score, address)| {
|
||||
let doc: TantivyDocument = searcher.doc(address).unwrap();
|
||||
|
||||
match doc.get_first(identitfier).map(|v| v.as_i64()) {
|
||||
Some(Some(value)) => (value, score),
|
||||
_ => panic!("Invalid story key type!"),
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Turns a full-text query into a story predicate.
///
/// The query is executed once, up front. The returned closure accepts a
/// story when its id is among the hits whose score is greater than 10.
pub fn parse(&self, text: &str) -> impl Fn(&Story) -> bool + Sync {
    let mut accepted = Vec::new();

    for (sid, score) in self.search(text) {
        if score > 10f32 {
            accepted.push(sid);
        }
    }

    // Sorted so that the closure can use a binary search for membership.
    accepted.sort();

    move |story| accepted.binary_search(&story.id).is_ok()
}
|
||||
}
|
|
@ -97,10 +97,26 @@ impl<T: Read + Seek> Fetcher<T> {
|
|||
Ok(buf)
|
||||
}
|
||||
|
||||
pub fn identity(&self) -> Result<String> {
|
||||
let mut archive = self.archive.lock().map_err(|e| match e {
|
||||
_ => Error::archive("Could not acquire fetcher lock"),
|
||||
})?;
|
||||
|
||||
let index = archive.by_name("index.json").map_err(|e| match e {
|
||||
_ => Error::archive("Could not open archive index"),
|
||||
})?;
|
||||
|
||||
Ok(format!("{}", index.crc32()))
|
||||
}
|
||||
|
||||
/// Returns a sequential iterator over the stories in the fetcher's index.
pub fn iter(&self) -> impl Iterator<Item = &Story> {
|
||||
self.index.iter()
|
||||
}
|
||||
|
||||
/// Returns a parallel iterator over the stories in the fetcher's index.
pub fn par_iter(&self) -> impl ParallelIterator<Item = &Story> {
|
||||
self.index.par_iter()
|
||||
}
|
||||
|
||||
pub fn filter<F>(&self, function: &F) -> Vec<&Story>
|
||||
where
|
||||
F: Sync + Fn(&Story) -> bool,
|
||||
|
|
Loading…
Reference in a new issue