Use Tantivy for querying story contents

2025-03-11 22:20:02 +01:00 · 2020-03-08 00:54:56 +01:00 · 2020-03-08 00:54:56 +01:00 · 723b19b035
commit 723b19b035
parent 36ade958bb
8 changed files with 1079 additions and 23 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
+/cache
 /target
 **/*.rs.bk
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -8,6 +8,7 @@ edition = "2024"
 members = [
    "cli",
    "query",
+    "search",
 ]

 default-members = [
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@ -7,8 +7,8 @@ edition = "2024"
 [dependencies.fimfareader]
 path = ".."

-[dependencies.fimfareader-query]
-path = "../query"
+[dependencies.fimfareader-search]
+path = "../search"

 [dependencies.rustyline]
 default-features = false
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@ -5,10 +5,11 @@ use std::error::Error;
 use std::result::Result;
 use std::time::Instant;

-use fimfareader::archive::Fetcher;
-use fimfareader_query::parse;
 use rustyline::DefaultEditor;

+use fimfareader::archive::Fetcher;
+use fimfareader_search::Searcher;
+
 fn main() -> Result<(), Box<dyn Error>> {
    let argv = args().collect::<Vec<String>>();
    let mut editor = DefaultEditor::new()?;
@ -28,33 +29,41 @@ fn main() -> Result<(), Box<dyn Error>> {
    println!("Finished loading in {finish:?}.");
    println!("The archive contains {count} stories.");

+    let searcher = Searcher::new(&fetcher);
+
    while let Ok(line) = editor.readline(">>> ") {
        editor.add_history_entry(&line)?;
-
-        let filter = match parse(&line) {
-            Ok(filter) => filter,
-            Err(error) => {
-                println!("{}", error);
-                continue;
-            }
-        };
-
        let start = Instant::now();
-        let stories = fetcher.filter(&filter);
+
+        let result = searcher
+            .search(&line)
+            .into_iter()
+            .filter(|(_sid, score)| *score > 10f32)
+            .map(|(sid, score)| (i32::try_from(sid).unwrap(), score))
+            .filter_map(|(sid, score)| Some((fetcher.fetch(sid)?, score)))
+            .collect::<Vec<_>>();
+
        let finish = (Instant::now() - start).as_millis();
-        let count = stories.len();
+        let count = result.len();

        println!("Found {} stories in {} milliseconds!", count, finish);

-        if count > 32 {
-            continue;
-        }
-
-        for story in stories.iter() {
+        for (story, score) in result {
            let key = &story.id;
            let title = &story.title;

-            println!("[{}] {}", key, title);
+            let tags = story
+                .tags
+                .iter()
+                .map(|tag| tag.name.to_string())
+                .collect::<Vec<_>>()
+                .join(", ");
+
+            println!("{:02.02}% [{:>6}] {}", score, key, title);
+            println!("{}", tags);
+            println!("{}", story.short_description);
+            println!("{}", story.url);
+            println!();
        }
    }

--- a/search/Cargo.toml
+++ b/search/Cargo.toml
@ -0,0 +1,16 @@
+[package]
+name = "fimfareader-search"
+version = "0.1.0"
+authors = ["Joakim Soderlund <joakim.soderlund@gmail.com>"]
+edition = "2024"
+
+[dependencies.fimfareader]
+path = ".."
+
+[dependencies.tantivy]
+git = "https://github.com/quickwit-oss/tantivy.git"
+
+[dependencies.zip]
+version = "=0.6.6"
+features = ["deflate"]
+default-features = false
--- a/search/src/lib.rs
+++ b/search/src/lib.rs
@ -0,0 +1,160 @@
+//! Main module.
+
+use std::fs::create_dir_all;
+use std::io::Cursor;
+use std::io::Read;
+use std::io::Seek;
+use std::io::Write;
+use std::io::stdout;
+use std::path::Path;
+use std::time::Instant;
+
+use tantivy::Index;
+use tantivy::ReloadPolicy;
+use tantivy::TantivyDocument;
+use tantivy::collector::TopDocs;
+use tantivy::query::QueryParser;
+use tantivy::schema;
+use tantivy::schema::Schema;
+use tantivy::schema::Value;
+use zip::read::ZipArchive;
+
+use fimfareader::archive::Fetcher;
+use fimfareader::archive::Story;
+
+/// Full-text search over story contents, backed by a Tantivy index.
+///
+/// The index lives on disk under `cache/<identity>`, where the identity
+/// string is obtained from [`Fetcher::identity`].
+pub struct Searcher {
+    index: Index,
+}
+
+impl Searcher {
+    /// Creates a searcher for the archive behind `fetcher`.
+    ///
+    /// The on-disk index is opened when its directory already exists and
+    /// is generated from the archive otherwise.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the archive identity cannot be read, or if the index
+    /// cannot be opened or built.
+    pub fn new<T>(fetcher: &Fetcher<T>) -> Self
+    where
+        T: Read + Seek + Send,
+    {
+        Searcher {
+            index: Self::load_index(fetcher),
+        }
+    }
+
+    /// Builds the document schema: an indexed and stored `sid` integer
+    /// holding the story ID, and a `content` field using Tantivy's `TEXT`
+    /// options.
+    fn schema() -> Schema {
+        let mut builder = Schema::builder();
+
+        builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
+        builder.add_text_field("content", schema::TEXT);
+
+        builder.build()
+    }
+
+    /// Opens the index in `cache/<identity>` if that directory exists,
+    /// and generates it with [`Self::make_index`] otherwise.
+    fn load_index<T>(fetcher: &Fetcher<T>) -> Index
+    where
+        T: Read + Seek + Send,
+    {
+        let identity = fetcher.identity().unwrap();
+        let path = Path::new("cache").join(identity);
+
+        // NOTE(review): only the existence of the directory is checked.
+        // If an earlier indexing run was interrupted, the leftover
+        // directory is presumably opened as-is. Confirm, and consider
+        // building in a temporary location that is renamed on success.
+        if path.exists() {
+            Index::open_in_dir(path).unwrap()
+        } else {
+            Self::make_index(&path, fetcher)
+        }
+    }
+
+    /// Generates the index in `path` from every story known to `fetcher`.
+    ///
+    /// Each story becomes one document: its ID goes into `sid`, and the
+    /// text of every archive entry whose name ends in `.html` is added
+    /// to `content`. Progress is printed to standard output.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the directory or index cannot be created, if a story
+    /// cannot be read or is not a valid ZIP archive, if an HTML entry is
+    /// not valid UTF-8, or if writing or committing the index fails.
+    fn make_index<T>(path: &Path, fetcher: &Fetcher<T>) -> Index
+    where
+        T: Read + Seek + Send,
+    {
+        let start = Instant::now();
+        print!("\r\rIndexing archive...\r\r");
+        create_dir_all(path).unwrap();
+
+        let schema = Self::schema();
+        let index = Index::create_in_dir(path, schema).unwrap();
+        let mut writer = index.writer(1_073_741_824).unwrap();
+
+        // One scratch buffer, reused for every HTML entry of every story.
+        let mut buffer = String::with_capacity(1_048_576);
+
+        let schema = index.schema();
+        let identifier = schema.get_field("sid").unwrap();
+        let content = schema.get_field("content").unwrap();
+        let story_count = fetcher.iter().count() as f64;
+
+        for (position, story) in fetcher.iter().enumerate() {
+            let progress = (position * 100) as f64 / story_count;
+            print!("\r\rIndexing archive... {progress:.2}%\r\r");
+
+            let cursor = Cursor::new(fetcher.read(story).unwrap());
+            let mut epub = ZipArchive::new(cursor).unwrap();
+            let mut document = TantivyDocument::default();
+
+            document.add_i64(identifier, story.id.into());
+
+            for entry in 0..epub.len() {
+                let mut file = epub.by_index(entry).unwrap();
+
+                if !file.name().ends_with(".html") {
+                    continue;
+                }
+
+                file.read_to_string(&mut buffer).unwrap();
+                document.add_text(content, &buffer);
+                buffer.clear();
+            }
+
+            writer.add_document(document).unwrap();
+        }
+
+        print!("\r\rCommitting archive index...\r\r");
+        stdout().flush().unwrap();
+
+        writer.commit().unwrap();
+        writer.wait_merging_threads().unwrap();
+
+        let finish = start.elapsed().as_secs();
+        println!("Index generated in {finish} seconds.");
+
+        index
+    }
+
+    /// Runs `text` as a query against the `content` field.
+    ///
+    /// Returns at most 32 `(story ID, score)` pairs, as produced by
+    /// Tantivy's `TopDocs` collector. If `text` is not a valid query, the
+    /// parser's error is printed to standard error and an empty list is
+    /// returned.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the index reader cannot be created, if the search or a
+    /// document lookup fails, or if a document has no integer `sid`.
+    pub fn search(&self, text: &str) -> Vec<(i64, f32)> {
+        let schema = self.index.schema();
+        let identifier = schema.get_field("sid").unwrap();
+        let content = schema.get_field("content").unwrap();
+
+        // The query text comes straight from the user, so a syntax error
+        // is an expected condition rather than a bug: report it and
+        // return no hits instead of panicking.
+        let parser = QueryParser::for_index(&self.index, vec![content]);
+        let query = match parser.parse_query(text) {
+            Ok(query) => query,
+            Err(error) => {
+                eprintln!("{error}");
+                return Vec::new();
+            }
+        };
+
+        let reader = self
+            .index
+            .reader_builder()
+            .reload_policy(ReloadPolicy::OnCommitWithDelay)
+            .try_into()
+            .unwrap();
+
+        let searcher = reader.searcher();
+        let limit = TopDocs::with_limit(32);
+        let docs = searcher.search(&query, &limit).unwrap();
+
+        docs.into_iter()
+            .map(|(score, address)| {
+                let doc: TantivyDocument = searcher.doc(address).unwrap();
+
+                // Every document is written with an `sid`, so a missing
+                // or non-integer value means the index is not ours.
+                match doc.get_first(identifier).and_then(|v| v.as_i64()) {
+                    Some(value) => (value, score),
+                    None => panic!("Invalid story key type!"),
+                }
+            })
+            .collect()
+    }
+
+    /// Turns a query into a story predicate.
+    ///
+    /// The search runs once, when this method is called. The returned
+    /// closure accepts a story when its ID is among the hits that scored
+    /// above 10.
+    pub fn parse(&self, text: &str) -> impl Fn(&Story) -> bool + Sync {
+        let mut sids: Vec<_> = self
+            .search(text)
+            .into_iter()
+            .filter(|(_, score)| *score > 10f32)
+            .map(|(sid, _)| sid)
+            .collect();
+
+        // Sorted once so that each predicate call is a binary search.
+        sids.sort_unstable();
+
+        move |story| sids.binary_search(&story.id.into()).is_ok()
+    }
+}
--- a/src/archive/fetcher.rs
+++ b/src/archive/fetcher.rs
@ -104,10 +104,26 @@ impl<T: Read + Seek> Fetcher<T> {
        Ok(buf)
    }

+    /// Returns a string identifying this archive.
+    ///
+    /// The value is whatever `crc32()` reports for the archive's
+    /// `index.json` entry, formatted with `Display`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the archive lock cannot be acquired, or if the
+    /// `index.json` entry cannot be opened.
+    pub fn identity(&self) -> Result<String> {
+        let mut archive = self
+            .archive
+            .lock()
+            .map_err(|_| Error::archive("Could not acquire fetcher lock"))?;
+
+        let index = archive
+            .by_name("index.json")
+            .map_err(|_| Error::archive("Could not open archive index"))?;
+
+        Ok(index.crc32().to_string())
+    }
+
    /// Iterates over the stories in this fetcher's index, by reference.
    pub fn iter(&self) -> impl Iterator<Item = &Story> {
        self.index.iter()
    }

+    /// Parallel counterpart of `iter`: exposes the stories in this
+    /// fetcher's index through a `ParallelIterator`.
+    pub fn par_iter(&self) -> impl ParallelIterator<Item = &Story> {
+        self.index.par_iter()
+    }
+
    pub fn filter<F>(&self, function: &F) -> Vec<&Story>
    where
        F: Sync + Fn(&Story) -> bool,