diff --git a/.gitignore b/.gitignore index 53eaa21..ebcc59a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ +/cache /target **/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock index c376374..970ec9b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -345,11 +345,8 @@ name = "fimfareader-cli" version = "0.1.0" dependencies = [ "fimfareader", - "fimfareader-query", - "rayon", + "fimfareader-search", "rustyline", - "tantivy", - "thread_local", "zip", ] @@ -365,6 +362,17 @@ dependencies = [ "regex", ] +[[package]] +name = "fimfareader-search" +version = "0.1.0" +dependencies = [ + "fimfareader", + "rayon", + "tantivy", + "thread_local", + "zip", +] + [[package]] name = "flate2" version = "1.0.24" diff --git a/Cargo.toml b/Cargo.toml index b9a2358..e456527 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ edition = "2021" members = [ "cli", "query", + "search", ] default-members = [ diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 421321e..e7a3250 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -7,22 +7,13 @@ edition = "2018" [dependencies.fimfareader] path = ".." -[dependencies.fimfareader-query] -path = "../query" +[dependencies.fimfareader-search] +path = "../search" [dependencies.rustyline] default-features = false version = "*" -[dependencies.rayon] -version = "*" - -[dependencies.tantivy] -version = "*" - -[dependencies.thread_local] -version = "*" - [dependencies.zip] version = "*" features = ["deflate"] diff --git a/cli/src/main.rs b/cli/src/main.rs index 80d5809..f580d22 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,34 +1,16 @@ //! Main module. -use std::cell::RefCell; use std::env::args; use std::fs::File; use std::io::BufReader; -use std::io::Cursor; -use std::io::Read; -use std::io::Seek; -use std::path::Path; -use std::sync::atomic::AtomicUsize; -use std::sync::atomic::Ordering; use std::time::Instant; -use rayon::iter::ParallelIterator; use rustyline::Editor; -use zip::read::ZipArchive; -use tantivy::collector::TopDocs; -use tantivy::directory::MmapDirectory; -use tantivy::query::QueryParser; -use tantivy::schema; -use tantivy::schema::Document; -use tantivy::schema::Schema; -use tantivy::schema::Value; -use tantivy::Index; -use tantivy::ReloadPolicy; - -use thread_local::ThreadLocal; +use zip::ZipArchive; use fimfareader::prelude::*; +use fimfareader_search::Searcher; fn exit(error: Error) -> ! { eprintln!("{}", error); @@ -36,77 +18,6 @@ fn exit(error: Error) -> ! { std::process::exit(1) } -fn load_index(schema: Schema, fetcher: &Fetcher, path: &str) -> Index -where - T: Read + Seek + Send, -{ - let identity = fetcher.identity().unwrap(); - let directory = Path::new("search").join(identity); - - if directory.exists() { - let store = MmapDirectory::open(&directory).unwrap(); - return Index::open_or_create(store, schema).unwrap(); - } - - std::fs::create_dir_all(&directory).unwrap(); - let store = MmapDirectory::open(&directory).unwrap(); - let index = Index::create(store, schema).unwrap(); - - let schema = index.schema(); - let sid = schema.get_field("sid").unwrap(); - let content = schema.get_field("content").unwrap(); - let mut writer = index.writer(536_870_912).unwrap(); - - let counter = AtomicUsize::new(0); - let total = fetcher.iter().count(); - let start = Instant::now(); - - let local = ThreadLocal::new(); - - fetcher.par_iter().for_each(|story| { - let mut doc = Document::default(); - doc.add_i64(sid, story.id); - - let archive = local.get_or(|| { - let reader = BufReader::new(File::open(&path).unwrap()); - RefCell::new(ZipArchive::new(reader).unwrap()) - }); - - let mut archive = archive.borrow_mut(); - let mut file = archive.by_name(&story.archive.path).unwrap(); - let mut data = Vec::with_capacity(file.size() as usize); - let mut text = String::with_capacity(1_048_576); - - file.read_to_end(&mut data).unwrap(); - let mut arch = ZipArchive::new(Cursor::new(data)).unwrap(); - let count = counter.fetch_add(1, Ordering::SeqCst); - - let percentage = (count as f64 / total as f64) * 100f64; - print!("\r\rIndexing archive... {:.2}%\r\r", percentage); - - for i in 0..arch.len() { - let mut file = arch.by_index(i).unwrap(); - - if !file.name().ends_with(".html") { - continue; - } - - file.read_to_string(&mut text).unwrap(); - doc.add_text(content, &text); - text.clear(); - } - - writer.add_document(doc); - }); - - writer.commit().unwrap(); - - let finish = (Instant::now() - start).as_secs(); - println!("Index generated in {} seconds.", finish); - - index -} - fn main() { let argv = args().collect::>(); let mut editor = Editor::<()>::new(); @@ -127,36 +38,36 @@ fn main() { println!("Finished loading in {} milliseconds.", finish); println!("The archive contains {} stories.", count); - let mut builder = Schema::builder(); - let sid = builder.add_i64_field("sid", schema::INDEXED | schema::STORED); - let content = builder.add_text_field("content", schema::TEXT); - let index = load_index(builder.build(), &fetcher, &argv[1]); + let opener = || { + let file = File::open(&argv[1]).unwrap(); + let reader = BufReader::new(file); - let reader = index - .reader_builder() - .reload_policy(ReloadPolicy::OnCommit) - .try_into() - .unwrap(); + ZipArchive::new(reader).unwrap() + }; - let searcher = reader.searcher(); - let parser = QueryParser::for_index(&index, vec![content]); + let searcher = Searcher::new(&fetcher, &opener); while let Ok(line) = editor.readline(">>> ") { editor.add_history_entry(&line); - let limit = TopDocs::with_limit(16); - let query = parser.parse_query(&line).unwrap(); - let docs = searcher.search(&query, &limit).unwrap(); + let filter = searcher.parse(&line); - for (score, address) in docs { - let doc = searcher.doc(address).unwrap(); + let start = Instant::now(); + let stories = fetcher.filter(&filter); + let finish = (Instant::now() - start).as_millis(); + let count = stories.len(); - let story = match doc.get_first(sid).unwrap() { - Value::I64(value) => fetcher.fetch(*value).unwrap(), - _ => panic!("Invalid story key type!"), - }; + println!("Found {} stories in {} milliseconds!", count, finish); - println!("{:02.0}% [{:06}] {}", score, story.id, story.title); + if count > 32 { + continue; + } + + for story in stories.iter() { + let key = &story.id; + let title = &story.title; + + println!("[{}] {}", key, title); } } } diff --git a/search/Cargo.toml b/search/Cargo.toml new file mode 100644 index 0000000..6a82059 --- /dev/null +++ b/search/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "fimfareader-search" +version = "0.1.0" +authors = ["Joakim Soderlund "] +edition = "2018" + +[dependencies.fimfareader] +path = ".." + +[dependencies.rayon] +version = "*" + +[dependencies.tantivy] +version = "*" + +[dependencies.thread_local] +version = "*" + +[dependencies.zip] +version = "*" +features = ["deflate"] +default-features = false diff --git a/search/src/lib.rs b/search/src/lib.rs new file mode 100644 index 0000000..04e80f8 --- /dev/null +++ b/search/src/lib.rs @@ -0,0 +1,163 @@ +//! Main module. + +use std::cell::RefCell; +use std::io::Cursor; +use std::io::Read; +use std::io::Seek; +use std::path::Path; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; +use std::time::Instant; + +use rayon::iter::ParallelIterator; +use zip::read::ZipArchive; + +use tantivy::collector::TopDocs; +use tantivy::directory::MmapDirectory; +use tantivy::query::QueryParser; +use tantivy::schema; +use tantivy::schema::Document; +use tantivy::schema::Schema; +use tantivy::schema::Value; +use tantivy::Index; +use tantivy::ReloadPolicy; + +use thread_local::ThreadLocal; + +use fimfareader::prelude::*; + +pub struct Searcher { + schema: Schema, + index: Index, +} + +impl Searcher { + pub fn new(fetcher: &Fetcher, f: &F) -> Self + where + T: Read + Seek + Send, + F: Fn() -> ZipArchive + Sync, + { + let mut builder = Schema::builder(); + builder.add_i64_field("sid", schema::INDEXED | schema::STORED); + builder.add_text_field("content", schema::TEXT); + let schema = builder.build(); + + let index = Self::load_index(schema.clone(), fetcher, f); + + Searcher { schema, index } + } + + fn load_index(schema: Schema, fetcher: &Fetcher, f: &F) -> Index + where + T: Read + Seek + Send, + F: Fn() -> ZipArchive + Sync, + { + let identity = fetcher.identity().unwrap(); + let directory = Path::new("cache").join(identity); + + if !directory.exists() { + Self::make_index(schema.clone(), fetcher, f); + } + + let store = MmapDirectory::open(&directory).unwrap(); + return Index::open_or_create(store, schema).unwrap(); + } + + fn make_index(schema: Schema, fetcher: &Fetcher, f: &F) + where + T: Read + Seek + Send, + F: Fn() -> ZipArchive + Sync, + { + let identity = fetcher.identity().unwrap(); + let directory = Path::new("cache").join(identity); + + std::fs::create_dir_all(&directory).unwrap(); + let store = MmapDirectory::open(&directory).unwrap(); + let index = Index::create(store, schema).unwrap(); + + let schema = index.schema(); + let sid = schema.get_field("sid").unwrap(); + let content = schema.get_field("content").unwrap(); + let mut writer = index.writer(536_870_912).unwrap(); + + let counter = AtomicUsize::new(0); + let total = fetcher.iter().count(); + let start = Instant::now(); + + let local = ThreadLocal::new(); + + fetcher.par_iter().for_each(|story| { + let mut doc = Document::default(); + doc.add_i64(sid, story.id); + + let archive = local.get_or(|| RefCell::new(f())); + + let mut archive = archive.borrow_mut(); + let mut file = archive.by_name(&story.archive.path).unwrap(); + let mut data = Vec::with_capacity(file.size() as usize); + let mut text = String::with_capacity(1_048_576); + + file.read_to_end(&mut data).unwrap(); + let mut arch = ZipArchive::new(Cursor::new(data)).unwrap(); + let count = counter.fetch_add(1, Ordering::SeqCst); + + let percentage = (count as f64 / total as f64) * 100f64; + print!("\r\rIndexing archive... {:.2}%\r\r", percentage); + + for i in 0..arch.len() { + let mut file = arch.by_index(i).unwrap(); + + if !file.name().ends_with(".html") { + continue; + } + + file.read_to_string(&mut text).unwrap(); + doc.add_text(content, &text); + text.clear(); + } + + writer.add_document(doc); + }); + + writer.commit().unwrap(); + + let finish = (Instant::now() - start).as_secs(); + println!("Index generated in {} seconds.", finish); + } + + pub fn parse(&self, text: &str) -> impl Fn(&Story) -> bool + Sync { + let reader = self + .index + .reader_builder() + .reload_policy(ReloadPolicy::OnCommit) + .try_into() + .unwrap(); + + let searcher = reader.searcher(); + let identitfier = self.schema.get_field("sid").unwrap(); + let content = self.schema.get_field("content").unwrap(); + let parser = QueryParser::for_index(&self.index, vec![content]); + + let limit = TopDocs::with_limit(16); + let query = parser.parse_query(&text).unwrap(); + let docs = searcher.search(&query, &limit).unwrap(); + + let mut sids: Vec = docs + .into_iter() + .map(|(score, address)| { + let doc = searcher.doc(address).unwrap(); + + match doc.get_first(identitfier) { + Some(Value::I64(value)) => (*value, score), + _ => panic!("Invalid story key type!"), + } + }) + .filter(|(_, score)| *score > 0.1) + .map(|(sid, _)| sid) + .collect(); + + sids.sort(); + + move |story| sids.binary_search(&story.id).is_ok() + } +}