Split search into its own crate

This commit is contained in:
Joakim Soderlund 2020-05-21 18:12:50 +02:00
parent aae361767b
commit c6a9e88834
7 changed files with 224 additions and 127 deletions

1
.gitignore vendored
View file

@ -1,2 +1,3 @@
/cache
/target /target
**/*.rs.bk **/*.rs.bk

16
Cargo.lock generated
View file

@ -345,11 +345,8 @@ name = "fimfareader-cli"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"fimfareader", "fimfareader",
"fimfareader-query", "fimfareader-search",
"rayon",
"rustyline", "rustyline",
"tantivy",
"thread_local",
"zip", "zip",
] ]
@ -365,6 +362,17 @@ dependencies = [
"regex", "regex",
] ]
[[package]]
name = "fimfareader-search"
version = "0.1.0"
dependencies = [
"fimfareader",
"rayon",
"tantivy",
"thread_local",
"zip",
]
[[package]] [[package]]
name = "flate2" name = "flate2"
version = "1.0.24" version = "1.0.24"

View file

@ -8,6 +8,7 @@ edition = "2021"
members = [ members = [
"cli", "cli",
"query", "query",
"search",
] ]
default-members = [ default-members = [

View file

@ -7,22 +7,13 @@ edition = "2018"
[dependencies.fimfareader] [dependencies.fimfareader]
path = ".." path = ".."
[dependencies.fimfareader-query] [dependencies.fimfareader-search]
path = "../query" path = "../search"
[dependencies.rustyline] [dependencies.rustyline]
default-features = false default-features = false
version = "*" version = "*"
[dependencies.rayon]
version = "*"
[dependencies.tantivy]
version = "*"
[dependencies.thread_local]
version = "*"
[dependencies.zip] [dependencies.zip]
version = "*" version = "*"
features = ["deflate"] features = ["deflate"]

View file

@ -1,34 +1,16 @@
//! Main module. //! Main module.
use std::cell::RefCell;
use std::env::args; use std::env::args;
use std::fs::File; use std::fs::File;
use std::io::BufReader; use std::io::BufReader;
use std::io::Cursor;
use std::io::Read;
use std::io::Seek;
use std::path::Path;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::time::Instant; use std::time::Instant;
use rayon::iter::ParallelIterator;
use rustyline::Editor; use rustyline::Editor;
use zip::read::ZipArchive;
use tantivy::collector::TopDocs; use zip::ZipArchive;
use tantivy::directory::MmapDirectory;
use tantivy::query::QueryParser;
use tantivy::schema;
use tantivy::schema::Document;
use tantivy::schema::Schema;
use tantivy::schema::Value;
use tantivy::Index;
use tantivy::ReloadPolicy;
use thread_local::ThreadLocal;
use fimfareader::prelude::*; use fimfareader::prelude::*;
use fimfareader_search::Searcher;
fn exit(error: Error) -> ! { fn exit(error: Error) -> ! {
eprintln!("{}", error); eprintln!("{}", error);
@ -36,77 +18,6 @@ fn exit(error: Error) -> ! {
std::process::exit(1) std::process::exit(1)
} }
fn load_index<T>(schema: Schema, fetcher: &Fetcher<T>, path: &str) -> Index
where
T: Read + Seek + Send,
{
let identity = fetcher.identity().unwrap();
let directory = Path::new("search").join(identity);
if directory.exists() {
let store = MmapDirectory::open(&directory).unwrap();
return Index::open_or_create(store, schema).unwrap();
}
std::fs::create_dir_all(&directory).unwrap();
let store = MmapDirectory::open(&directory).unwrap();
let index = Index::create(store, schema).unwrap();
let schema = index.schema();
let sid = schema.get_field("sid").unwrap();
let content = schema.get_field("content").unwrap();
let mut writer = index.writer(536_870_912).unwrap();
let counter = AtomicUsize::new(0);
let total = fetcher.iter().count();
let start = Instant::now();
let local = ThreadLocal::new();
fetcher.par_iter().for_each(|story| {
let mut doc = Document::default();
doc.add_i64(sid, story.id);
let archive = local.get_or(|| {
let reader = BufReader::new(File::open(&path).unwrap());
RefCell::new(ZipArchive::new(reader).unwrap())
});
let mut archive = archive.borrow_mut();
let mut file = archive.by_name(&story.archive.path).unwrap();
let mut data = Vec::with_capacity(file.size() as usize);
let mut text = String::with_capacity(1_048_576);
file.read_to_end(&mut data).unwrap();
let mut arch = ZipArchive::new(Cursor::new(data)).unwrap();
let count = counter.fetch_add(1, Ordering::SeqCst);
let percentage = (count as f64 / total as f64) * 100f64;
print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
for i in 0..arch.len() {
let mut file = arch.by_index(i).unwrap();
if !file.name().ends_with(".html") {
continue;
}
file.read_to_string(&mut text).unwrap();
doc.add_text(content, &text);
text.clear();
}
writer.add_document(doc);
});
writer.commit().unwrap();
let finish = (Instant::now() - start).as_secs();
println!("Index generated in {} seconds.", finish);
index
}
fn main() { fn main() {
let argv = args().collect::<Vec<String>>(); let argv = args().collect::<Vec<String>>();
let mut editor = Editor::<()>::new(); let mut editor = Editor::<()>::new();
@ -127,36 +38,36 @@ fn main() {
println!("Finished loading in {} milliseconds.", finish); println!("Finished loading in {} milliseconds.", finish);
println!("The archive contains {} stories.", count); println!("The archive contains {} stories.", count);
let mut builder = Schema::builder(); let opener = || {
let sid = builder.add_i64_field("sid", schema::INDEXED | schema::STORED); let file = File::open(&argv[1]).unwrap();
let content = builder.add_text_field("content", schema::TEXT); let reader = BufReader::new(file);
let index = load_index(builder.build(), &fetcher, &argv[1]);
let reader = index ZipArchive::new(reader).unwrap()
.reader_builder() };
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
let searcher = reader.searcher(); let searcher = Searcher::new(&fetcher, &opener);
let parser = QueryParser::for_index(&index, vec![content]);
while let Ok(line) = editor.readline(">>> ") { while let Ok(line) = editor.readline(">>> ") {
editor.add_history_entry(&line); editor.add_history_entry(&line);
let limit = TopDocs::with_limit(16); let filter = searcher.parse(&line);
let query = parser.parse_query(&line).unwrap();
let docs = searcher.search(&query, &limit).unwrap();
for (score, address) in docs { let start = Instant::now();
let doc = searcher.doc(address).unwrap(); let stories = fetcher.filter(&filter);
let finish = (Instant::now() - start).as_millis();
let count = stories.len();
let story = match doc.get_first(sid).unwrap() { println!("Found {} stories in {} milliseconds!", count, finish);
Value::I64(value) => fetcher.fetch(*value).unwrap(),
_ => panic!("Invalid story key type!"),
};
println!("{:02.0}% [{:06}] {}", score, story.id, story.title); if count > 32 {
continue;
}
for story in stories.iter() {
let key = &story.id;
let title = &story.title;
println!("[{}] {}", key, title);
} }
} }
} }

22
search/Cargo.toml Normal file
View file

@ -0,0 +1,22 @@
[package]
name = "fimfareader-search"
version = "0.1.0"
authors = ["Joakim Soderlund <joakim.soderlund@gmail.com>"]
edition = "2018"
[dependencies.fimfareader]
path = ".."
[dependencies.rayon]
version = "*"
[dependencies.tantivy]
version = "*"
[dependencies.thread_local]
version = "*"
[dependencies.zip]
version = "*"
features = ["deflate"]
default-features = false

163
search/src/lib.rs Normal file
View file

@ -0,0 +1,163 @@
//! Main module.
use std::cell::RefCell;
use std::io::Cursor;
use std::io::Read;
use std::io::Seek;
use std::path::Path;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::time::Instant;
use rayon::iter::ParallelIterator;
use zip::read::ZipArchive;
use tantivy::collector::TopDocs;
use tantivy::directory::MmapDirectory;
use tantivy::query::QueryParser;
use tantivy::schema;
use tantivy::schema::Document;
use tantivy::schema::Schema;
use tantivy::schema::Value;
use tantivy::Index;
use tantivy::ReloadPolicy;
use thread_local::ThreadLocal;
use fimfareader::prelude::*;
/// Full-text story search backed by a persistent Tantivy index.
///
/// Construct with [`Searcher::new`]; the index is cached on disk under
/// `cache/<archive identity>` and reused across runs.
pub struct Searcher {
    /// The Tantivy index holding one document per story.
    index: Index,
    /// The schema the index was built with (`sid` + `content` fields).
    schema: Schema,
}
impl Searcher {
    /// Creates a searcher for the stories in `fetcher`.
    ///
    /// `f` must open a fresh handle to the story archive; it is called
    /// once per indexing thread. The on-disk index is loaded from the
    /// cache when present, otherwise it is built first (which reads the
    /// entire archive and may take a long time).
    pub fn new<T, F>(fetcher: &Fetcher<T>, f: &F) -> Self
    where
        T: Read + Seek + Send,
        F: Fn() -> ZipArchive<T> + Sync,
    {
        let mut builder = Schema::builder();

        // `sid` is STORED so hits can be mapped back to story IDs;
        // `content` is indexed full-text but not stored.
        builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
        builder.add_text_field("content", schema::TEXT);

        let schema = builder.build();
        let index = Self::load_index(schema.clone(), fetcher, f);

        Searcher { schema, index }
    }

    /// Opens the cached index for this archive, building it first if no
    /// cache directory exists yet. The cache key is the fetcher's
    /// archive identity, so different archives get separate indexes.
    fn load_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F) -> Index
    where
        T: Read + Seek + Send,
        F: Fn() -> ZipArchive<T> + Sync,
    {
        let identity = fetcher.identity().unwrap();
        let directory = Path::new("cache").join(identity);

        if !directory.exists() {
            Self::make_index(schema.clone(), fetcher, f);
        }

        let store = MmapDirectory::open(&directory).unwrap();

        Index::open_or_create(store, schema).unwrap()
    }

    /// Builds the index from scratch by reading every story's HTML
    /// chapters out of the archive and committing them to disk.
    ///
    /// Runs in parallel over the stories; each worker thread opens its
    /// own archive handle via `f` (zip readers are not shareable).
    fn make_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F)
    where
        T: Read + Seek + Send,
        F: Fn() -> ZipArchive<T> + Sync,
    {
        let identity = fetcher.identity().unwrap();
        let directory = Path::new("cache").join(identity);

        std::fs::create_dir_all(&directory).unwrap();

        let store = MmapDirectory::open(&directory).unwrap();
        let index = Index::create(store, schema).unwrap();

        let schema = index.schema();
        let sid = schema.get_field("sid").unwrap();
        let content = schema.get_field("content").unwrap();

        // 512 MiB writer heap; the writer is thread-safe and shared by
        // all workers below.
        let mut writer = index.writer(536_870_912).unwrap();

        let counter = AtomicUsize::new(0);
        let total = fetcher.iter().count();
        let start = Instant::now();
        let local = ThreadLocal::new();

        fetcher.par_iter().for_each(|story| {
            let mut doc = Document::default();
            doc.add_i64(sid, story.id);

            // One archive handle per thread, opened lazily on first use.
            let archive = local.get_or(|| RefCell::new(f()));
            let mut archive = archive.borrow_mut();

            // Each story is itself a zip inside the outer archive:
            // read it fully into memory, then open it as a zip.
            let mut file = archive.by_name(&story.archive.path).unwrap();
            let mut data = Vec::with_capacity(file.size() as usize);
            let mut text = String::with_capacity(1_048_576);

            file.read_to_end(&mut data).unwrap();

            let mut arch = ZipArchive::new(Cursor::new(data)).unwrap();

            let count = counter.fetch_add(1, Ordering::SeqCst);
            let percentage = (count as f64 / total as f64) * 100f64;

            print!("\r\rIndexing archive... {:.2}%\r\r", percentage);

            // Index only the HTML chapter files, all into one document.
            for i in 0..arch.len() {
                let mut file = arch.by_index(i).unwrap();

                if !file.name().ends_with(".html") {
                    continue;
                }

                file.read_to_string(&mut text).unwrap();
                doc.add_text(content, &text);
                text.clear();
            }

            writer.add_document(doc);
        });

        writer.commit().unwrap();

        let finish = (Instant::now() - start).as_secs();
        println!("Index generated in {} seconds.", finish);
    }

    /// Parses `text` as a Tantivy query and returns a story predicate
    /// matching the top results.
    ///
    /// Runs the search eagerly: the top 16 hits with score above 0.1
    /// are resolved to story IDs, and the returned closure tests
    /// membership in that (sorted) ID set via binary search.
    pub fn parse(&self, text: &str) -> impl Fn(&Story) -> bool + Sync {
        // A fresh reader per call; the index only changes when rebuilt,
        // so reload-on-commit is effectively a no-op here.
        let reader = self
            .index
            .reader_builder()
            .reload_policy(ReloadPolicy::OnCommit)
            .try_into()
            .unwrap();

        let searcher = reader.searcher();

        let identifier = self.schema.get_field("sid").unwrap();
        let content = self.schema.get_field("content").unwrap();
        let parser = QueryParser::for_index(&self.index, vec![content]);

        let limit = TopDocs::with_limit(16);
        let query = parser.parse_query(text).unwrap();
        let docs = searcher.search(&query, &limit).unwrap();

        let mut sids: Vec<i64> = docs
            .into_iter()
            .map(|(score, address)| {
                let doc = searcher.doc(address).unwrap();

                match doc.get_first(identifier) {
                    Some(Value::I64(value)) => (*value, score),
                    _ => panic!("Invalid story key type!"),
                }
            })
            .filter(|(_, score)| *score > 0.1)
            .map(|(sid, _)| sid)
            .collect();

        // Sorted so the closure can binary-search instead of scanning.
        sids.sort_unstable();

        move |story| sids.binary_search(&story.id).is_ok()
    }
}