Mirror of https://github.com/JockeTF/fimfareader.git, synced 2024-11-23 13:58:00 +01:00

Split search into its own crate

commit c6a9e88834 (parent aae361767b)
7 changed files with 224 additions and 127 deletions
.gitignore (vendored): 1 change

@@ -1,2 +1,3 @@
+/cache
 /target
 **/*.rs.bk
Cargo.lock (generated): 16 changes

@@ -345,11 +345,8 @@ name = "fimfareader-cli"
 version = "0.1.0"
 dependencies = [
  "fimfareader",
- "fimfareader-query",
- "rayon",
+ "fimfareader-search",
  "rustyline",
- "tantivy",
- "thread_local",
  "zip",
 ]

@@ -365,6 +362,17 @@ dependencies = [
  "regex",
 ]

+[[package]]
+name = "fimfareader-search"
+version = "0.1.0"
+dependencies = [
+ "fimfareader",
+ "rayon",
+ "tantivy",
+ "thread_local",
+ "zip",
+]
+
 [[package]]
 name = "flate2"
 version = "1.0.24"
Cargo.toml

@@ -8,6 +8,7 @@ edition = "2021"
 members = [
     "cli",
     "query",
+    "search",
 ]

 default-members = [
cli/Cargo.toml

@@ -7,22 +7,13 @@ edition = "2018"
 [dependencies.fimfareader]
 path = ".."

-[dependencies.fimfareader-query]
-path = "../query"
+[dependencies.fimfareader-search]
+path = "../search"

 [dependencies.rustyline]
 default-features = false
 version = "*"

-[dependencies.rayon]
-version = "*"
-
-[dependencies.tantivy]
-version = "*"
-
-[dependencies.thread_local]
-version = "*"
-
 [dependencies.zip]
 version = "*"
 features = ["deflate"]
cli/src/main.rs: 135 changes

@@ -1,34 +1,16 @@
 //! Main module.

-use std::cell::RefCell;
 use std::env::args;
 use std::fs::File;
 use std::io::BufReader;
-use std::io::Cursor;
-use std::io::Read;
-use std::io::Seek;
-use std::path::Path;
-use std::sync::atomic::AtomicUsize;
-use std::sync::atomic::Ordering;
 use std::time::Instant;

-use rayon::iter::ParallelIterator;
 use rustyline::Editor;
-use zip::read::ZipArchive;

-use tantivy::collector::TopDocs;
-use tantivy::directory::MmapDirectory;
-use tantivy::query::QueryParser;
-use tantivy::schema;
-use tantivy::schema::Document;
-use tantivy::schema::Schema;
-use tantivy::schema::Value;
-use tantivy::Index;
-use tantivy::ReloadPolicy;
-
-use thread_local::ThreadLocal;
+use zip::ZipArchive;

 use fimfareader::prelude::*;
+use fimfareader_search::Searcher;

 fn exit(error: Error) -> ! {
     eprintln!("{}", error);

@@ -36,77 +18,6 @@ fn exit(error: Error) -> ! {
     std::process::exit(1)
 }

-fn load_index<T>(schema: Schema, fetcher: &Fetcher<T>, path: &str) -> Index
-where
-    T: Read + Seek + Send,
-{
-    let identity = fetcher.identity().unwrap();
-    let directory = Path::new("search").join(identity);
-
-    if directory.exists() {
-        let store = MmapDirectory::open(&directory).unwrap();
-        return Index::open_or_create(store, schema).unwrap();
-    }
-
-    std::fs::create_dir_all(&directory).unwrap();
-    let store = MmapDirectory::open(&directory).unwrap();
-    let index = Index::create(store, schema).unwrap();
-
-    let schema = index.schema();
-    let sid = schema.get_field("sid").unwrap();
-    let content = schema.get_field("content").unwrap();
-    let mut writer = index.writer(536_870_912).unwrap();
-
-    let counter = AtomicUsize::new(0);
-    let total = fetcher.iter().count();
-    let start = Instant::now();
-
-    let local = ThreadLocal::new();
-
-    fetcher.par_iter().for_each(|story| {
-        let mut doc = Document::default();
-        doc.add_i64(sid, story.id);
-
-        let archive = local.get_or(|| {
-            let reader = BufReader::new(File::open(&path).unwrap());
-            RefCell::new(ZipArchive::new(reader).unwrap())
-        });
-
-        let mut archive = archive.borrow_mut();
-        let mut file = archive.by_name(&story.archive.path).unwrap();
-        let mut data = Vec::with_capacity(file.size() as usize);
-        let mut text = String::with_capacity(1_048_576);
-
-        file.read_to_end(&mut data).unwrap();
-        let mut arch = ZipArchive::new(Cursor::new(data)).unwrap();
-        let count = counter.fetch_add(1, Ordering::SeqCst);
-
-        let percentage = (count as f64 / total as f64) * 100f64;
-        print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
-
-        for i in 0..arch.len() {
-            let mut file = arch.by_index(i).unwrap();
-
-            if !file.name().ends_with(".html") {
-                continue;
-            }
-
-            file.read_to_string(&mut text).unwrap();
-            doc.add_text(content, &text);
-            text.clear();
-        }
-
-        writer.add_document(doc);
-    });
-
-    writer.commit().unwrap();
-
-    let finish = (Instant::now() - start).as_secs();
-    println!("Index generated in {} seconds.", finish);
-
-    index
-}
-
 fn main() {
     let argv = args().collect::<Vec<String>>();
     let mut editor = Editor::<()>::new();

@@ -127,36 +38,36 @@ fn main() {
     println!("Finished loading in {} milliseconds.", finish);
     println!("The archive contains {} stories.", count);

-    let mut builder = Schema::builder();
-    let sid = builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
-    let content = builder.add_text_field("content", schema::TEXT);
-    let index = load_index(builder.build(), &fetcher, &argv[1]);
+    let opener = || {
+        let file = File::open(&argv[1]).unwrap();
+        let reader = BufReader::new(file);

-    let reader = index
-        .reader_builder()
-        .reload_policy(ReloadPolicy::OnCommit)
-        .try_into()
-        .unwrap();
+        ZipArchive::new(reader).unwrap()
+    };

-    let searcher = reader.searcher();
-    let parser = QueryParser::for_index(&index, vec![content]);
+    let searcher = Searcher::new(&fetcher, &opener);

     while let Ok(line) = editor.readline(">>> ") {
         editor.add_history_entry(&line);

-        let limit = TopDocs::with_limit(16);
-        let query = parser.parse_query(&line).unwrap();
-        let docs = searcher.search(&query, &limit).unwrap();
+        let filter = searcher.parse(&line);

-        for (score, address) in docs {
-            let doc = searcher.doc(address).unwrap();
+        let start = Instant::now();
+        let stories = fetcher.filter(&filter);
+        let finish = (Instant::now() - start).as_millis();
+        let count = stories.len();

-            let story = match doc.get_first(sid).unwrap() {
-                Value::I64(value) => fetcher.fetch(*value).unwrap(),
-                _ => panic!("Invalid story key type!"),
-            };
+        println!("Found {} stories in {} milliseconds!", count, finish);

-            println!("{:02.0}% [{:06}] {}", score, story.id, story.title);
+        if count > 32 {
+            continue;
+        }
+
+        for story in stories.iter() {
+            let key = &story.id;
+            let title = &story.title;
+
+            println!("[{}] {}", key, title);
         }
     }
 }
search/Cargo.toml (new file): 22 changes

@@ -0,0 +1,22 @@
+[package]
+name = "fimfareader-search"
+version = "0.1.0"
+authors = ["Joakim Soderlund <joakim.soderlund@gmail.com>"]
+edition = "2018"
+
+[dependencies.fimfareader]
+path = ".."
+
+[dependencies.rayon]
+version = "*"
+
+[dependencies.tantivy]
+version = "*"
+
+[dependencies.thread_local]
+version = "*"
+
+[dependencies.zip]
+version = "*"
+features = ["deflate"]
+default-features = false
search/src/lib.rs (new file): 163 changes

@@ -0,0 +1,163 @@
+//! Main module.
+
+use std::cell::RefCell;
+use std::io::Cursor;
+use std::io::Read;
+use std::io::Seek;
+use std::path::Path;
+use std::sync::atomic::AtomicUsize;
+use std::sync::atomic::Ordering;
+use std::time::Instant;
+
+use rayon::iter::ParallelIterator;
+use zip::read::ZipArchive;
+
+use tantivy::collector::TopDocs;
+use tantivy::directory::MmapDirectory;
+use tantivy::query::QueryParser;
+use tantivy::schema;
+use tantivy::schema::Document;
+use tantivy::schema::Schema;
+use tantivy::schema::Value;
+use tantivy::Index;
+use tantivy::ReloadPolicy;
+
+use thread_local::ThreadLocal;
+
+use fimfareader::prelude::*;
+
+pub struct Searcher {
+    schema: Schema,
+    index: Index,
+}
+
+impl Searcher {
+    pub fn new<T, F>(fetcher: &Fetcher<T>, f: &F) -> Self
+    where
+        T: Read + Seek + Send,
+        F: Fn() -> ZipArchive<T> + Sync,
+    {
+        let mut builder = Schema::builder();
+        builder.add_i64_field("sid", schema::INDEXED | schema::STORED);
+        builder.add_text_field("content", schema::TEXT);
+        let schema = builder.build();
+
+        let index = Self::load_index(schema.clone(), fetcher, f);
+
+        Searcher { schema, index }
+    }
+
+    fn load_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F) -> Index
+    where
+        T: Read + Seek + Send,
+        F: Fn() -> ZipArchive<T> + Sync,
+    {
+        let identity = fetcher.identity().unwrap();
+        let directory = Path::new("cache").join(identity);
+
+        if !directory.exists() {
+            Self::make_index(schema.clone(), fetcher, f);
+        }
+
+        let store = MmapDirectory::open(&directory).unwrap();
+        return Index::open_or_create(store, schema).unwrap();
+    }
+
+    fn make_index<T, F>(schema: Schema, fetcher: &Fetcher<T>, f: &F)
+    where
+        T: Read + Seek + Send,
+        F: Fn() -> ZipArchive<T> + Sync,
+    {
+        let identity = fetcher.identity().unwrap();
+        let directory = Path::new("cache").join(identity);
+
+        std::fs::create_dir_all(&directory).unwrap();
+        let store = MmapDirectory::open(&directory).unwrap();
+        let index = Index::create(store, schema).unwrap();
+
+        let schema = index.schema();
+        let sid = schema.get_field("sid").unwrap();
+        let content = schema.get_field("content").unwrap();
+        let mut writer = index.writer(536_870_912).unwrap();
+
+        let counter = AtomicUsize::new(0);
+        let total = fetcher.iter().count();
+        let start = Instant::now();
+
+        let local = ThreadLocal::new();
+
+        fetcher.par_iter().for_each(|story| {
+            let mut doc = Document::default();
+            doc.add_i64(sid, story.id);
+
+            let archive = local.get_or(|| RefCell::new(f()));
+
+            let mut archive = archive.borrow_mut();
+            let mut file = archive.by_name(&story.archive.path).unwrap();
+            let mut data = Vec::with_capacity(file.size() as usize);
+            let mut text = String::with_capacity(1_048_576);
+
+            file.read_to_end(&mut data).unwrap();
+            let mut arch = ZipArchive::new(Cursor::new(data)).unwrap();
+            let count = counter.fetch_add(1, Ordering::SeqCst);
+
+            let percentage = (count as f64 / total as f64) * 100f64;
+            print!("\r\rIndexing archive... {:.2}%\r\r", percentage);
+
+            for i in 0..arch.len() {
+                let mut file = arch.by_index(i).unwrap();
+
+                if !file.name().ends_with(".html") {
+                    continue;
+                }
+
+                file.read_to_string(&mut text).unwrap();
+                doc.add_text(content, &text);
+                text.clear();
+            }
+
+            writer.add_document(doc);
+        });
+
+        writer.commit().unwrap();
+
+        let finish = (Instant::now() - start).as_secs();
+        println!("Index generated in {} seconds.", finish);
+    }
+
+    pub fn parse(&self, text: &str) -> impl Fn(&Story) -> bool + Sync {
+        let reader = self
+            .index
+            .reader_builder()
+            .reload_policy(ReloadPolicy::OnCommit)
+            .try_into()
+            .unwrap();
+
+        let searcher = reader.searcher();
+        let identitfier = self.schema.get_field("sid").unwrap();
+        let content = self.schema.get_field("content").unwrap();
+        let parser = QueryParser::for_index(&self.index, vec![content]);
+
+        let limit = TopDocs::with_limit(16);
+        let query = parser.parse_query(&text).unwrap();
+        let docs = searcher.search(&query, &limit).unwrap();
+
+        let mut sids: Vec<i64> = docs
+            .into_iter()
+            .map(|(score, address)| {
+                let doc = searcher.doc(address).unwrap();
+
+                match doc.get_first(identitfier) {
+                    Some(Value::I64(value)) => (*value, score),
+                    _ => panic!("Invalid story key type!"),
+                }
+            })
+            .filter(|(_, score)| *score > 0.1)
+            .map(|(sid, _)| sid)
+            .collect();
+
+        sids.sort();
+
+        move |story| sids.binary_search(&story.id).is_ok()
+    }
+}
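
For orientation, a minimal sketch (not part of the commit) of how a caller could wire the new crate together, mirroring the usage in cli/src/main.rs above. The helper name search_demo and its arguments are illustrative; constructing the Fetcher itself happens elsewhere, as it does in main(), and the fetcher is assumed to be backed by a buffered file like the CLI's.

// Sketch only: mirrors the CLI usage shown in the diff above.
// `search_demo` is a hypothetical helper; the Fetcher is assumed to
// already exist and to be Fetcher<BufReader<File>>, as in the CLI.

use std::fs::File;
use std::io::BufReader;

use zip::ZipArchive;

use fimfareader::prelude::*;
use fimfareader_search::Searcher;

fn search_demo(fetcher: &Fetcher<BufReader<File>>, path: &str, query: &str) {
    // The searcher indexes in parallel, so instead of a single reader it
    // takes a closure that opens a fresh archive handle per worker thread.
    let opener = || {
        let file = File::open(path).unwrap();
        ZipArchive::new(BufReader::new(file)).unwrap()
    };

    // Builds the tantivy index under cache/<identity> on first use and
    // reuses it afterwards.
    let searcher = Searcher::new(fetcher, &opener);

    // parse() turns a full-text query into an ordinary filter closure,
    // so the existing Fetcher::filter API stays unchanged.
    let filter = searcher.parse(query);
    let stories = fetcher.filter(&filter);

    for story in stories.iter() {
        println!("[{}] {}", story.id, story.title);
    }
}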