diff --git a/Cargo.lock b/Cargo.lock index 87bd556..9735aa2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,9 +43,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "cc" -version = "1.0.72" +version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" [[package]] name = "cfg-if" @@ -88,19 +88,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "console" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28b32d32ca44b70c3e4acd7db1babf555fa026e385fb95f18028f88848b3c31" -dependencies = [ - "encode_unicode", - "libc", - "once_cell", - "terminal_size", - "winapi", -] - [[package]] name = "crc32fast" version = "1.3.2" @@ -160,12 +147,6 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" -[[package]] -name = "encode_unicode" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" - [[package]] name = "endian-type" version = "0.1.2" @@ -205,9 +186,9 @@ dependencies = [ [[package]] name = "fd-lock" -version = "3.0.3" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcef756dea9cf3db5ce73759cf0467330427a786b47711b8d6c97620d718ceb9" +checksum = "02ecad9808e0596f8956d14f7fa868f996290bd01c8d7329d6e5bc2bb76adf8f" dependencies = [ "cfg-if", "rustix", @@ -240,11 +221,8 @@ dependencies = [ name = "fimfareader-hpm" version = "0.1.0" dependencies = [ - "chrono", "fimfareader", - "indicatif", "rayon", - "regex", "zip", ] @@ -287,27 +265,11 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" -[[package]] -name = "indicatif" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" -dependencies = [ - "console", - "lazy_static", - "number_prefix", - "rayon", - "regex", -] - [[package]] name = "io-lifetimes" -version = "0.4.4" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ef6787e7f0faedc040f95716bdd0e62bcfcf4ba93da053b62dea2691c13864" -dependencies = [ - "winapi", -] +checksum = "ec58677acfea8a15352d42fc87d11d63596ade9239e0a7c9352914417515dbe6" [[package]] name = "itoa" @@ -336,15 +298,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.117" +version = "0.2.119" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" +checksum = "1bf2e165bb3457c8e098ea76f3e3bc9db55f87aa90d52d0e6be741470916aaa4" [[package]] name = "linux-raw-sys" -version = "0.0.37" +version = "0.0.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95f5690fef754d905294c56f7ac815836f2513af966aa47f2e07ac79be07827f" +checksum = "5284f00d480e1c39af34e72f8ad60b94f47007e3481cd3b731c1d67190ddc7b7" [[package]] name = "log" @@ -442,18 +404,6 @@ dependencies = [ "libc", ] -[[package]] -name = "number_prefix" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - -[[package]] -name = "once_cell" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" - [[package]] name = "proc-macro2" version = "1.0.36" @@ -526,9 +476,9 @@ checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "rustix" -version = "0.32.1" +version = "0.33.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cee647393af53c750e15dcbf7781cdd2e550b246bde76e46c326e7ea3c73773" +checksum = "a9466f25b92a648960ac1042fd3baa6b0bf285e60f754d7e5070770c813a177a" dependencies = [ "bitflags", "errno", @@ -601,9 +551,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.78" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085" +checksum = "8e8d9fa5c3b304765ce1fd9c4c8a3de2c8db365a5b91be52f186efc675681d95" dependencies = [ "itoa", "ryu", @@ -639,16 +589,6 @@ dependencies = [ "unicode-xid", ] -[[package]] -name = "terminal_size" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "thiserror" version = "1.0.30" diff --git a/Cargo.toml b/Cargo.toml index 5335c22..b8695f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ members = [ ] default-members = [ - "cli", + "hpm", ] [profile.dev.package."*"] diff --git a/hpm/Cargo.toml b/hpm/Cargo.toml index 16a8b10..101876b 100644 --- a/hpm/Cargo.toml +++ b/hpm/Cargo.toml @@ -7,19 +7,9 @@ edition = "2021" [dependencies.fimfareader] path = ".." -[dependencies.chrono] -version = "*" - -[dependencies.indicatif] -version = "*" -features = ["rayon"] - [dependencies.rayon] version = "*" -[dependencies.regex] -version = "*" - [dependencies.zip] version = "*" features = ["deflate"] diff --git a/hpm/src/main.rs b/hpm/src/main.rs index a970cb0..a8f3724 100644 --- a/hpm/src/main.rs +++ b/hpm/src/main.rs @@ -1,51 +1,71 @@ -use std::collections::HashMap; use std::env::args; use std::io::Cursor; use std::io::Read; -use chrono::prelude::*; use fimfareader::prelude::*; use rayon::prelude::*; -use indicatif::ParallelProgressIterator; - -use regex::Regex; -use regex::RegexBuilder; - use zip::ZipArchive; -// TODO: More varied and less literal matches. -const PATTERN: &str = "[^a-z]hug(s|ged|ging)?[^a-z]"; - +#[allow(unused)] +#[derive(Debug)] struct Stat { - date: DateTime, - count: u64, - words: u64, + story: i64, + chars: i64, + count: i64, } -fn count(regex: &Regex, story: &Story, data: Vec) -> Vec { +fn count(story: &Story, data: Vec) -> Stat { let mut archive = ZipArchive::new(Cursor::new(data)).unwrap(); - // TODO: Statistics per chapter. - let date = match story.date_published { - Some(published) => published, - None => return Vec::new(), - }; - - let mut matches = 0; + let id = story.id; + let mut count = 0; + let mut chars = 0; for i in 0..archive.len() { let mut file = archive.by_index(i).unwrap(); let mut data = String::with_capacity(file.size() as usize); - file.read_to_string(&mut data).unwrap(); - matches += regex.find_iter(&data).count(); + let bytes = match file.read_to_string(&mut data) { + Ok(bytes) => bytes, + Err(_) => { + return Stat { + story: id, + count: -1, + chars: -1, + } + } + }; + + let matches = data + .chars() + .enumerate() + .filter(|(_, chr)| *chr == '\u{9d}') + .map(|(index, _)| index) + .collect::>(); + + count += matches.len() as i64; + chars += bytes as i64; + + for pos in matches { + let min = pos.saturating_sub(32); + + let snip = data + .chars() + .skip(min) + .take(64) + .filter(|c| !c.is_whitespace() || *c == ' ') + .collect::(); + + println!("[{id:>6}] {snip}"); + } } - let count = matches as u64; - let words = story.num_words as u64; - - vec![Stat { date, count, words }] + Stat { + story: id, + count, + chars, + } } fn main() { @@ -58,37 +78,15 @@ fn main() { let fetcher = Fetcher::new(&argv[1]).unwrap(); - let pattern = RegexBuilder::new(PATTERN) - .case_insensitive(true) - .build() - .unwrap(); - let stats = fetcher .index() .par_iter() - .progress_count(fetcher.index().len() as u64) .map(|story| (story, fetcher.read(story).unwrap())) - .flat_map_iter(|(story, data)| count(&pattern, story, data)) - .collect::>(); - - // TODO: Finer granularity for better graphing. - let mut yearly = HashMap::::new(); + .map(|(story, data)| count(story, data)) + .filter(|stat| stat.count != 0) + .collect::>(); for stat in stats { - let year = stat.date.year(); - let value = yearly.remove(&year).unwrap_or_else(|| (0, 0)); - - yearly.insert(year, (value.0 + stat.count, value.1 + stat.words)); - } - - let mut yearly = yearly.into_iter().collect::>(); - - yearly.sort(); - - for (year, (count, words)) in yearly.into_iter() { - let modifier = 1_000_000f64 / words as f64; - let hpm = modifier * count as f64; - - println!("{year}: {hpm:.04}"); + println!("{stat:?}"); } }