From f87ae9410fd33491c2731935c4dd8fbaf2b175b9 Mon Sep 17 00:00:00 2001 From: Joakim Soderlund Date: Wed, 9 Feb 2022 22:46:06 +0100 Subject: [PATCH] Add initial HPM (Hugs Per Million (words)) counter --- Cargo.lock | 249 ++++++++++++++++++++++++++++++----------- Cargo.toml | 1 + hpm/Cargo.toml | 26 +++++ hpm/src/main.rs | 94 ++++++++++++++++ src/archive/fetcher.rs | 4 + 5 files changed, 307 insertions(+), 67 deletions(-) create mode 100644 hpm/Cargo.toml create mode 100644 hpm/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index cc80605..87bd556 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -25,9 +25,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "autocfg" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bitflags" @@ -69,20 +69,19 @@ dependencies = [ [[package]] name = "chrono-english" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0be5180df5f7c41fc2416bc038bc8d78d44db8136c415b94ccbc95f523dc38e9" +checksum = "f73d909da7eb4a7d88c679c3f5a1bc09d965754e0adb2e7627426cef96a00d6f" dependencies = [ "chrono", "scanlex", - "time", ] [[package]] name = "clipboard-win" -version = "4.2.2" +version = "4.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3db8340083d28acb43451166543b98c838299b7e0863621be53a338adceea0ed" +checksum = "2f3e1238132dc01f081e1cbb9dace14e5ef4c3a51ee244bd982275fb514605db" dependencies = [ "error-code", "str-buf", @@ -90,19 +89,32 @@ dependencies = [ ] [[package]] -name = "crc32fast" -version = "1.3.0" +name = "console" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "738c290dfaea84fc1ca15ad9c168d083b05a714e1efddd8edaab678dc28d2836" +checksum = "a28b32d32ca44b70c3e4acd7db1babf555fa026e385fb95f18028f88848b3c31" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "terminal_size", + "winapi", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ "cfg-if", ] [[package]] name = "crossbeam-channel" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa" dependencies = [ "cfg-if", "crossbeam-utils", @@ -121,9 +133,9 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.5" +version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" dependencies = [ "cfg-if", "crossbeam-utils", @@ -134,9 +146,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.5" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" dependencies = [ "cfg-if", "lazy_static", @@ -148,6 +160,12 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "endian-type" version = "0.1.2" @@ -155,10 +173,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" [[package]] -name = "error-code" -version = "2.3.0" +name = "errno" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5115567ac25674e0043e472be13d14e537f37ea8aa4bdc4aef0c89add1db1ff" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "error-code" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64f18991e7bf11e7ffee451b5318b5c1a73c52d0d0ada6e5a3017c8c1ced6a21" dependencies = [ "libc", "str-buf", @@ -166,12 +205,12 @@ dependencies = [ [[package]] name = "fd-lock" -version = "3.0.1" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfc110fe50727d46a428eed832df40affe9bf74d077cac1bf3f2718e823f14c5" +checksum = "fcef756dea9cf3db5ce73759cf0467330427a786b47711b8d6c97620d718ceb9" dependencies = [ "cfg-if", - "libc", + "rustix", "windows-sys", ] @@ -197,6 +236,18 @@ dependencies = [ "rustyline", ] +[[package]] +name = "fimfareader-hpm" +version = "0.1.0" +dependencies = [ + "chrono", + "fimfareader", + "indicatif", + "rayon", + "regex", + "zip", +] + [[package]] name = "fimfareader-query" version = "0.1.0" @@ -237,10 +288,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] -name = "itoa" -version = "0.4.8" +name = "indicatif" +version = "0.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" +checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" +dependencies = [ + "console", + "lazy_static", + "number_prefix", + "rayon", + "regex", +] + +[[package]] +name = "io-lifetimes" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ef6787e7f0faedc040f95716bdd0e62bcfcf4ba93da053b62dea2691c13864" +dependencies = [ + "winapi", +] + +[[package]] +name = "itoa" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" [[package]] name = "lazy_static" @@ -263,9 +336,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.109" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98a04dce437184842841303488f70d0188c5f51437d2a834dc097eafa909a01" +checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" + +[[package]] +name = "linux-raw-sys" +version = "0.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95f5690fef754d905294c56f7ac815836f2513af966aa47f2e07ac79be07827f" [[package]] name = "log" @@ -312,9 +391,9 @@ dependencies = [ [[package]] name = "nix" -version = "0.23.0" +version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f305c2c2e4c39a82f7bf0bf65fb557f9070ce06781d4f2454295cc34b1c43188" +checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" dependencies = [ "bitflags", "cc", @@ -355,28 +434,40 @@ dependencies = [ [[package]] name = "num_cpus" -version = "1.13.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi", "libc", ] [[package]] -name = "proc-macro2" -version = "1.0.32" +name = "number_prefix" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba508cc11742c0dc5c1659771673afbab7a0efab23aa17e854cbab0837ed0b43" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" + +[[package]] +name = "proc-macro2" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" dependencies = [ "unicode-xid", ] [[package]] name = "quote" -version = "1.0.10" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" +checksum = "864d3e96a899863136fc6e99f3d7cae289dafe43bf2c5ac19b70df7210c0a145" dependencies = [ "proc-macro2", ] @@ -434,10 +525,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] -name = "rustyline" -version = "9.1.0" +name = "rustix" +version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dec400b53af4f6551ad23214ba08af344125785a0508228af83a84d498a5ce33" +checksum = "7cee647393af53c750e15dcbf7781cdd2e550b246bde76e46c326e7ea3c73773" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "winapi", +] + +[[package]] +name = "rustyline" +version = "9.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db7826789c0e25614b03e5a54a0717a86f9ff6e6e5247f92b369472869320039" dependencies = [ "bitflags", "cfg-if", @@ -458,9 +563,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.6" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c9613b5a66ab9ba26415184cfc41156594925a9cf3a2057e57f31ff145f6568" +checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" [[package]] name = "scanlex" @@ -476,18 +581,18 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "serde" -version = "1.0.130" +version = "1.0.136" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" +checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.130" +version = "1.0.136" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b" +checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" dependencies = [ "proc-macro2", "quote", @@ -496,9 +601,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.72" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0ffa0837f2dfa6fb90868c2b5468cad482e175f7dad97e7421951e663f2b527" +checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085" dependencies = [ "itoa", "ryu", @@ -507,9 +612,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309" +checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" [[package]] name = "static_assertions" @@ -525,15 +630,25 @@ checksum = "d44a3643b4ff9caf57abcee9c2c621d6c03d9135e0d8b589bd9afb5992cb176a" [[package]] name = "syn" -version = "1.0.82" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" +checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" dependencies = [ "proc-macro2", "quote", "unicode-xid", ] +[[package]] +name = "terminal_size" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "thiserror" version = "1.0.30" @@ -567,9 +682,9 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" +checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" [[package]] name = "unicode-width" @@ -591,9 +706,9 @@ checksum = "936e4b492acfd135421d8dca4b1aa80a7bfc26e702ef3af710e0752684df5372" [[package]] name = "version_check" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "wasi" @@ -625,9 +740,9 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82ca39602d5cbfa692c4b67e3bcbb2751477355141c1ed434c94da4186836ff6" +checksum = "030b7ff91626e57a05ca64a07c481973cbb2db774e4852c9c7ca342408c6a99a" dependencies = [ "windows_aarch64_msvc", "windows_i686_gnu", @@ -638,33 +753,33 @@ dependencies = [ [[package]] name = "windows_aarch64_msvc" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52695a41e536859d5308cc613b4a022261a274390b25bd29dfff4bf08505f3c2" +checksum = "29277a4435d642f775f63c7d1faeb927adba532886ce0287bd985bffb16b6bca" [[package]] name = "windows_i686_gnu" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f54725ac23affef038fecb177de6c9bf065787c2f432f79e3c373da92f3e1d8a" +checksum = "1145e1989da93956c68d1864f32fb97c8f561a8f89a5125f6a2b7ea75524e4b8" [[package]] name = "windows_i686_msvc" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d5158a43cc43623c0729d1ad6647e62fa384a3d135fd15108d37c683461f64" +checksum = "d4a09e3a0d4753b73019db171c1339cd4362c8c44baf1bcea336235e955954a6" [[package]] name = "windows_x86_64_gnu" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc31f409f565611535130cfe7ee8e6655d3fa99c1c61013981e491921b5ce954" +checksum = "8ca64fcb0220d58db4c119e050e7af03c69e6f4f415ef69ec1773d9aab422d5a" [[package]] name = "windows_x86_64_msvc" -version = "0.28.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f2b8c7cbd3bfdddd9ab98769f9746a7fad1bca236554cd032b78d768bc0e89f" +checksum = "08cabc9f0066848fef4bc6a1c1668e6efce38b661d2aeec75d18d8617eebb5f1" [[package]] name = "zip" diff --git a/Cargo.toml b/Cargo.toml index b9a2358..5335c22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" [workspace] members = [ "cli", + "hpm", "query", ] diff --git a/hpm/Cargo.toml b/hpm/Cargo.toml new file mode 100644 index 0000000..16a8b10 --- /dev/null +++ b/hpm/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "fimfareader-hpm" +version = "0.1.0" +authors = ["Joakim Soderlund "] +edition = "2021" + +[dependencies.fimfareader] +path = ".." + +[dependencies.chrono] +version = "*" + +[dependencies.indicatif] +version = "*" +features = ["rayon"] + +[dependencies.rayon] +version = "*" + +[dependencies.regex] +version = "*" + +[dependencies.zip] +version = "*" +features = ["deflate"] +default-features = false diff --git a/hpm/src/main.rs b/hpm/src/main.rs new file mode 100644 index 0000000..a970cb0 --- /dev/null +++ b/hpm/src/main.rs @@ -0,0 +1,94 @@ +use std::collections::HashMap; +use std::env::args; +use std::io::Cursor; +use std::io::Read; + +use chrono::prelude::*; +use fimfareader::prelude::*; +use rayon::prelude::*; + +use indicatif::ParallelProgressIterator; + +use regex::Regex; +use regex::RegexBuilder; + +use zip::ZipArchive; + +// TODO: More varied and less literal matches. +const PATTERN: &str = "[^a-z]hug(s|ged|ging)?[^a-z]"; + +struct Stat { + date: DateTime, + count: u64, + words: u64, +} + +fn count(regex: &Regex, story: &Story, data: Vec) -> Vec { + let mut archive = ZipArchive::new(Cursor::new(data)).unwrap(); + + // TODO: Statistics per chapter. + let date = match story.date_published { + Some(published) => published, + None => return Vec::new(), + }; + + let mut matches = 0; + + for i in 0..archive.len() { + let mut file = archive.by_index(i).unwrap(); + let mut data = String::with_capacity(file.size() as usize); + + file.read_to_string(&mut data).unwrap(); + matches += regex.find_iter(&data).count(); + } + + let count = matches as u64; + let words = story.num_words as u64; + + vec![Stat { date, count, words }] +} + +fn main() { + let argv = args().collect::>(); + + if argv.len() != 2 { + eprintln!("Usage: {} ", argv[0]); + std::process::exit(1); + } + + let fetcher = Fetcher::new(&argv[1]).unwrap(); + + let pattern = RegexBuilder::new(PATTERN) + .case_insensitive(true) + .build() + .unwrap(); + + let stats = fetcher + .index() + .par_iter() + .progress_count(fetcher.index().len() as u64) + .map(|story| (story, fetcher.read(story).unwrap())) + .flat_map_iter(|(story, data)| count(&pattern, story, data)) + .collect::>(); + + // TODO: Finer granularity for better graphing. + let mut yearly = HashMap::::new(); + + for stat in stats { + let year = stat.date.year(); + let value = yearly.remove(&year).unwrap_or_else(|| (0, 0)); + + yearly.insert(year, (value.0 + stat.count, value.1 + stat.words)); + } + + let mut yearly = yearly.into_iter().collect::>(); + + yearly.sort(); + + for (year, (count, words)) in yearly.into_iter() { + let modifier = 1_000_000f64 / words as f64; + let hpm = modifier * count as f64; + + println!("{year}: {hpm:.04}"); + } +} diff --git a/src/archive/fetcher.rs b/src/archive/fetcher.rs index fc014c5..9a8bb87 100644 --- a/src/archive/fetcher.rs +++ b/src/archive/fetcher.rs @@ -94,6 +94,10 @@ impl Fetcher { Ok(buf) } + pub fn index(&self) -> &Vec { + &self.index + } + pub fn iter(&self) -> impl Iterator { self.index.iter() }