Modify HPM to look for specific Unicode characters

This commit is contained in:
Joakim Soderlund 2022-02-26 11:55:53 +01:00
parent f87ae9410f
commit c0e206501a
4 changed files with 66 additions and 138 deletions

88
Cargo.lock generated
View file

@ -43,9 +43,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "cc"
version = "1.0.72"
version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
[[package]]
name = "cfg-if"
@ -88,19 +88,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "console"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a28b32d32ca44b70c3e4acd7db1babf555fa026e385fb95f18028f88848b3c31"
dependencies = [
"encode_unicode",
"libc",
"once_cell",
"terminal_size",
"winapi",
]
[[package]]
name = "crc32fast"
version = "1.3.2"
@ -160,12 +147,6 @@ version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "encode_unicode"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
[[package]]
name = "endian-type"
version = "0.1.2"
@ -205,9 +186,9 @@ dependencies = [
[[package]]
name = "fd-lock"
version = "3.0.3"
version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcef756dea9cf3db5ce73759cf0467330427a786b47711b8d6c97620d718ceb9"
checksum = "02ecad9808e0596f8956d14f7fa868f996290bd01c8d7329d6e5bc2bb76adf8f"
dependencies = [
"cfg-if",
"rustix",
@ -240,11 +221,8 @@ dependencies = [
name = "fimfareader-hpm"
version = "0.1.0"
dependencies = [
"chrono",
"fimfareader",
"indicatif",
"rayon",
"regex",
"zip",
]
@ -287,27 +265,11 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "indicatif"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b"
dependencies = [
"console",
"lazy_static",
"number_prefix",
"rayon",
"regex",
]
[[package]]
name = "io-lifetimes"
version = "0.4.4"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6ef6787e7f0faedc040f95716bdd0e62bcfcf4ba93da053b62dea2691c13864"
dependencies = [
"winapi",
]
checksum = "ec58677acfea8a15352d42fc87d11d63596ade9239e0a7c9352914417515dbe6"
[[package]]
name = "itoa"
@ -336,15 +298,15 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.117"
version = "0.2.119"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c"
checksum = "1bf2e165bb3457c8e098ea76f3e3bc9db55f87aa90d52d0e6be741470916aaa4"
[[package]]
name = "linux-raw-sys"
version = "0.0.37"
version = "0.0.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95f5690fef754d905294c56f7ac815836f2513af966aa47f2e07ac79be07827f"
checksum = "5284f00d480e1c39af34e72f8ad60b94f47007e3481cd3b731c1d67190ddc7b7"
[[package]]
name = "log"
@ -442,18 +404,6 @@ dependencies = [
"libc",
]
[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "once_cell"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
[[package]]
name = "proc-macro2"
version = "1.0.36"
@ -526,9 +476,9 @@ checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
[[package]]
name = "rustix"
version = "0.32.1"
version = "0.33.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cee647393af53c750e15dcbf7781cdd2e550b246bde76e46c326e7ea3c73773"
checksum = "a9466f25b92a648960ac1042fd3baa6b0bf285e60f754d7e5070770c813a177a"
dependencies = [
"bitflags",
"errno",
@ -601,9 +551,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.78"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085"
checksum = "8e8d9fa5c3b304765ce1fd9c4c8a3de2c8db365a5b91be52f186efc675681d95"
dependencies = [
"itoa",
"ryu",
@ -639,16 +589,6 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "terminal_size"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "thiserror"
version = "1.0.30"

View file

@ -12,7 +12,7 @@ members = [
]
default-members = [
"cli",
"hpm",
]
[profile.dev.package."*"]

View file

@ -7,19 +7,9 @@ edition = "2021"
[dependencies.fimfareader]
path = ".."
[dependencies.chrono]
version = "*"
[dependencies.indicatif]
version = "*"
features = ["rayon"]
[dependencies.rayon]
version = "*"
[dependencies.regex]
version = "*"
[dependencies.zip]
version = "*"
features = ["deflate"]

View file

@ -1,51 +1,71 @@
use std::collections::HashMap;
use std::env::args;
use std::io::Cursor;
use std::io::Read;
use chrono::prelude::*;
use fimfareader::prelude::*;
use rayon::prelude::*;
use indicatif::ParallelProgressIterator;
use regex::Regex;
use regex::RegexBuilder;
use zip::ZipArchive;
// TODO: More varied and less literal matches.
// Matches "hug", "hugs", "hugged" or "hugging" with exactly one character
// outside `a-z` required on each side. `main` compiles this through
// `RegexBuilder` with `case_insensitive(true)`.
// Both delimiter characters are part of the match, so an occurrence at the very
// start or very end of a text has no neighbouring character and is not matched.
// NOTE(review): no use of this constant appears among the lines this commit
// keeps, so it is presumably one of the removed lines - confirm against the
// repository.
const PATTERN: &str = "[^a-z]hug(s|ged|ging)?[^a-z]";
// Per-story result record produced by `count`.
// NOTE(review): this span is taken from a rendered diff in which removed and
// added lines are interleaved without +/- markers, which is why `count` is
// declared twice. Judging by the two `Stat { .. }` constructors in `count`,
// `date` / `count: u64` / `words` look like the removed fields and
// `story` / `chars` / `count: i64` the added ones - confirm against the repo.
// NOTE(review): `allow(unused)` is presumably needed because the fields are
// only read through the derived `Debug` output - confirm.
#[allow(unused)]
#[derive(Debug)]
struct Stat {
// Publication date, taken from `story.date_published`.
date: DateTime<Utc>,
// Number of regex matches across all archive entries.
count: u64,
// Word count, taken from `story.num_words`.
words: u64,
// ID of the story, taken from `story.id`.
story: i64,
// Sum of the byte counts returned by `read_to_string` for every entry
// (bytes, not characters, despite the name); -1 when an entry failed to read.
chars: i64,
// Number of U+009D characters found; -1 when an entry failed to read.
count: i64,
}
// Scans every entry of a story's zip archive and returns statistics for it.
//
// NOTE(review): this span is taken from a rendered diff in which removed and
// added lines are interleaved without +/- markers. Two versions of the function
// are therefore mixed together:
//   * the regex version (`regex` parameter, returns `Vec<Stat>`), which counts
//     regex matches and returns nothing for stories without a publication date;
//   * the Unicode version (returns a single `Stat`), which counts U+009D
//     characters and prints a context snippet for each one.
// Which line belongs to which version is inferred from the signatures and the
// two `Stat { .. }` constructors - confirm against the repository.
//
// `data` is the raw zip archive for `story`, held in memory.
// Panics if `data` is not a readable zip archive or an entry cannot be opened
// (both are `unwrap`ped).
fn count(regex: &Regex, story: &Story, data: Vec<u8>) -> Vec<Stat> {
fn count(story: &Story, data: Vec<u8>) -> Stat {
// Wrap the byte buffer in a `Cursor` so `ZipArchive` can read it in place.
let mut archive = ZipArchive::new(Cursor::new(data)).unwrap();
// TODO: Statistics per chapter.
// Regex version: stories without a publication date yield no statistics.
let date = match story.date_published {
Some(published) => published,
None => return Vec::new(),
};
let mut matches = 0;
let id = story.id;
// Running totals over all archive entries.
let mut count = 0;
let mut chars = 0;
for i in 0..archive.len() {
let mut file = archive.by_index(i).unwrap();
// Pre-size the buffer from the entry's declared size.
let mut data = String::with_capacity(file.size() as usize);
// Regex version: a read failure panics here.
file.read_to_string(&mut data).unwrap();
matches += regex.find_iter(&data).count();
// Unicode version: any read failure (std documents invalid UTF-8 as one
// cause) aborts the whole story and reports -1 for both counters.
let bytes = match file.read_to_string(&mut data) {
Ok(bytes) => bytes,
Err(_) => {
return Stat {
story: id,
count: -1,
chars: -1,
}
}
};
// Character (not byte) indices of every U+009D in this entry.
// NOTE(review): U+009D is a C1 control character; presumably it signals
// mis-decoded text in the source files - confirm the intent.
let matches = data
.chars()
.enumerate()
.filter(|(_, chr)| *chr == '\u{9d}')
.map(|(index, _)| index)
.collect::<Vec<usize>>();
count += matches.len() as i64;
// `bytes` is the byte count reported by `read_to_string`.
chars += bytes as i64;
// Print a context snippet for each hit: start up to 32 characters before it,
// take 64 characters, then drop all whitespace except plain spaces (so the
// printed snippet may be shorter than 64 characters).
for pos in matches {
let min = pos.saturating_sub(32);
// Each snippet walks the string again from the start via `skip(min)`.
let snip = data
.chars()
.skip(min)
.take(64)
.filter(|c| !c.is_whitespace() || *c == ' ')
.collect::<String>();
// Story ID is right-aligned to a width of 6.
println!("[{id:>6}] {snip}");
}
}
// Regex version: a single-element vector holding the totals for this story.
let count = matches as u64;
let words = story.num_words as u64;
vec![Stat { date, count, words }]
// Unicode version: totals accumulated over all entries.
Stat {
story: id,
count,
chars,
}
}
fn main() {
@ -58,37 +78,15 @@ fn main() {
let fetcher = Fetcher::new(&argv[1]).unwrap();
let pattern = RegexBuilder::new(PATTERN)
.case_insensitive(true)
.build()
.unwrap();
let stats = fetcher
.index()
.par_iter()
.progress_count(fetcher.index().len() as u64)
.map(|story| (story, fetcher.read(story).unwrap()))
.flat_map_iter(|(story, data)| count(&pattern, story, data))
.collect::<Vec<Stat>>();
// TODO: Finer granularity for better graphing.
let mut yearly = HashMap::<i32, (u64, u64)>::new();
.map(|(story, data)| count(story, data))
.filter(|stat| stat.count != 0)
.collect::<Vec<_>>();
for stat in stats {
let year = stat.date.year();
let value = yearly.remove(&year).unwrap_or_else(|| (0, 0));
yearly.insert(year, (value.0 + stat.count, value.1 + stat.words));
}
let mut yearly = yearly.into_iter().collect::<Vec<_>>();
yearly.sort();
for (year, (count, words)) in yearly.into_iter() {
let modifier = 1_000_000f64 / words as f64;
let hpm = modifier * count as f64;
println!("{year}: {hpm:.04}");
println!("{stat:?}");
}
}