mirror of
https://github.com/JockeTF/fimfareader.git
synced 2024-11-23 13:58:00 +01:00
Modify HPM to look for specific Unicode characters
This commit is contained in:
parent
f87ae9410f
commit
c0e206501a
4 changed files with 66 additions and 138 deletions
88
Cargo.lock
generated
88
Cargo.lock
generated
|
@ -43,9 +43,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
|||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.72"
|
||||
version = "1.0.73"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee"
|
||||
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
|
@ -88,19 +88,6 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "console"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a28b32d32ca44b70c3e4acd7db1babf555fa026e385fb95f18028f88848b3c31"
|
||||
dependencies = [
|
||||
"encode_unicode",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"terminal_size",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.3.2"
|
||||
|
@ -160,12 +147,6 @@ version = "1.6.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
||||
|
||||
[[package]]
|
||||
name = "encode_unicode"
|
||||
version = "0.3.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
|
||||
|
||||
[[package]]
|
||||
name = "endian-type"
|
||||
version = "0.1.2"
|
||||
|
@ -205,9 +186,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "fd-lock"
|
||||
version = "3.0.3"
|
||||
version = "3.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fcef756dea9cf3db5ce73759cf0467330427a786b47711b8d6c97620d718ceb9"
|
||||
checksum = "02ecad9808e0596f8956d14f7fa868f996290bd01c8d7329d6e5bc2bb76adf8f"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"rustix",
|
||||
|
@ -240,11 +221,8 @@ dependencies = [
|
|||
name = "fimfareader-hpm"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"fimfareader",
|
||||
"indicatif",
|
||||
"rayon",
|
||||
"regex",
|
||||
"zip",
|
||||
]
|
||||
|
||||
|
@ -287,27 +265,11 @@ version = "0.4.3"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
|
||||
|
||||
[[package]]
|
||||
name = "indicatif"
|
||||
version = "0.16.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b"
|
||||
dependencies = [
|
||||
"console",
|
||||
"lazy_static",
|
||||
"number_prefix",
|
||||
"rayon",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "0.4.4"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6ef6787e7f0faedc040f95716bdd0e62bcfcf4ba93da053b62dea2691c13864"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
checksum = "ec58677acfea8a15352d42fc87d11d63596ade9239e0a7c9352914417515dbe6"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
|
@ -336,15 +298,15 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.117"
|
||||
version = "0.2.119"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c"
|
||||
checksum = "1bf2e165bb3457c8e098ea76f3e3bc9db55f87aa90d52d0e6be741470916aaa4"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.0.37"
|
||||
version = "0.0.42"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95f5690fef754d905294c56f7ac815836f2513af966aa47f2e07ac79be07827f"
|
||||
checksum = "5284f00d480e1c39af34e72f8ad60b94f47007e3481cd3b731c1d67190ddc7b7"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
|
@ -442,18 +404,6 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "number_prefix"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.36"
|
||||
|
@ -526,9 +476,9 @@ checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
|
|||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.32.1"
|
||||
version = "0.33.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7cee647393af53c750e15dcbf7781cdd2e550b246bde76e46c326e7ea3c73773"
|
||||
checksum = "a9466f25b92a648960ac1042fd3baa6b0bf285e60f754d7e5070770c813a177a"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
|
@ -601,9 +551,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.78"
|
||||
version = "1.0.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085"
|
||||
checksum = "8e8d9fa5c3b304765ce1fd9c4c8a3de2c8db365a5b91be52f186efc675681d95"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"ryu",
|
||||
|
@ -639,16 +589,6 @@ dependencies = [
|
|||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "terminal_size"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.30"
|
||||
|
|
|
@ -12,7 +12,7 @@ members = [
|
|||
]
|
||||
|
||||
default-members = [
|
||||
"cli",
|
||||
"hpm",
|
||||
]
|
||||
|
||||
[profile.dev.package."*"]
|
||||
|
|
|
@ -7,19 +7,9 @@ edition = "2021"
|
|||
[dependencies.fimfareader]
|
||||
path = ".."
|
||||
|
||||
[dependencies.chrono]
|
||||
version = "*"
|
||||
|
||||
[dependencies.indicatif]
|
||||
version = "*"
|
||||
features = ["rayon"]
|
||||
|
||||
[dependencies.rayon]
|
||||
version = "*"
|
||||
|
||||
[dependencies.regex]
|
||||
version = "*"
|
||||
|
||||
[dependencies.zip]
|
||||
version = "*"
|
||||
features = ["deflate"]
|
||||
|
|
104
hpm/src/main.rs
104
hpm/src/main.rs
|
@ -1,51 +1,71 @@
|
|||
use std::collections::HashMap;
|
||||
use std::env::args;
|
||||
use std::io::Cursor;
|
||||
use std::io::Read;
|
||||
|
||||
use chrono::prelude::*;
|
||||
use fimfareader::prelude::*;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use indicatif::ParallelProgressIterator;
|
||||
|
||||
use regex::Regex;
|
||||
use regex::RegexBuilder;
|
||||
|
||||
use zip::ZipArchive;
|
||||
|
||||
// TODO: More varied and less literal matches.
|
||||
const PATTERN: &str = "[^a-z]hug(s|ged|ging)?[^a-z]";
|
||||
|
||||
#[allow(unused)]
|
||||
#[derive(Debug)]
|
||||
struct Stat {
|
||||
date: DateTime<Utc>,
|
||||
count: u64,
|
||||
words: u64,
|
||||
story: i64,
|
||||
chars: i64,
|
||||
count: i64,
|
||||
}
|
||||
|
||||
fn count(regex: &Regex, story: &Story, data: Vec<u8>) -> Vec<Stat> {
|
||||
fn count(story: &Story, data: Vec<u8>) -> Stat {
|
||||
let mut archive = ZipArchive::new(Cursor::new(data)).unwrap();
|
||||
|
||||
// TODO: Statistics per chapter.
|
||||
let date = match story.date_published {
|
||||
Some(published) => published,
|
||||
None => return Vec::new(),
|
||||
};
|
||||
|
||||
let mut matches = 0;
|
||||
let id = story.id;
|
||||
let mut count = 0;
|
||||
let mut chars = 0;
|
||||
|
||||
for i in 0..archive.len() {
|
||||
let mut file = archive.by_index(i).unwrap();
|
||||
let mut data = String::with_capacity(file.size() as usize);
|
||||
|
||||
file.read_to_string(&mut data).unwrap();
|
||||
matches += regex.find_iter(&data).count();
|
||||
let bytes = match file.read_to_string(&mut data) {
|
||||
Ok(bytes) => bytes,
|
||||
Err(_) => {
|
||||
return Stat {
|
||||
story: id,
|
||||
count: -1,
|
||||
chars: -1,
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let matches = data
|
||||
.chars()
|
||||
.enumerate()
|
||||
.filter(|(_, chr)| *chr == '\u{9d}')
|
||||
.map(|(index, _)| index)
|
||||
.collect::<Vec<usize>>();
|
||||
|
||||
count += matches.len() as i64;
|
||||
chars += bytes as i64;
|
||||
|
||||
for pos in matches {
|
||||
let min = pos.saturating_sub(32);
|
||||
|
||||
let snip = data
|
||||
.chars()
|
||||
.skip(min)
|
||||
.take(64)
|
||||
.filter(|c| !c.is_whitespace() || *c == ' ')
|
||||
.collect::<String>();
|
||||
|
||||
println!("[{id:>6}] {snip}");
|
||||
}
|
||||
}
|
||||
|
||||
let count = matches as u64;
|
||||
let words = story.num_words as u64;
|
||||
|
||||
vec![Stat { date, count, words }]
|
||||
Stat {
|
||||
story: id,
|
||||
count,
|
||||
chars,
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
|
@ -58,37 +78,15 @@ fn main() {
|
|||
|
||||
let fetcher = Fetcher::new(&argv[1]).unwrap();
|
||||
|
||||
let pattern = RegexBuilder::new(PATTERN)
|
||||
.case_insensitive(true)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let stats = fetcher
|
||||
.index()
|
||||
.par_iter()
|
||||
.progress_count(fetcher.index().len() as u64)
|
||||
.map(|story| (story, fetcher.read(story).unwrap()))
|
||||
.flat_map_iter(|(story, data)| count(&pattern, story, data))
|
||||
.collect::<Vec<Stat>>();
|
||||
|
||||
// TODO: Finer granularity for better graphing.
|
||||
let mut yearly = HashMap::<i32, (u64, u64)>::new();
|
||||
.map(|(story, data)| count(story, data))
|
||||
.filter(|stat| stat.count != 0)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for stat in stats {
|
||||
let year = stat.date.year();
|
||||
let value = yearly.remove(&year).unwrap_or_else(|| (0, 0));
|
||||
|
||||
yearly.insert(year, (value.0 + stat.count, value.1 + stat.words));
|
||||
}
|
||||
|
||||
let mut yearly = yearly.into_iter().collect::<Vec<_>>();
|
||||
|
||||
yearly.sort();
|
||||
|
||||
for (year, (count, words)) in yearly.into_iter() {
|
||||
let modifier = 1_000_000f64 / words as f64;
|
||||
let hpm = modifier * count as f64;
|
||||
|
||||
println!("{year}: {hpm:.04}");
|
||||
println!("{stat:?}");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue