From af2ce29d01b98448ff5fccda2112540137547ea0 Mon Sep 17 00:00:00 2001 From: Joakim Soderlund Date: Fri, 22 May 2020 21:19:01 +0200 Subject: [PATCH] Add ineffective deduplication experiment --- Cargo.toml | 2 +- src/archive/parser.rs | 47 ++++++++++++++++++++++++++++++++++++++++--- src/archive/story.rs | 39 +++++------------------------------ 3 files changed, 50 insertions(+), 38 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 03b94a0..4cabbcc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ version = "*" [dependencies.serde] version = "*" -features = ["derive"] +features = ["derive", "rc"] [dependencies.serde_json] version = "*" diff --git a/src/archive/parser.rs b/src/archive/parser.rs index cad15ad..cc14185 100644 --- a/src/archive/parser.rs +++ b/src/archive/parser.rs @@ -1,7 +1,9 @@ //! Index parser. +use std::collections::HashSet; use std::io::BufRead; use std::sync::mpsc::{channel, Receiver}; +use std::sync::Arc; use std::thread::spawn; use rayon::prelude::*; @@ -9,7 +11,7 @@ use serde::de::Error; use serde_json::error::Result; use serde_json::from_str; -use super::story::Story; +use super::story::{Author, Story, Tag}; const TRIM: &[char] = &['"', ',', ' ', '\t', '\n', '\r']; @@ -46,9 +48,48 @@ pub fn parse(reader: impl BufRead) -> Result> { return Err(Error::custom("Invalid file structure")); } - rx.recv().map_err(|e| match e { + let result = rx.recv().map_err(|e| match e { _ => Error::custom("Missing parser result"), - })? + })?; + + Ok(dedup(result?)) +} + +fn dedup(mut stories: Vec) -> Vec { + let mut authors: HashSet> = HashSet::new(); + let mut tags: HashSet> = HashSet::new(); + + for story in stories.iter_mut() { + if let Some(author) = authors.get(&story.author) { + story.author = author.clone(); + } else { + authors.insert(story.author.clone()); + } + } + + for story in stories.iter_mut() { + let unseen = story + .tags + .iter() + .filter(|tag| !tags.contains(*tag)) + .map(|tag| tag.clone()) + .collect::>(); + + tags.extend(unseen); + + story.tags = story + .tags + .iter() + .filter_map(|tag| tags.get(tag)) + .map(|tag| tag.clone()) + .collect(); + } + + for tag in tags.iter() { + println!("{}: {}", tag.name, Arc::strong_count(tag)); + } + + stories } fn spawn_parser(stream: Receiver) -> Receiver>> { diff --git a/src/archive/story.rs b/src/archive/story.rs index 092a738..c6e133b 100644 --- a/src/archive/story.rs +++ b/src/archive/story.rs @@ -1,7 +1,6 @@ //! Story meta. -use std::collections::HashSet; -use std::sync::Mutex; +use std::sync::Arc; use chrono::prelude::*; @@ -9,16 +8,10 @@ use serde::de::Error; use serde::{Deserialize, Deserializer}; use serde_json::Value; -use lazy_static::lazy_static; - -lazy_static! { - static ref TAGS: Mutex> = Mutex::new(HashSet::new()); -} - #[derive(Clone, Debug, Deserialize)] pub struct Story { pub archive: Archive, - pub author: Author, + pub author: Arc, pub chapters: Vec, pub color: Option, pub completion_status: CompletionStatus, @@ -43,8 +36,7 @@ pub struct Story { pub short_description: String, pub status: Status, pub submitted: bool, - #[serde(deserialize_with = "interned_tag")] - pub tags: Vec<&'static Tag>, + pub tags: Vec>, #[serde(deserialize_with = "null_to_text")] pub title: String, pub total_num_views: i32, @@ -60,7 +52,7 @@ pub struct Archive { pub path: String, } -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Hash, PartialEq, Eq, Deserialize)] pub struct Author { pub avatar: Option, pub bio_html: Option, @@ -74,7 +66,7 @@ pub struct Author { pub url: String, } -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Hash, PartialEq, Eq, Deserialize)] pub struct Avatar { #[serde(rename = "16")] pub x16: Option, @@ -200,27 +192,6 @@ where } } -fn interned_tag<'de, D>(d: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let tags = Vec::::deserialize(d)?; - let mut store = TAGS.lock().unwrap(); - - Ok(tags - .into_iter() - .map(|tag| match store.get(&tag) { - Some(tag) => tag, - None => { - let boxed: Box = Box::new(tag); - let leaked: &'static Tag = Box::leak(boxed); - store.insert(leaked); - leaked - } - }) - .collect()) -} - impl<'de> Deserialize<'de> for Color { fn deserialize(d: D) -> Result where