Add ineffective deduplication experiment

This commit is contained in:
Joakim Soderlund 2020-05-22 21:19:01 +02:00
parent 2547cf8346
commit af2ce29d01
3 changed files with 50 additions and 38 deletions

View file

@ -35,7 +35,7 @@ version = "*"
[dependencies.serde] [dependencies.serde]
version = "*" version = "*"
features = ["derive"] features = ["derive", "rc"]
[dependencies.serde_json] [dependencies.serde_json]
version = "*" version = "*"

View file

@ -1,7 +1,9 @@
//! Index parser. //! Index parser.
use std::collections::HashSet;
use std::io::BufRead; use std::io::BufRead;
use std::sync::mpsc::{channel, Receiver}; use std::sync::mpsc::{channel, Receiver};
use std::sync::Arc;
use std::thread::spawn; use std::thread::spawn;
use rayon::prelude::*; use rayon::prelude::*;
@ -9,7 +11,7 @@ use serde::de::Error;
use serde_json::error::Result; use serde_json::error::Result;
use serde_json::from_str; use serde_json::from_str;
use super::story::Story; use super::story::{Author, Story, Tag};
const TRIM: &[char] = &['"', ',', ' ', '\t', '\n', '\r']; const TRIM: &[char] = &['"', ',', ' ', '\t', '\n', '\r'];
@ -46,9 +48,48 @@ pub fn parse(reader: impl BufRead) -> Result<Vec<Story>> {
return Err(Error::custom("Invalid file structure")); return Err(Error::custom("Invalid file structure"));
} }
rx.recv().map_err(|e| match e { let result = rx.recv().map_err(|e| match e {
_ => Error::custom("Missing parser result"), _ => Error::custom("Missing parser result"),
})? })?;
Ok(dedup(result?))
}
fn dedup(mut stories: Vec<Story>) -> Vec<Story> {
let mut authors: HashSet<Arc<Author>> = HashSet::new();
let mut tags: HashSet<Arc<Tag>> = HashSet::new();
for story in stories.iter_mut() {
if let Some(author) = authors.get(&story.author) {
story.author = author.clone();
} else {
authors.insert(story.author.clone());
}
}
for story in stories.iter_mut() {
let unseen = story
.tags
.iter()
.filter(|tag| !tags.contains(*tag))
.map(|tag| tag.clone())
.collect::<Vec<_>>();
tags.extend(unseen);
story.tags = story
.tags
.iter()
.filter_map(|tag| tags.get(tag))
.map(|tag| tag.clone())
.collect();
}
for tag in tags.iter() {
println!("{}: {}", tag.name, Arc::strong_count(tag));
}
stories
} }
fn spawn_parser(stream: Receiver<String>) -> Receiver<Result<Vec<Story>>> { fn spawn_parser(stream: Receiver<String>) -> Receiver<Result<Vec<Story>>> {

View file

@ -1,7 +1,6 @@
//! Story meta. //! Story meta.
use std::collections::HashSet; use std::sync::Arc;
use std::sync::Mutex;
use chrono::prelude::*; use chrono::prelude::*;
@ -9,16 +8,10 @@ use serde::de::Error;
use serde::{Deserialize, Deserializer}; use serde::{Deserialize, Deserializer};
use serde_json::Value; use serde_json::Value;
use lazy_static::lazy_static;
// Global pool of interned tags, guarded by a mutex.
// It stores `&'static Tag` references; `interned_tag` fills it with tags it
// has leaked via `Box::leak`, so entries are never freed.
lazy_static! {
static ref TAGS: Mutex<HashSet<&'static Tag>> = Mutex::new(HashSet::new());
}
#[derive(Clone, Debug, Deserialize)] #[derive(Clone, Debug, Deserialize)]
pub struct Story { pub struct Story {
pub archive: Archive, pub archive: Archive,
pub author: Author, pub author: Arc<Author>,
pub chapters: Vec<Chapter>, pub chapters: Vec<Chapter>,
pub color: Option<Color>, pub color: Option<Color>,
pub completion_status: CompletionStatus, pub completion_status: CompletionStatus,
@ -43,8 +36,7 @@ pub struct Story {
pub short_description: String, pub short_description: String,
pub status: Status, pub status: Status,
pub submitted: bool, pub submitted: bool,
#[serde(deserialize_with = "interned_tag")] pub tags: Vec<Arc<Tag>>,
pub tags: Vec<&'static Tag>,
#[serde(deserialize_with = "null_to_text")] #[serde(deserialize_with = "null_to_text")]
pub title: String, pub title: String,
pub total_num_views: i32, pub total_num_views: i32,
@ -60,7 +52,7 @@ pub struct Archive {
pub path: String, pub path: String,
} }
#[derive(Clone, Debug, Deserialize)] #[derive(Clone, Debug, Hash, PartialEq, Eq, Deserialize)]
pub struct Author { pub struct Author {
pub avatar: Option<Avatar>, pub avatar: Option<Avatar>,
pub bio_html: Option<String>, pub bio_html: Option<String>,
@ -74,7 +66,7 @@ pub struct Author {
pub url: String, pub url: String,
} }
#[derive(Clone, Debug, Deserialize)] #[derive(Clone, Debug, Hash, PartialEq, Eq, Deserialize)]
pub struct Avatar { pub struct Avatar {
#[serde(rename = "16")] #[serde(rename = "16")]
pub x16: Option<String>, pub x16: Option<String>,
@ -200,27 +192,6 @@ where
} }
} }
/// Deserializes a list of tags and interns each one in the global `TAGS` pool.
///
/// The tags are first deserialized as an owned `Vec<Tag>`. Each tag is then
/// looked up in `TAGS`: if an equal tag is already stored, the stored
/// reference is returned; otherwise the tag is boxed, leaked to obtain a
/// `&'static Tag`, stored in the pool, and that reference is returned.
///
/// Leaked tags are never freed.
///
/// # Errors
///
/// Returns the deserializer's error if the input is not a valid `Vec<Tag>`.
///
/// # Panics
///
/// Panics if the `TAGS` mutex is poisoned, because of the `unwrap` on `lock`.
fn interned_tag<'de, D>(d: D) -> Result<Vec<&'static Tag>, D::Error>
where
D: Deserializer<'de>,
{
let tags = Vec::<Tag>::deserialize(d)?;
// The guard lives until the end of the function, so the pool stays locked
// while the whole list is processed.
let mut store = TAGS.lock().unwrap();
Ok(tags
.into_iter()
.map(|tag| match store.get(&tag) {
// Already interned: reuse the stored reference and drop the fresh copy.
Some(tag) => tag,
None => {
// First sighting: move the tag to the heap and leak it so the
// reference is valid for `'static`, then remember it in the pool.
let boxed: Box<Tag> = Box::new(tag);
let leaked: &'static Tag = Box::leak(boxed);
store.insert(leaked);
leaked
}
})
.collect())
}
impl<'de> Deserialize<'de> for Color { impl<'de> Deserialize<'de> for Color {
fn deserialize<D>(d: D) -> Result<Color, D::Error> fn deserialize<D>(d: D) -> Result<Color, D::Error>
where where