Add ineffective deduplication experiment

This commit is contained in:
Joakim Soderlund 2020-05-22 21:19:01 +02:00
parent 2547cf8346
commit af2ce29d01
3 changed files with 50 additions and 38 deletions

View file

@ -35,7 +35,7 @@ version = "*"
[dependencies.serde]
version = "*"
features = ["derive"]
features = ["derive", "rc"]
[dependencies.serde_json]
version = "*"

View file

@ -1,7 +1,9 @@
//! Index parser.
use std::collections::HashSet;
use std::io::BufRead;
use std::sync::mpsc::{channel, Receiver};
use std::sync::Arc;
use std::thread::spawn;
use rayon::prelude::*;
@ -9,7 +11,7 @@ use serde::de::Error;
use serde_json::error::Result;
use serde_json::from_str;
use super::story::Story;
use super::story::{Author, Story, Tag};
const TRIM: &[char] = &['"', ',', ' ', '\t', '\n', '\r'];
@ -46,9 +48,48 @@ pub fn parse(reader: impl BufRead) -> Result<Vec<Story>> {
return Err(Error::custom("Invalid file structure"));
}
rx.recv().map_err(|e| match e {
let result = rx.recv().map_err(|e| match e {
_ => Error::custom("Missing parser result"),
})?
})?;
Ok(dedup(result?))
}
/// Makes stories with equal authors or tags share a single `Arc` instance.
///
/// The first occurrence of every distinct author and tag becomes the
/// canonical instance; later equal values are replaced by clones of that
/// `Arc`. Before returning, the name of every distinct tag is printed to
/// stdout together with its current strong reference count.
fn dedup(mut stories: Vec<Story>) -> Vec<Story> {
    let mut known_authors: HashSet<Arc<Author>> = HashSet::new();
    let mut known_tags: HashSet<Arc<Tag>> = HashSet::new();

    // Authors: point each story at the first equal author that was seen.
    for story in &mut stories {
        match known_authors.get(&story.author) {
            Some(shared) => story.author = Arc::clone(shared),
            None => {
                known_authors.insert(Arc::clone(&story.author));
            }
        }
    }

    // Tags: resolve every tag to its canonical `Arc`, registering it first
    // when no equal tag has been seen yet.
    for story in &mut stories {
        let mut shared = Vec::with_capacity(story.tags.len());

        for tag in &story.tags {
            let canonical = match known_tags.get(tag) {
                Some(existing) => Arc::clone(existing),
                None => {
                    known_tags.insert(Arc::clone(tag));
                    Arc::clone(tag)
                }
            };

            shared.push(canonical);
        }

        story.tags = shared;
    }

    // Report how many references each distinct tag ended up with.
    for tag in &known_tags {
        println!("{}: {}", tag.name, Arc::strong_count(tag));
    }

    stories
}
fn spawn_parser(stream: Receiver<String>) -> Receiver<Result<Vec<Story>>> {

View file

@ -1,7 +1,6 @@
//! Story meta.
use std::collections::HashSet;
use std::sync::Mutex;
use std::sync::Arc;
use chrono::prelude::*;
@ -9,16 +8,10 @@ use serde::de::Error;
use serde::{Deserialize, Deserializer};
use serde_json::Value;
use lazy_static::lazy_static;
// Global set of `&'static Tag` references, guarded by a mutex and
// initialized through `lazy_static!`. It starts out empty.
lazy_static! {
static ref TAGS: Mutex<HashSet<&'static Tag>> = Mutex::new(HashSet::new());
}
#[derive(Clone, Debug, Deserialize)]
pub struct Story {
pub archive: Archive,
pub author: Author,
pub author: Arc<Author>,
pub chapters: Vec<Chapter>,
pub color: Option<Color>,
pub completion_status: CompletionStatus,
@ -43,8 +36,7 @@ pub struct Story {
pub short_description: String,
pub status: Status,
pub submitted: bool,
#[serde(deserialize_with = "interned_tag")]
pub tags: Vec<&'static Tag>,
pub tags: Vec<Arc<Tag>>,
#[serde(deserialize_with = "null_to_text")]
pub title: String,
pub total_num_views: i32,
@ -60,7 +52,7 @@ pub struct Archive {
pub path: String,
}
#[derive(Clone, Debug, Deserialize)]
#[derive(Clone, Debug, Hash, PartialEq, Eq, Deserialize)]
pub struct Author {
pub avatar: Option<Avatar>,
pub bio_html: Option<String>,
@ -74,7 +66,7 @@ pub struct Author {
pub url: String,
}
#[derive(Clone, Debug, Deserialize)]
#[derive(Clone, Debug, Hash, PartialEq, Eq, Deserialize)]
pub struct Avatar {
#[serde(rename = "16")]
pub x16: Option<String>,
@ -200,27 +192,6 @@ where
}
}
/// Deserializes a list of tags and swaps each one for its interned instance.
///
/// Every tag is looked up in the global `TAGS` set. A tag that is already
/// present is replaced by the stored reference; an unknown tag is boxed,
/// leaked to obtain a `'static` reference, and added to the set.
///
/// # Errors
///
/// Returns the deserializer's error if the tag list cannot be deserialized.
///
/// # Panics
///
/// Panics if the `TAGS` mutex is poisoned.
fn interned_tag<'de, D>(d: D) -> Result<Vec<&'static Tag>, D::Error>
where
    D: Deserializer<'de>,
{
    let parsed = Vec::<Tag>::deserialize(d)?;
    let mut store = TAGS.lock().unwrap();
    let mut interned: Vec<&'static Tag> = Vec::with_capacity(parsed.len());

    for tag in parsed {
        // Reuse the stored reference when an equal tag is already interned.
        if let Some(existing) = store.get(&tag).copied() {
            interned.push(existing);
            continue;
        }

        // Leaked on purpose: the set keeps `'static` references to its tags.
        let leaked: &'static Tag = Box::leak(Box::new(tag));
        store.insert(leaked);
        interned.push(leaked);
    }

    Ok(interned)
}
impl<'de> Deserialize<'de> for Color {
fn deserialize<D>(d: D) -> Result<Color, D::Error>
where