1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
//! GoodReads book genre records.
use std::collections::HashMap;

use serde::Deserialize;

use crate::arrow::*;
use crate::ids::index::IdIndex;
use crate::prelude::*;

/// Name of the Parquet file receiving the book-genre rows.
const OUT_FILE: &str = "gr-book-genres.parquet";
/// Name of the Parquet file receiving the genre id/name index.
const GENRE_FILE: &str = "gr-genres.parquet";

/// Book-genre records as parsed from JSON.
///
/// One record carries every genre attached to a single book.
#[derive(Debug, Deserialize)]
pub struct RawBookGenre {
    /// Book identifier as it appears in the JSON (a string; the writer
    /// parses it into an `i32`).
    pub book_id: String,
    /// Genre name mapped to its count for this book.  A record without a
    /// `genres` field deserializes to an empty map.
    #[serde(default)]
    pub genres: HashMap<String, i32>,
}

/// Rows in the processed book-genre Parquet table.
///
/// Each row links one book to one genre.
#[derive(Debug, TableRow)]
pub struct BookGenreRecord {
    /// Numeric book identifier, parsed from the raw record's `book_id`.
    pub book_id: i32,
    /// Identifier assigned to the genre name by the writer's genre index.
    pub genre_id: i32,
    /// Count associated with this genre for this book in the raw record.
    pub count: i32,
}

/// Object writer to transform and write GoodReads book-genre records.
///
/// Consumes [`RawBookGenre`] objects, emits one [`BookGenreRecord`] per
/// (book, genre) pair, and saves the genre index when finished.
pub struct BookGenreWriter {
    /// Index used to intern genre names into integer genre ids.
    genres: IdIndex<String>,
    /// Table writer for the book-genre output file.
    writer: TableWriter<BookGenreRecord>,
    /// Number of book-genre rows written so far.
    n_recs: usize,
}

impl BookGenreWriter {
    /// Create a writer targeting the book-genre output file.
    ///
    /// The genre index starts out empty and the record counter at zero.
    ///
    /// # Errors
    ///
    /// Fails if the output table cannot be opened.
    pub fn open() -> Result<BookGenreWriter> {
        let table = TableWriter::open(OUT_FILE)?;
        let sink = Self {
            writer: table,
            genres: IdIndex::new(),
            n_recs: 0,
        };
        Ok(sink)
    }
}

impl DataSink for BookGenreWriter {
    /// List the files this sink produces: the book-genre table and the
    /// genre index.
    fn output_files(&self) -> Vec<PathBuf> {
        let produced = [OUT_FILE, GENRE_FILE];
        path_list(&produced)
    }
}

impl ObjectWriter<RawBookGenre> for BookGenreWriter {
    /// Write one output row for every genre attached to `row`'s book.
    ///
    /// Genre names are interned into integer ids through the genre index.
    /// Genres are processed in name order so that both the emitted row order
    /// and the ids handed out for newly seen genres are reproducible for a
    /// given input sequence.
    ///
    /// # Errors
    ///
    /// Fails if `book_id` does not parse as an `i32`, or if interning a genre
    /// or writing a row fails.
    fn write_object(&mut self, row: RawBookGenre) -> Result<()> {
        let book_id: i32 = row.book_id.parse()?;

        // `HashMap` iteration order is randomized per process; iterating the
        // map directly would make genre ids and row order differ between runs
        // on identical input.  Keys are unique, so ordering by name is total.
        let mut genres: Vec<(String, i32)> = row.genres.into_iter().collect();
        genres.sort_unstable_by(|a, b| a.0.cmp(&b.0));

        for (genre, count) in genres {
            let genre_id = self.genres.intern(&genre)?;
            self.writer.write_object(BookGenreRecord {
                book_id,
                genre_id,
                count,
            })?;

            self.n_recs += 1;
        }

        Ok(())
    }

    /// Finish the book-genre table, save the genre index, and return the
    /// number of book-genre rows written.
    ///
    /// # Errors
    ///
    /// Fails if finishing the table or saving the genre index fails.
    fn finish(self) -> Result<usize> {
        self.writer.finish()?;
        self.genres.save(GENRE_FILE, "genre_id", "genre")?;
        Ok(self.n_recs)
    }
}