bookdata/goodreads/
genres.rs

1//! GoodReads book genre records.
2use std::collections::HashMap;
3
4use parquet_derive::ParquetRecordWriter;
5use serde::Deserialize;
6
7use crate::arrow::*;
8use crate::ids::index::IdIndex;
9use crate::prelude::*;
10
/// Name of the file that receives the book-genre rows.
const OUT_FILE: &str = "gr-book-genres.parquet";
/// Name of the file that receives the saved genre index.
const GENRE_FILE: &str = "gr-genres.parquet";
13
14/// Book-genre records as parsed from JSON.
15#[derive(Deserialize)]
16pub struct RawBookGenre {
17    pub book_id: String,
18    #[serde(default)]
19    pub genres: HashMap<String, i32>,
20}
21
22/// Rows in the processed book-genre Parquet table.
23#[derive(ParquetRecordWriter)]
24pub struct BookGenreRecord {
25    pub book_id: i32,
26    pub genre_id: i32,
27    pub count: i32,
28}
29
30/// Object writer to transform and write GoodReads book-genre records
31pub struct BookGenreWriter {
32    genres: IdIndex<String>,
33    writer: TableWriter<BookGenreRecord>,
34    n_recs: usize,
35}
36
37impl BookGenreWriter {
38    /// Open a new output
39    pub fn open() -> Result<BookGenreWriter> {
40        let writer = TableWriter::open(OUT_FILE)?;
41        Ok(BookGenreWriter {
42            genres: IdIndex::new(),
43            writer,
44            n_recs: 0,
45        })
46    }
47}
48
49impl DataSink for BookGenreWriter {
50    fn output_files(&self) -> Vec<PathBuf> {
51        path_list(&[OUT_FILE, GENRE_FILE])
52    }
53}
54
55impl ObjectWriter<RawBookGenre> for BookGenreWriter {
56    fn write_object(&mut self, row: RawBookGenre) -> Result<()> {
57        let book_id: i32 = row.book_id.parse()?;
58
59        for (genre, count) in row.genres {
60            let genre_id = self.genres.intern(&genre)?;
61            self.writer.write_object(BookGenreRecord {
62                book_id,
63                genre_id,
64                count,
65            })?;
66
67            self.n_recs += 1;
68        }
69
70        Ok(())
71    }
72
73    fn finish(self) -> Result<usize> {
74        self.writer.finish()?;
75        self.genres.save(GENRE_FILE, "genre_id", "genre")?;
76        Ok(self.n_recs)
77    }
78}