bookdata/goodreads/
genres.rs1use std::collections::HashMap;
3
4use parquet_derive::ParquetRecordWriter;
5use serde::Deserialize;
6
7use crate::arrow::*;
8use crate::ids::index::IdIndex;
9use crate::prelude::*;
10
/// Name of the Parquet file that receives the book-genre records.
const OUT_FILE: &str = "gr-book-genres.parquet";
/// Name of the Parquet file that receives the saved genre index.
const GENRE_FILE: &str = "gr-genres.parquet";
13
14#[derive(Deserialize)]
16pub struct RawBookGenre {
17 pub book_id: String,
18 #[serde(default)]
19 pub genres: HashMap<String, i32>,
20}
21
22#[derive(ParquetRecordWriter)]
24pub struct BookGenreRecord {
25 pub book_id: i32,
26 pub genre_id: i32,
27 pub count: i32,
28}
29
30pub struct BookGenreWriter {
32 genres: IdIndex<String>,
33 writer: TableWriter<BookGenreRecord>,
34 n_recs: usize,
35}
36
37impl BookGenreWriter {
38 pub fn open() -> Result<BookGenreWriter> {
40 let writer = TableWriter::open(OUT_FILE)?;
41 Ok(BookGenreWriter {
42 genres: IdIndex::new(),
43 writer,
44 n_recs: 0,
45 })
46 }
47}
48
49impl DataSink for BookGenreWriter {
50 fn output_files(&self) -> Vec<PathBuf> {
51 path_list(&[OUT_FILE, GENRE_FILE])
52 }
53}
54
55impl ObjectWriter<RawBookGenre> for BookGenreWriter {
56 fn write_object(&mut self, row: RawBookGenre) -> Result<()> {
57 let book_id: i32 = row.book_id.parse()?;
58
59 for (genre, count) in row.genres {
60 let genre_id = self.genres.intern(&genre)?;
61 self.writer.write_object(BookGenreRecord {
62 book_id,
63 genre_id,
64 count,
65 })?;
66
67 self.n_recs += 1;
68 }
69
70 Ok(())
71 }
72
73 fn finish(self) -> Result<usize> {
74 self.writer.finish()?;
75 self.genres.save(GENRE_FILE, "genre_id", "genre")?;
76 Ok(self.n_recs)
77 }
78}