bookdata/goodreads/
author.rs

//! GoodReads author schemas and record processing.
2use parquet_derive::ParquetRecordWriter;
3use serde::Deserialize;
4
5use crate::arrow::*;
6use crate::parsing::*;
7use crate::prelude::*;
8
/// Name of the Parquet file this module writes author records to.
const OUT_FILE: &str = "gr-author-info.parquet";
10
/// Author records as parsed from JSON.
#[derive(Deserialize)]
pub struct RawAuthor {
    // GoodReads author identifier, kept as a string until parsed downstream.
    pub author_id: String,
    // Author display name as it appears in the source data.
    pub name: String,
}
17
/// Rows in the processed author Parquet table.
#[derive(ParquetRecordWriter)]
pub struct AuthorRecord {
    // Numeric author ID parsed from the raw string identifier.
    pub author_id: i32,
    // Trimmed author name; None when trimming leaves nothing.
    pub name: Option<String>,
}
24
/// Object writer to transform and write GoodReads authors.
pub struct AuthorWriter {
    // Parquet table writer receiving the processed author rows.
    writer: TableWriter<AuthorRecord>,
    // Count of records written so far; returned by `finish`.
    n_recs: usize,
}
30
31impl AuthorWriter {
32    /// Open a new output
33    pub fn open() -> Result<AuthorWriter> {
34        let writer = TableWriter::open(OUT_FILE)?;
35        Ok(AuthorWriter { writer, n_recs: 0 })
36    }
37}
38
impl DataSink for AuthorWriter {
    // Report the file(s) this sink produces, for output/staging bookkeeping.
    fn output_files(&self) -> Vec<PathBuf> {
        path_list(&[OUT_FILE])
    }
}
44
45impl ObjectWriter<RawAuthor> for AuthorWriter {
46    fn write_object(&mut self, row: RawAuthor) -> Result<()> {
47        let author_id: i32 = row.author_id.parse()?;
48
49        self.writer.write_object(AuthorRecord {
50            author_id,
51            name: trim_owned(&row.name),
52        })?;
53
54        self.n_recs += 1;
55        Ok(())
56    }
57
58    fn finish(self) -> Result<usize> {
59        self.writer.finish()?;
60        Ok(self.n_recs)
61    }
62}