bookdata/goodreads/
work.rs

1//! GoodReads work schemas and record processing.
2use parquet_derive::ParquetRecordWriter;
3use serde::Deserialize;
4
5use crate::arrow::*;
6use crate::parsing::*;
7use crate::prelude::*;
8
/// Name of the Parquet file that receives the processed work records.
const OUT_FILE: &str = "gr-work-info.parquet";
10
11/// Work records as parsed from JSON.
12#[derive(Deserialize)]
13pub struct RawWork {
14    pub work_id: String,
15    #[serde(default)]
16    pub original_title: String,
17    #[serde(default)]
18    pub original_publication_year: String,
19    #[serde(default)]
20    pub original_publication_month: String,
21    #[serde(default)]
22    pub original_publication_day: String,
23}
24
25/// Rows in the processed work Parquet table.
26#[derive(ParquetRecordWriter)]
27pub struct WorkRecord {
28    pub work_id: i32,
29    pub title: Option<String>,
30    pub pub_year: Option<i16>,
31    pub pub_month: Option<u8>,
32}
33
34/// Object writer to transform and write GoodReads works
35pub struct WorkWriter {
36    writer: TableWriter<WorkRecord>,
37    n_recs: usize,
38}
39
40impl WorkWriter {
41    /// Open a new output
42    pub fn open() -> Result<WorkWriter> {
43        let writer = TableWriter::open(OUT_FILE)?;
44        Ok(WorkWriter { writer, n_recs: 0 })
45    }
46}
47
48impl DataSink for WorkWriter {
49    fn output_files(&self) -> Vec<PathBuf> {
50        path_list(&[OUT_FILE])
51    }
52}
53
54impl ObjectWriter<RawWork> for WorkWriter {
55    fn write_object(&mut self, row: RawWork) -> Result<()> {
56        let work_id: i32 = row.work_id.parse()?;
57
58        let pub_year = parse_opt(&row.original_publication_year)?;
59        let pub_month = parse_opt(&row.original_publication_month)?;
60
61        self.writer.write_object(WorkRecord {
62            work_id,
63            title: trim_owned(&row.original_title),
64            pub_year,
65            pub_month,
66        })?;
67        self.n_recs += 1;
68        Ok(())
69    }
70
71    fn finish(self) -> Result<usize> {
72        self.writer.finish()?;
73        Ok(self.n_recs)
74    }
75}