//! GoodReads work schemas and record processing.
use serde::Deserialize;

use crate::arrow::*;
use crate::parsing::*;
use crate::prelude::*;

const OUT_FILE: &str = "gr-work-info.parquet";

/// Work records as parsed from JSON.
#[derive(Deserialize)]
pub struct RawWork {
    pub work_id: String,
    #[serde(default)]
    pub original_title: String,
    #[serde(default)]
    pub original_publication_year: String,
    #[serde(default)]
    pub original_publication_month: String,
    #[serde(default)]
    pub original_publication_day: String,
}
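
// Note: the raw publication fields are strings and are frequently blank in the
// GoodReads dump; `#[serde(default)]` fills in an empty string when a field is
// missing entirely, so missing and blank values are handled the same way downstream.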

/// Rows in the processed work Parquet table.
#[derive(TableRow)]
pub struct WorkRecord {
    pub work_id: i32,
    pub title: Option<String>,
    pub pub_year: Option<i16>,
    pub pub_month: Option<u8>,
}

/// Object writer that transforms raw GoodReads work records and writes them to Parquet.
pub struct WorkWriter {
    writer: TableWriter<WorkRecord>,
    n_recs: usize,
}

impl WorkWriter {
    /// Open the output file and create a new work writer.
    pub fn open() -> Result<WorkWriter> {
        let writer = TableWriter::open(OUT_FILE)?;
        Ok(WorkWriter { writer, n_recs: 0 })
    }
}
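
// Minimal usage sketch (illustrative only; `read_gzip_lines` is a hypothetical
// helper, not part of this crate, and the file name is an assumption about the
// GoodReads dump layout):
//
//     let mut writer = WorkWriter::open()?;
//     for line in read_gzip_lines("goodreads_book_works.json.gz")? {
//         let raw: RawWork = serde_json::from_str(&line)?;
//         writer.write_object(raw)?;
//     }
//     let n_works = writer.finish()?;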

impl DataSink for WorkWriter {
    fn output_files(&self) -> Vec<PathBuf> {
        path_list(&[OUT_FILE])
    }
}

impl ObjectWriter<RawWork> for WorkWriter {
    fn write_object(&mut self, row: RawWork) -> Result<()> {
        let work_id: i32 = row.work_id.parse()?;

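        // Publication year and month arrive as strings; `parse_opt` is expected to
        // map blank strings to `None` rather than failing (an assumption based on
        // the optional output columns).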
        let pub_year = parse_opt(&row.original_publication_year)?;
        let pub_month = parse_opt(&row.original_publication_month)?;

        self.writer.write_object(WorkRecord {
            work_id,
            title: trim_owned(&row.original_title),
            pub_year,
            pub_month,
        })?;
        self.n_recs += 1;
        Ok(())
    }

    fn finish(self) -> Result<usize> {
        self.writer.finish()?;
        Ok(self.n_recs)
    }
}