bookdata/goodreads/author.rs

//! GoodReads author schemas and record processing.
use parquet_derive::ParquetRecordWriter;
use serde::Deserialize;

use crate::arrow::*;
use crate::parsing::*;
use crate::prelude::*;

const OUT_FILE: &str = "gr-author-info.parquet";

/// Author records as parsed from JSON.
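///
/// Each input line holds one JSON object; the expected shape is
/// `{"author_id": "12345", "name": "Some Author"}` (values illustrative).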
#[derive(Deserialize)]
pub struct RawAuthor {
    pub author_id: String,
    pub name: String,
}

/// Rows in the processed author Parquet table.
#[derive(ParquetRecordWriter)]
pub struct AuthorRecord {
    pub author_id: i32,
    pub name: Option<String>,
}

/// Object writer to transform and write GoodReads authors.
pub struct AuthorWriter {
    writer: TableWriter<AuthorRecord>,
    n_recs: usize,
}

impl AuthorWriter {
    /// Open a new author output writer.
    pub fn open() -> Result<AuthorWriter> {
        let writer = TableWriter::open(OUT_FILE)?;
        Ok(AuthorWriter { writer, n_recs: 0 })
    }
}

impl DataSink for AuthorWriter {
    // List the output file(s) this sink produces.
    fn output_files(&self) -> Vec<PathBuf> {
        path_list(&[OUT_FILE])
    }
}

impl ObjectWriter<RawAuthor> for AuthorWriter {
    fn write_object(&mut self, row: RawAuthor) -> Result<()> {
        // Author IDs arrive as JSON strings; parse to a numeric ID.
        let author_id: i32 = row.author_id.parse()?;

        self.writer.write_object(AuthorRecord {
            author_id,
            // Trim surrounding whitespace from the author name.
            name: trim_owned(&row.name),
        })?;

        self.n_recs += 1;
        Ok(())
    }

    fn finish(self) -> Result<usize> {
        self.writer.finish()?;
        Ok(self.n_recs)
    }
}
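
// A minimal usage sketch, assuming `serde_json` is available as a dependency
// and that input arrives as newline-delimited JSON; the real pipeline's
// driver may feed the writer differently. It exercises the full path —
// decode, transform, write, finish — and creates `gr-author-info.parquet`
// in the working directory when run.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn round_trip_one_author() -> Result<()> {
        let mut writer = AuthorWriter::open()?;
        let raw: RawAuthor =
            serde_json::from_str(r#"{"author_id": "12345", "name": " Some Author "}"#)?;
        writer.write_object(raw)?;
        // finish() flushes the Parquet file and reports the record count.
        assert_eq!(writer.finish()?, 1);
        Ok(())
    }
}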