bookdata/goodreads/
book.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
//! GoodReads book schemas and record processing.
use parquet_derive::ParquetRecordWriter;
use serde::Deserialize;

use crate::arrow::*;
use crate::cleaning::isbns::*;
use crate::ids::codes::NS_GR_BOOK;
use crate::ids::codes::NS_GR_WORK;
use crate::parsing::*;
use crate::prelude::*;

/// Output file for the book identifier table.
const ID_FILE: &str = "gr-book-ids.parquet";
/// Output file for the book information table.
const INFO_FILE: &str = "gr-book-info.parquet";
/// Output file for the book-to-series links.
const SERIES_FILE: &str = "gr-book-series.parquet";
/// Output file for the book-to-author links.
const AUTHOR_FILE: &str = "gr-book-authors.parquet";

/// A raw book record as deserialized from the JSON input.
///
/// Every scalar value is kept as a string here; parsing and cleaning happen
/// when the record is written out by [`BookWriter`].  Fields marked
/// `#[serde(default)]` may be missing from the input, in which case they take
/// their type's default (empty) value.
#[derive(Deserialize)]
pub struct RawBook {
    /// Book identifier; parsed to an integer when the record is written.
    pub book_id: String,
    /// Work identifier; read with `parse_opt` into an optional integer.
    pub work_id: String,
    /// Source of the `isbn10` output column.
    pub isbn: String,
    /// Source of the `isbn13` output column.
    pub isbn13: String,
    /// Source of the `asin` output column.
    pub asin: String,
    /// Book title; passed through `trim_owned` before being written.
    #[serde(default)]
    pub title: String,
    /// Author entries attached to the book; one output row is written per entry.
    #[serde(default)]
    pub authors: Vec<RawAuthor>,
    /// Publication year; read with `parse_opt` into an optional `u16`.
    #[serde(default)]
    pub publication_year: String,
    /// Publication month; read with `parse_opt` into an optional `u8`.
    #[serde(default)]
    pub publication_month: String,
    /// Publication day; deserialized but not used by the writer in this file.
    #[serde(default)]
    pub publication_day: String,
    /// Series entries for the book; one output row is written per entry.
    #[serde(default)]
    pub series: Vec<String>,
}

/// A raw author entry nested inside a [`RawBook`] in the JSON input.
#[derive(Deserialize)]
pub struct RawAuthor {
    /// Author identifier; parsed to an integer when the record is written.
    pub author_id: String,
    /// Author role; defaults to the empty string when missing, and an empty
    /// role is written out as `None`.
    #[serde(default)]
    pub role: String,
}

/// Book identifier row written to the book ID Parquet table.
#[derive(ParquetRecordWriter)]
pub struct BookIdRecord {
    /// Book identifier parsed from the raw record.
    pub book_id: i32,
    /// Work identifier, if the raw record supplied one.
    pub work_id: Option<i32>,
    /// Item code: computed from the work ID via `NS_GR_WORK` when a work ID is
    /// present, otherwise from the book ID via `NS_GR_BOOK`.
    pub gr_item: i32,
    /// Cleaned value of the raw `isbn` field; `None` when `trim_opt` yields
    /// nothing or the cleaned value is shorter than 7 bytes.
    pub isbn10: Option<String>,
    /// Cleaned value of the raw `isbn13` field, under the same rules as `isbn10`.
    pub isbn13: Option<String>,
    /// Cleaned value of the raw `asin` field, under the same rules as `isbn10`.
    pub asin: Option<String>,
}

/// Book information row written to the book info Parquet table.
#[derive(ParquetRecordWriter)]
pub struct BookRecord {
    /// Book identifier parsed from the raw record.
    pub book_id: i32,
    /// Title as returned by `trim_owned`; `None` when that yields nothing.
    pub title: Option<String>,
    /// Publication year, if the raw record supplied one.
    pub pub_year: Option<u16>,
    /// Publication month, if the raw record supplied one.
    pub pub_month: Option<u8>,
}

/// Row linking a book to one of its series entries.
#[derive(ParquetRecordWriter)]
pub struct BookSeriesRecord {
    /// Book identifier parsed from the raw record.
    pub book_id: i32,
    /// One element of the raw record's `series` list, copied unchanged.
    pub series: String,
}

/// Row linking a book to one of its authors.
#[derive(ParquetRecordWriter)]
pub struct BookAuthorRecord {
    /// Book identifier parsed from the raw record.
    pub book_id: i32,
    /// Author identifier parsed from the raw author entry.
    pub author_id: i32,
    /// Author role; `None` when the raw role string is empty.
    pub role: Option<String>,
}

/// Output handler for GoodReads books.
///
/// Holds one table writer per output file; each incoming [`RawBook`] is split
/// into rows for these four tables.
pub struct BookWriter {
    /// Writer for book identifier rows.
    id_out: TableWriter<BookIdRecord>,
    /// Writer for book information rows.
    info_out: TableWriter<BookRecord>,
    /// Writer for book-author link rows.
    author_out: TableWriter<BookAuthorRecord>,
    /// Writer for book-series link rows.
    series_out: TableWriter<BookSeriesRecord>,
}

impl BookWriter {
    /// Open all four output tables and bundle them into a writer.
    ///
    /// The tables are opened in the order ID, info, author, series.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by `TableWriter::open`.
    pub fn open() -> Result<BookWriter> {
        Ok(Self {
            id_out: TableWriter::open(ID_FILE)?,
            info_out: TableWriter::open(INFO_FILE)?,
            author_out: TableWriter::open(AUTHOR_FILE)?,
            series_out: TableWriter::open(SERIES_FILE)?,
        })
    }
}

impl DataSink for BookWriter {
    /// List the output files this writer produces, as built by `path_list`
    /// from the four table file names.
    fn output_files(&self) -> Vec<PathBuf> {
        path_list(&[ID_FILE, INFO_FILE, AUTHOR_FILE, SERIES_FILE])
    }
}

impl ObjectWriter<RawBook> for BookWriter {
    fn write_object(&mut self, row: RawBook) -> Result<()> {
        let book_id = row.book_id.parse()?;
        let work_id = parse_opt(&row.work_id)?;
        let gr_item = if let Some(w) = work_id {
            NS_GR_WORK.to_code(w)
        } else {
            NS_GR_BOOK.to_code(book_id)
        };

        self.id_out.write_object(BookIdRecord {
            book_id,
            work_id,
            gr_item,
            isbn10: trim_opt(&row.isbn)
                .map(|s| clean_asin_chars(s))
                .filter(|s| s.len() >= 7),
            isbn13: trim_opt(&row.isbn13)
                .map(|s| clean_asin_chars(s))
                .filter(|s| s.len() >= 7),
            asin: trim_opt(&row.asin)
                .map(|s| clean_asin_chars(s))
                .filter(|s| s.len() >= 7),
        })?;

        let pub_year = parse_opt(&row.publication_year)?;
        let pub_month = parse_opt(&row.publication_month)?;

        self.info_out.write_object(BookRecord {
            book_id,
            title: trim_owned(&row.title),
            pub_year,
            pub_month,
        })?;

        for author in row.authors {
            self.author_out.write_object(BookAuthorRecord {
                book_id,
                author_id: author.author_id.parse()?,
                role: Some(author.role).filter(|s| !s.is_empty()),
            })?;
        }

        for series in row.series {
            self.series_out
                .write_object(BookSeriesRecord { book_id, series })?;
        }

        Ok(())
    }

    fn finish(self) -> Result<usize> {
        self.id_out.finish()?;
        self.info_out.finish()?;
        self.author_out.finish()?;
        self.series_out.finish()?;
        Ok(0)
    }
}