bookdata/goodreads/
book.rs

//! GoodReads book schemas and record processing.
2use parquet_derive::ParquetRecordWriter;
3use serde::Deserialize;
4
5use crate::arrow::*;
6use crate::cleaning::isbns::*;
7use crate::ids::codes::NS_GR_BOOK;
8use crate::ids::codes::NS_GR_WORK;
9use crate::parsing::*;
10use crate::prelude::*;
11
// Names of the Parquet files produced by `BookWriter`. A `const` string
// slice is implicitly `'static`, so the lifetime is not spelled out.

/// Output file holding `BookIdRecord` rows.
const ID_FILE: &str = "gr-book-ids.parquet";
/// Output file holding `BookRecord` rows.
const INFO_FILE: &str = "gr-book-info.parquet";
/// Output file holding `BookSeriesRecord` rows.
const SERIES_FILE: &str = "gr-book-series.parquet";
/// Output file holding `BookAuthorRecord` rows.
const AUTHOR_FILE: &str = "gr-book-authors.parquet";
16
/// The raw records we read from JSON.
///
/// Every scalar is kept as a `String` here; parsing, trimming and cleaning
/// happen later in `BookWriter::write_object`. Fields carrying
/// `#[serde(default)]` may be missing from the input, in which case they take
/// their type's default (empty string / empty vector); the other fields have
/// no default declared.
// Not every field is consumed by the writer (`publication_day` is never read
// in this file), which is why dead-code warnings are silenced.
#[allow(dead_code)]
#[derive(Deserialize)]
pub struct RawBook {
    /// Book identifier as text; parsed to an integer when the book is written.
    pub book_id: String,
    /// Work identifier as text; run through `parse_opt` to get an optional integer.
    pub work_id: String,
    /// Raw ISBN text; becomes the `isbn10` output column after trimming and cleaning.
    pub isbn: String,
    /// Raw ISBN-13 text; becomes the `isbn13` output column after trimming and cleaning.
    pub isbn13: String,
    /// Raw ASIN text; becomes the `asin` output column after trimming and cleaning.
    pub asin: String,
    /// Book title; passed through `trim_owned` before being written.
    #[serde(default)]
    pub title: String,
    /// Authors attached to the book; each one yields a book/author link row.
    #[serde(default)]
    pub authors: Vec<RawAuthor>,
    /// Publication year as text; run through `parse_opt` to get an optional `u16`.
    #[serde(default)]
    pub publication_year: String,
    /// Publication month as text; run through `parse_opt` to get an optional `u8`.
    #[serde(default)]
    pub publication_month: String,
    /// Publication day as text; deserialized but not written to any output here.
    #[serde(default)]
    pub publication_day: String,
    /// Series entries for the book; each one yields a book/series link row, stored verbatim.
    #[serde(default)]
    pub series: Vec<String>,
}
39
/// The raw author records from JSON.
///
/// One element of a [`RawBook`]'s `authors` list.
#[derive(Deserialize)]
pub struct RawAuthor {
    /// Author identifier as text; parsed to an integer when the link row is written.
    pub author_id: String,
    /// Role of the author on the book. May be missing from the input; an empty
    /// value is written as a missing role.
    #[serde(default)]
    pub role: String,
}
47
/// The book ID records to write to Parquet.
///
/// One row is written per input book (to `gr-book-ids.parquet`), tying the
/// book's identifiers to its cleaned ISBN and ASIN values.
#[derive(ParquetRecordWriter)]
pub struct BookIdRecord {
    /// The book ID, parsed from the raw `book_id` string.
    pub book_id: i32,
    /// The work ID, parsed from the raw `work_id` string; `None` when
    /// `parse_opt` yields no value.
    pub work_id: Option<i32>,
    /// The integrated item ID: the work ID mapped through `NS_GR_WORK` when the
    /// book has a work, otherwise the book ID mapped through `NS_GR_BOOK`.
    pub item_id: i32,
    /// Raw `isbn` value after `trim_opt` and `clean_asin_chars`; `None` when
    /// nothing is left or the cleaned text is shorter than 7 bytes.
    pub isbn10: Option<String>,
    /// Raw `isbn13` value after `trim_opt` and `clean_asin_chars`; `None` when
    /// nothing is left or the cleaned text is shorter than 7 bytes.
    pub isbn13: Option<String>,
    /// Raw `asin` value after `trim_opt` and `clean_asin_chars`; `None` when
    /// nothing is left or the cleaned text is shorter than 7 bytes.
    pub asin: Option<String>,
}
61
/// Book info records to actually write.
///
/// One row is written per input book (to `gr-book-info.parquet`).
#[derive(ParquetRecordWriter)]
pub struct BookRecord {
    /// The book ID, parsed from the raw `book_id` string.
    pub book_id: i32,
    /// The raw title after `trim_owned`; presumably `None` for a blank
    /// title — confirm against `trim_owned`.
    pub title: Option<String>,
    /// Publication year parsed from the raw text, when `parse_opt` yields one.
    pub pub_year: Option<u16>,
    /// Publication month parsed from the raw text, when `parse_opt` yields one.
    pub pub_month: Option<u8>,
}
70
/// Book series linking records.
///
/// One row is written per entry in a book's `series` list (to
/// `gr-book-series.parquet`).
#[derive(ParquetRecordWriter)]
pub struct BookSeriesRecord {
    /// The book ID, parsed from the raw `book_id` string.
    pub book_id: i32,
    /// The series entry, copied unchanged from the raw record.
    pub series: String,
}
77
/// Book author linking records.
///
/// One row is written per entry in a book's `authors` list (to
/// `gr-book-authors.parquet`).
#[derive(ParquetRecordWriter)]
pub struct BookAuthorRecord {
    /// The book ID, parsed from the raw `book_id` string.
    pub book_id: i32,
    /// The author ID, parsed from the raw `author_id` string.
    pub author_id: i32,
    /// The author's role on this book; `None` when the raw role is empty.
    pub role: Option<String>,
}
85
/// Output handler for GoodReads books.
///
/// Holds one table writer per output file; each incoming [`RawBook`] is split
/// across them by the `ObjectWriter` implementation.
pub struct BookWriter {
    /// Writer for the identifier table (`gr-book-ids.parquet`).
    id_out: TableWriter<BookIdRecord>,
    /// Writer for the book information table (`gr-book-info.parquet`).
    info_out: TableWriter<BookRecord>,
    /// Writer for the book/author link table (`gr-book-authors.parquet`).
    author_out: TableWriter<BookAuthorRecord>,
    /// Writer for the book/series link table (`gr-book-series.parquet`).
    series_out: TableWriter<BookSeriesRecord>,
}
93
94impl BookWriter {
95    pub fn open() -> Result<BookWriter> {
96        let id_out = TableWriter::open(ID_FILE)?;
97        let info_out = TableWriter::open(INFO_FILE)?;
98        let author_out = TableWriter::open(AUTHOR_FILE)?;
99        let series_out = TableWriter::open(SERIES_FILE)?;
100        Ok(BookWriter {
101            id_out,
102            info_out,
103            author_out,
104            series_out,
105        })
106    }
107}
108
109impl DataSink for BookWriter {
110    fn output_files<'a>(&'a self) -> Vec<PathBuf> {
111        path_list(&[ID_FILE, INFO_FILE, AUTHOR_FILE, SERIES_FILE])
112    }
113}
114
115impl ObjectWriter<RawBook> for BookWriter {
116    fn write_object(&mut self, row: RawBook) -> Result<()> {
117        let book_id = row.book_id.parse()?;
118        let work_id = parse_opt(&row.work_id)?;
119        let item_id = if let Some(w) = work_id {
120            NS_GR_WORK.to_code(w)
121        } else {
122            NS_GR_BOOK.to_code(book_id)
123        };
124
125        self.id_out.write_object(BookIdRecord {
126            book_id,
127            work_id,
128            item_id,
129            isbn10: trim_opt(&row.isbn)
130                .map(|s| clean_asin_chars(s))
131                .filter(|s| s.len() >= 7),
132            isbn13: trim_opt(&row.isbn13)
133                .map(|s| clean_asin_chars(s))
134                .filter(|s| s.len() >= 7),
135            asin: trim_opt(&row.asin)
136                .map(|s| clean_asin_chars(s))
137                .filter(|s| s.len() >= 7),
138        })?;
139
140        let pub_year = parse_opt(&row.publication_year)?;
141        let pub_month = parse_opt(&row.publication_month)?;
142
143        self.info_out.write_object(BookRecord {
144            book_id,
145            title: trim_owned(&row.title),
146            pub_year,
147            pub_month,
148        })?;
149
150        for author in row.authors {
151            self.author_out.write_object(BookAuthorRecord {
152                book_id,
153                author_id: author.author_id.parse()?,
154                role: Some(author.role).filter(|s| !s.is_empty()),
155            })?;
156        }
157
158        for series in row.series {
159            self.series_out
160                .write_object(BookSeriesRecord { book_id, series })?;
161        }
162
163        Ok(())
164    }
165
166    fn finish(self) -> Result<usize> {
167        self.id_out.finish()?;
168        self.info_out.finish()?;
169        self.author_out.finish()?;
170        self.series_out.finish()?;
171        Ok(0)
172    }
173}