bookdata/goodreads/
book.rs1use parquet_derive::ParquetRecordWriter;
3use serde::Deserialize;
4
5use crate::arrow::*;
6use crate::cleaning::isbns::*;
7use crate::ids::codes::NS_GR_BOOK;
8use crate::ids::codes::NS_GR_WORK;
9use crate::parsing::*;
10use crate::prelude::*;
11
/// File name of the book identifier table (rows are `BookIdRecord`).
const ID_FILE: &str = "gr-book-ids.parquet";
/// File name of the book information table (rows are `BookRecord`).
const INFO_FILE: &str = "gr-book-info.parquet";
/// File name of the book/series table (rows are `BookSeriesRecord`).
const SERIES_FILE: &str = "gr-book-series.parquet";
/// File name of the book/author table (rows are `BookAuthorRecord`).
const AUTHOR_FILE: &str = "gr-book-authors.parquet";
16
17#[allow(dead_code)]
19#[derive(Deserialize)]
20pub struct RawBook {
21 pub book_id: String,
22 pub work_id: String,
23 pub isbn: String,
24 pub isbn13: String,
25 pub asin: String,
26 #[serde(default)]
27 pub title: String,
28 #[serde(default)]
29 pub authors: Vec<RawAuthor>,
30 #[serde(default)]
31 pub publication_year: String,
32 #[serde(default)]
33 pub publication_month: String,
34 #[serde(default)]
35 pub publication_day: String,
36 #[serde(default)]
37 pub series: Vec<String>,
38}
39
40#[derive(Deserialize)]
42pub struct RawAuthor {
43 pub author_id: String,
44 #[serde(default)]
45 pub role: String,
46}
47
48#[derive(ParquetRecordWriter)]
50pub struct BookIdRecord {
51 pub book_id: i32,
53 pub work_id: Option<i32>,
55 pub item_id: i32,
57 pub isbn10: Option<String>,
58 pub isbn13: Option<String>,
59 pub asin: Option<String>,
60}
61
62#[derive(ParquetRecordWriter)]
64pub struct BookRecord {
65 pub book_id: i32,
66 pub title: Option<String>,
67 pub pub_year: Option<u16>,
68 pub pub_month: Option<u8>,
69}
70
71#[derive(ParquetRecordWriter)]
73pub struct BookSeriesRecord {
74 pub book_id: i32,
75 pub series: String,
76}
77
78#[derive(ParquetRecordWriter)]
80pub struct BookAuthorRecord {
81 pub book_id: i32,
82 pub author_id: i32,
83 pub role: Option<String>,
84}
85
86pub struct BookWriter {
88 id_out: TableWriter<BookIdRecord>,
89 info_out: TableWriter<BookRecord>,
90 author_out: TableWriter<BookAuthorRecord>,
91 series_out: TableWriter<BookSeriesRecord>,
92}
93
94impl BookWriter {
95 pub fn open() -> Result<BookWriter> {
96 let id_out = TableWriter::open(ID_FILE)?;
97 let info_out = TableWriter::open(INFO_FILE)?;
98 let author_out = TableWriter::open(AUTHOR_FILE)?;
99 let series_out = TableWriter::open(SERIES_FILE)?;
100 Ok(BookWriter {
101 id_out,
102 info_out,
103 author_out,
104 series_out,
105 })
106 }
107}
108
109impl DataSink for BookWriter {
110 fn output_files<'a>(&'a self) -> Vec<PathBuf> {
111 path_list(&[ID_FILE, INFO_FILE, AUTHOR_FILE, SERIES_FILE])
112 }
113}
114
115impl ObjectWriter<RawBook> for BookWriter {
116 fn write_object(&mut self, row: RawBook) -> Result<()> {
117 let book_id = row.book_id.parse()?;
118 let work_id = parse_opt(&row.work_id)?;
119 let item_id = if let Some(w) = work_id {
120 NS_GR_WORK.to_code(w)
121 } else {
122 NS_GR_BOOK.to_code(book_id)
123 };
124
125 self.id_out.write_object(BookIdRecord {
126 book_id,
127 work_id,
128 item_id,
129 isbn10: trim_opt(&row.isbn)
130 .map(|s| clean_asin_chars(s))
131 .filter(|s| s.len() >= 7),
132 isbn13: trim_opt(&row.isbn13)
133 .map(|s| clean_asin_chars(s))
134 .filter(|s| s.len() >= 7),
135 asin: trim_opt(&row.asin)
136 .map(|s| clean_asin_chars(s))
137 .filter(|s| s.len() >= 7),
138 })?;
139
140 let pub_year = parse_opt(&row.publication_year)?;
141 let pub_month = parse_opt(&row.publication_month)?;
142
143 self.info_out.write_object(BookRecord {
144 book_id,
145 title: trim_owned(&row.title),
146 pub_year,
147 pub_month,
148 })?;
149
150 for author in row.authors {
151 self.author_out.write_object(BookAuthorRecord {
152 book_id,
153 author_id: author.author_id.parse()?,
154 role: Some(author.role).filter(|s| !s.is_empty()),
155 })?;
156 }
157
158 for series in row.series {
159 self.series_out
160 .write_object(BookSeriesRecord { book_id, series })?;
161 }
162
163 Ok(())
164 }
165
166 fn finish(self) -> Result<usize> {
167 self.id_out.finish()?;
168 self.info_out.finish()?;
169 self.author_out.finish()?;
170 self.series_out.finish()?;
171 Ok(0)
172 }
173}