bookdata/goodreads/
review.rs1use parquet_derive::ParquetRecordWriter;
3pub use serde::Deserialize;
4
5use crate::arrow::*;
6use crate::ids::index::IdIndex;
7use crate::parsing::dates::*;
8use crate::parsing::*;
9use crate::prelude::*;
10
11use super::ids::load_id_links;
12use super::ids::BookLinkMap;
13use super::users::load_user_index;
14
/// Name of the Parquet file that review records are written to.
const OUT_FILE: &str = "gr-reviews.parquet";
16
17#[derive(Deserialize)]
19pub struct RawReview {
20 pub user_id: String,
21 pub book_id: String,
22 pub review_id: String,
23 pub rating: f32,
24 pub review_text: String,
25 pub n_votes: i32,
26 pub date_added: String,
27 pub date_updated: String,
28 pub read_at: String,
29 pub started_at: String,
30}
31
32#[derive(ParquetRecordWriter)]
34pub struct ReviewRecord {
35 pub rec_id: u32,
37 pub review_id: i64,
39 pub user_id: i32,
41 pub book_id: i32,
43 pub work_id: Option<i32>,
45 pub cluster: i32,
49 pub item_id: i32,
51 pub rating: Option<f32>,
53 pub review: String,
55 pub n_votes: i32,
57 pub added: f32,
59 pub updated: f32,
61}
62
63pub struct ReviewWriter {
65 writer: TableWriter<ReviewRecord>,
66 users: IdIndex<String>,
67 books: BookLinkMap,
68 n_recs: u32,
69}
70
71impl ReviewWriter {
72 pub fn open() -> Result<ReviewWriter> {
74 let writer = TableWriter::open(OUT_FILE)?;
75 let users = load_user_index()?.freeze();
76 let books = load_id_links()?;
77 Ok(ReviewWriter {
78 writer,
79 users,
80 books,
81 n_recs: 0,
82 })
83 }
84}
85
86impl DataSink for ReviewWriter {
87 fn output_files(&self) -> Vec<PathBuf> {
88 path_list(&[OUT_FILE])
89 }
90}
91
92impl ObjectWriter<RawReview> for ReviewWriter {
93 fn write_object(&mut self, row: RawReview) -> Result<()> {
95 self.n_recs += 1;
96 let rec_id = self.n_recs;
97 let user_id = self.users.intern_owned(row.user_id)?;
98 let book_id: i32 = row.book_id.parse()?;
99 let (rev_hi, rev_lo) = decode_hex_i64_pair(&row.review_id)?;
100 let review_id = rev_hi ^ rev_lo;
102 let link = self
103 .books
104 .get(&book_id)
105 .ok_or_else(|| anyhow!("unknown book ID"))?;
106
107 self.writer.write_object(ReviewRecord {
108 rec_id,
109 review_id,
110 user_id,
111 book_id,
112 work_id: link.work_id,
113 item_id: link.item_id(),
114 cluster: link.cluster,
115 review: row.review_text,
116 rating: if row.rating > 0.0 {
117 Some(row.rating)
118 } else {
119 None
120 },
121 n_votes: row.n_votes,
122 added: parse_gr_date(&row.date_added).map(check_ts("added", 2000))?,
123 updated: parse_gr_date(&row.date_updated).map(check_ts("updated", 2000))?,
124 })?;
125
126 Ok(())
127 }
128
129 fn finish(self) -> Result<usize> {
131 info!(
132 "wrote {} records for {} users, closing output",
133 self.n_recs,
134 self.users.len()
135 );
136 self.writer.finish()
137 }
138}