bookdata/cli/goodreads/
scan.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
use crate::goodreads::*;
use crate::io::object::{ChunkWriter, ThreadObjectWriter, UnchunkWriter};
use crate::prelude::*;
use crate::util::logging::data_progress;
use serde::de::DeserializeOwned;

// Subcommands for scanning GoodReads data, one variant per kind of input.
// Every variant carries the same `ScanInput` options; `GRScan::exec` picks
// the matching writer and runs the scan.
//
// NOTE: the `///` comments on the variants are consumed by clap's derive
// macro as the subcommands' help text, so rewording them changes CLI output.
#[derive(clap::Subcommand, Debug)]
pub enum GRScan {
    /// Scan GoodReads works.
    Works(ScanInput),
    /// Scan GoodReads books.
    Books(ScanInput),
    /// Scan GoodReads genres.
    Genres(ScanInput),
    /// Scan GoodReads authors.
    Authors(ScanInput),
    /// Scan GoodReads interactions.
    Interactions(ScanInput),
    /// Scan GoodReads reviews.
    Reviews(ScanInput),
}

// Command-line options shared by every `GRScan` subcommand.
#[derive(Args, Debug)]
pub struct ScanInput {
    // Path handed to `scan_gr`, which opens it with `LineProcessor::open_gzip`
    // (so presumably a gzip-compressed file of JSON lines; confirm against the
    // `LineProcessor` implementation). Exposed on the CLI under the name `INPUT`.
    /// Input file
    #[arg(name = "INPUT")]
    infile: PathBuf,
}

/// Scan one GoodReads input file, sending each parsed record to `proc`.
///
/// The file at `path` is opened with `LineProcessor::open_gzip` and its
/// records (of type `R`) are pushed through `process_json` into a writer
/// chain built around `proc`: `ChunkWriter` -> `ThreadObjectWriter` (spawned
/// under the name "output") -> `UnchunkWriter`.  Once the input has been
/// processed the chain is finished and the size of every file reported by
/// `proc.output_files()` is logged.
///
/// # Errors
///
/// Propagates any error from opening the input, processing it, finishing
/// the writer, or querying an output file's size.
fn scan_gr<R, W>(path: &Path, proc: W) -> Result<()>
where
    W: ObjectWriter<R> + DataSink + Send + Sync + 'static,
    R: DeserializeOwned + Send + Sync + 'static,
{
    // Grab the output paths first: `proc` is moved into the writer chain below.
    let out_paths: Vec<_> = proc.output_files();

    info!("reading data from {}", path.display());
    let progress = data_progress(0);
    let reader = LineProcessor::open_gzip(path, progress.clone())?;

    // Assemble the writer chain around the caller-supplied sink.
    let threaded = ThreadObjectWriter::wrap(ChunkWriter::new(proc))
        .with_name("output")
        .spawn();
    let mut sink = UnchunkWriter::new(threaded);

    reader.process_json(&mut sink)?;
    progress.finish_and_clear();

    sink.finish()?;

    // Report how large each output file ended up.  The size lookup stays
    // inside the log macro's arguments so it is evaluated exactly when the
    // original evaluated it.
    for out_path in &out_paths {
        info!(
            "output {} is {}",
            out_path.as_path().display(),
            friendly::bytes(file_size(out_path.as_path())?)
        );
    }

    Ok(())
}

impl GRScan {
    /// Run the selected GoodReads scan.
    ///
    /// Each subcommand opens the writer for its record type and hands it,
    /// together with the input path from the subcommand's `ScanInput`, to
    /// `scan_gr`.
    ///
    /// # Errors
    ///
    /// Returns an error if the writer cannot be opened or if `scan_gr` fails.
    pub fn exec(&self) -> Result<()> {
        match self {
            GRScan::Works(opts) => {
                info!("scanning GoodReads works");
                scan_gr(&opts.infile, work::WorkWriter::open()?)?;
            }
            GRScan::Books(opts) => {
                info!("scanning GoodReads books");
                scan_gr(&opts.infile, book::BookWriter::open()?)?;
            }
            GRScan::Genres(opts) => {
                info!("scanning GoodReads book genres");
                scan_gr(&opts.infile, genres::BookGenreWriter::open()?)?;
            }
            GRScan::Authors(opts) => {
                // This arm previously logged the genres message (copy-paste
                // slip); it now names the data actually being scanned.
                info!("scanning GoodReads authors");
                scan_gr(&opts.infile, author::AuthorWriter::open()?)?;
            }
            GRScan::Interactions(opts) => {
                info!("scanning GoodReads interactions");
                scan_gr(&opts.infile, interaction::IntWriter::open()?)?;
            }
            GRScan::Reviews(opts) => {
                info!("scanning GoodReads reviews");
                scan_gr(&opts.infile, review::ReviewWriter::open()?)?;
            }
        }

        Ok(())
    }
}