1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
use anyhow::Result;
use std::path::Path;

use super::record::*;
use crate::arrow::*;
use crate::io::*;

/// Flat MARC field record.
///
/// One row of the flattened field table. [`FieldOutput`] emits one row for a
/// record's leader, one row per control field, and one row per subfield of
/// each data field.
#[derive(TableRow, Debug, Default)]
pub struct FieldRecord {
    /// Identifier of the record this row belongs to. `FieldOutput` assigns
    /// these sequentially, starting at 1 for the first record it writes.
    pub rec_id: u32,
    /// Position of this row within its record. `FieldOutput` uses 0 for the
    /// leader row and increments by one for every subsequent row.
    pub fld_no: u32,
    /// Field tag. `FieldOutput` writes -1 for the leader row.
    pub tag: i16,
    /// First indicator. `FieldOutput` writes 0 for leader and control-field rows.
    pub ind1: u8,
    /// Second indicator. `FieldOutput` writes 0 for leader and control-field rows.
    pub ind2: u8,
    /// Subfield code. `FieldOutput` writes 0 for leader and control-field rows.
    pub sf_code: u8,
    /// Text of the row: the leader, a control field's content, or a
    /// subfield's content.
    pub contents: String,
}

/// Output for writing flat MARC fields to Parquet.
///
/// Wraps a [`TableWriter`] of [`FieldRecord`] rows and keeps a running record
/// counter that is used to assign record identifiers.
pub struct FieldOutput {
    /// Number of records written so far; incremented before each record is
    /// written and used as that record's `rec_id`.
    rec_count: u32,
    /// Underlying writer that receives the flattened rows.
    writer: TableWriter<FieldRecord>,
}

impl FieldOutput {
    /// Wrap an existing table writer, starting with no records written.
    pub fn new(writer: TableWriter<FieldRecord>) -> FieldOutput {
        Self {
            writer,
            rec_count: 0,
        }
    }

    /// Open a table writer for `path` and wrap it in a field output.
    ///
    /// # Errors
    ///
    /// Returns the error from `TableWriter::open` if the writer cannot be opened.
    pub fn open<P: AsRef<Path>>(path: P) -> Result<FieldOutput> {
        Ok(Self::new(TableWriter::open(path)?))
    }
}

impl DataSink for FieldOutput {
    /// Report the output files of the wrapped table writer.
    fn output_files(&self) -> Vec<std::path::PathBuf> {
        let Self { writer, .. } = self;
        writer.output_files()
    }
}

impl ObjectWriter<MARCRecord> for FieldOutput {
    /// Flatten one MARC record into [`FieldRecord`] rows and write them.
    ///
    /// Rows are emitted in this order, all sharing the same `rec_id`:
    ///
    /// 1. the leader, with `fld_no` 0 and the marker tag -1;
    /// 2. one row per control field;
    /// 3. one row per subfield of each data field.
    ///
    /// `fld_no` advances once per emitted row, so each subfield of a data
    /// field gets its own number, and a data field without subfields
    /// produces no rows at all.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by the underlying writer; rows
    /// already written for this record are not rolled back.
    fn write_object(&mut self, rec: MARCRecord) -> Result<()> {
        // Bump the counter first so that record identifiers start at 1.
        self.rec_count += 1;
        let rec_id = self.rec_count;
        let mut fld_no = 0;

        // write the leader
        self.writer.write_object(FieldRecord {
            rec_id,
            fld_no,
            tag: -1,
            ind1: 0,
            ind2: 0,
            sf_code: 0,
            contents: rec.leader,
        })?;

        // write the control fields (no indicators or subfield code)
        for cf in rec.control {
            fld_no += 1;
            self.writer.write_object(FieldRecord {
                rec_id,
                fld_no,
                tag: cf.tag.into(),
                ind1: 0,
                ind2: 0,
                sf_code: 0,
                contents: cf.content,
            })?;
        }

        // write the data fields, one row per subfield
        for df in rec.fields {
            for sf in df.subfields {
                fld_no += 1;
                self.writer.write_object(FieldRecord {
                    rec_id,
                    fld_no,
                    tag: df.tag,
                    ind1: df.ind1.into(),
                    ind2: df.ind2.into(),
                    sf_code: sf.code.into(),
                    contents: sf.content,
                })?;
            }
        }

        Ok(())
    }

    /// Finish the underlying table writer and return its result.
    ///
    /// NOTE(review): the returned `usize` is whatever `TableWriter::finish`
    /// reports (presumably a count of rows written — confirm there).
    fn finish(self) -> Result<usize> {
        self.writer.finish()
    }
}