bookdata/cleaning/names/mod.rs
1//! Extract and normalize author names.
2//!
3//! Names in the book data — both in author records and in their references in
4//! book records — come in a variety of formats. This module is responsible for
5//! expanding and normalizing those name formats to improve data linkability.
6//! Some records also include a year or date range for the author's lifetime. We
7//! normalize names as follows:
8//!
9//! - If the name is “Last, First”, we emit both “Last, First” and “First Last”
10//! variants.
11//! - If the name has a year, we emit each variant both with and without the
12//! year.
13//! - Leading and trailing junk is cleaned
14//!
15//! This maximizes our ability to match records across sources recording names
16//! in different formats.
17//!
18//! [`name_variants`] is the primary entry point for using this module. The
19//! [`clean_name`] function provides cleanup utilities without parsing, for
20//! emitting names from book records.
21
22use anyhow::Result;
23
24use super::strings::norm_unicode;
25
26mod parse;
27mod types;
28
29#[cfg(test)]
30mod test_cleaning;
31#[cfg(test)]
32mod test_variants;
33
34pub use types::NameError;
35use types::NameFmt;
36
37pub use parse::parse_name_entry;
38
/// Pre-clean a string without copying.
///
/// Many strings don't need advanced cleaning. This function trims the input
/// and then scans it once; if the trimmed string is already in clean form it
/// is returned as a borrowed slice, otherwise `None` is returned to signal
/// that the caller must fall back to full cleaning.
///
/// A trimmed string is rejected (`None`) when it contains any of:
///
/// - a `.` anywhere;
/// - an ASCII whitespace byte other than a plain space (tab, newline, ...),
///   since full cleaning would rewrite it as a space;
/// - two or more consecutive spaces;
/// - a `,` directly preceded by a space.
fn preclean(name: &str) -> Option<&str> {
    let name = name.trim();

    // true while the previous byte was a single plain space
    let mut after_space = false;
    for c in name.bytes() {
        match c {
            // periods are always stripped by full cleaning
            b'.' => return None,
            // a second consecutive space needs collapsing
            b' ' if after_space => return None,
            b' ' => after_space = true,
            // tabs/newlines/etc. must be normalized to a plain space
            _ if c.is_ascii_whitespace() => return None,
            // need cleaning of space and ,
            b',' if after_space => return None,
            _ => after_space = false,
        }
    }

    Some(name)
}
65
66/// Clean up a name from unnecessary special characters.
67pub fn clean_name<'a>(name: &'a str) -> String {
68 let name = norm_unicode(name);
69
70 // fast path for pretty clean strings
71 // directly copying a string is faster than our character-by-character copying,
72 // probably due to simd, so it's worth scanning for a fast path.
73 if let Some(pc) = preclean(&name) {
74 return pc.to_string();
75 }
76
77 // we use a manually-coded state machine instead of REs for performance
78 let mut res = Vec::with_capacity(name.len());
79 let mut in_seq = false;
80 for c in name.bytes() {
81 if in_seq {
82 if c.is_ascii_whitespace() || c == b'.' {
83 // no-op
84 } else if c == b',' || res.is_empty() {
85 // emit the comma and proceed
86 res.push(c);
87 in_seq = false;
88 } else {
89 // collapse whitespace sequence and proceed
90 res.push(b' ');
91 res.push(c);
92 in_seq = false;
93 }
94 } else {
95 if c.is_ascii_whitespace() || c == b'.' {
96 in_seq = true;
97 } else {
98 res.push(c);
99 }
100 }
101 }
102 unsafe {
103 // since we have copied bytes, except for ASCII manipulations, this is safe
104 String::from_utf8_unchecked(res)
105 }
106}
107
108/// Extract all variants from a name.
109///
110/// See the [module documentation][self] for details on this parsing process.
111pub fn name_variants(name: &str) -> Result<Vec<String>, NameError> {
112 let parse = parse_name_entry(name)?;
113 let mut variants = Vec::new();
114 match parse.name.simplify() {
115 NameFmt::Empty => (),
116 NameFmt::Single(n) => variants.push(n),
117 NameFmt::TwoPart(last, first) => {
118 variants.push(format!("{} {}", first, last));
119 variants.push(format!("{}, {}", last, first));
120 }
121 };
122
123 // create a version with the year
124 if let Some(y) = parse.year {
125 for i in 0..variants.len() {
126 variants.push(format!("{}, {}", variants[i], y));
127 }
128 }
129
130 let mut variants: Vec<String> = variants.iter().map(|s| clean_name(s)).collect();
131 variants.sort();
132 variants.dedup();
133
134 Ok(variants)
135}