bookdata/cleaning/
isbns.rs

1//! Code for cleaning up ISBNs.
2//!
3//! This module contains two families of functions:
4//!
5//! - The simple character-cleaning functions [clean_isbn_chars] and [clean_asin_chars].
6//! - The full multi-ISBN parser [parse_isbn_string].
7//!
8//! When a string is a relatively well-formed ISBN (or ASIN), the character-cleaning functions
9//! are fine.  Some sources, however (such as the Library of Congress) have messy ISBNs that
10//! may have multiple ISBNs in one string, descriptive tags, and all manner of other messes.
11//! The multi-ISBN parser exposed through [parse_isbn_string] supports cleaning these ISBN
12//! strings using a PEG-based parser.
13use lazy_static::lazy_static;
14use regex::RegexSet;
15
16use crate::util::unicode::NONSPACING_MARK;
17
/// Single ISBN parsed from a string.
///
/// Values of this type are built by the `isbn` rule of the ISBN grammar in this module.
/// The type is a plain value (two owned collections), so it derives the usual value
/// traits to let callers clone, compare, and hash parsed ISBNs.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ISBN {
    /// The ISBN text after character cleanup with [clean_isbn_chars].
    pub text: String,
    /// Tags found in bracketed or parenthesized groups after the ISBN, in source order.
    pub tags: Vec<String>,
}
24
/// Result of parsing an ISBN string.
///
/// This is the return type of [parse_isbn_string].
#[derive(Debug, PartialEq)]
pub enum ParseResult {
    /// Valid ISBNs, possibly with additional trailing text.
    ///
    /// The vector holds the ISBNs that were found (at least one); the string is the
    /// text left over after the last ISBN, and is empty when nothing remains.
    Valid(Vec<ISBN>, String),
    /// An ignored string: no ISBN was found, and the input matched one of the
    /// ignore patterns.  Carries the original input.
    Ignored(String),
    /// An unparsable string: no ISBN was found, and the input is not ignorable.
    /// Carries the original input.
    Unmatched(String),
}
35
/// Regular expressions for unparsable ISBN strings to ignore.
/// This cleans up warning displays.
///
/// The patterns are only consulted when no ISBN could be extracted from a string;
/// a string matching any one of them is reported as ignored rather than unmatched.
static IGNORES: &[&str] = &[
    // Optional `$`, then digits / periods / commas / spaces, an optional suffix of 1-4
    // letters or asterisks, and an optional parenthesized note (presumably prices).
    r"^[$]?[[:digit:]., ]+(?:[a-zA-Z*]{1,4})?(\s+\(.*?\))?$",
    // Two or more groups of digits and periods separated by slashes.
    r"^[[:digit:].]+(/[[:digit:].]+)+$",
    // Strings starting with `<letter>-<letter>-` and at least 8 digits (no end anchor).
    r"^[A-Z]-[A-Z]-\d{8,}",
    // Empty or whitespace-only strings.
    r"^\s*$",
];
44
45lazy_static! {
46    static ref IGNORE_RE: RegexSet = RegexSet::new(IGNORES).unwrap();
47}
48
// PEG grammar for messy ISBN strings.  The only public entry point is `parse_isbns`;
// all other rules are private building blocks.
peg::parser! {
    grammar isbn_parser() for str {
        // A single whitespace character (space, newline, carriage return, or tab).
        rule space() = quiet!{[' ' | '\n' | '\r' | '\t']}

        // Optional material before an ISBN: an optional `;` or `.`, any amount of
        // whitespace, and then an optional prefix.
        rule lead() =
        [';' | '.']? space()* prefix()?
        // Recognized prefixes: a single lowercase letter followed by whitespace, a
        // parenthesized number followed by whitespace, a `*`, or the literal `ISBN`
        // followed by whitespace.
        rule prefix()
        = ['a'..='z'] space()+
        / "(" ['0'..='9']+ ")" space()+
        / "*"
        / "ISBN" space()+

        // One tag: a run of characters other than `:`, `)`, and `]`, with surrounding
        // whitespace trimmed.
        rule single_tag() -> String = s:$([^ ':' | ')' | ']']+) { s.trim().to_string() }

        // A bracketed or parenthesized group of colon-separated tags, optionally preceded
        // by whitespace.  The opening and closing delimiters are not required to be of
        // the same kind.
        rule tags() -> Vec<String>
        = space()* ['[' | '('] tags:(single_tag() ** ":") [']' | ')'] { tags }

        // Separator material after an ISBN: any whitespace and at most one of `;:/.`.
        rule tail_skip() = space()* [';' | ':' | '/' | '.']?

        // digits, hyphens, and misc. marks are allowed (will clean later)
        // `O` (capital letter oh) is accepted here as well; cleanup turns it into `0`.
        rule digit_char() -> char
        = mc:['0'..='9' | '-' | 'O'] { mc }
        / mc:[c if NONSPACING_MARK.contains(c)] { mc }

        // some ISBNs have some random junk in the middle, match allowed junk
        // (either a run of ASCII letters, or a single space or `+`)
        rule inner_junk() = ['a'..='z' | 'A'..='Z']+ / [' ' | '+']

        // The text of one ISBN, cleaned with `clean_isbn_chars`.  Two forms are tried
        // in order:
        // 1. at least 8 digit characters, optionally followed by `X` or `x`;
        // 2. 1-5 digits, one piece of inner junk, then at least 4 digits or hyphens.
        rule isbn_text() -> String
        = s:$(digit_char()*<8,> ['X' | 'x']?) { clean_isbn_chars(s) }
        / s:$(['0'..='9']*<1,5> inner_junk() ['0'..='9' | '-']*<4,>) { clean_isbn_chars(s) }

        // One complete ISBN: optional lead, the ISBN text, and zero or more tag groups.
        // Tags from all groups are flattened into a single list.
        rule isbn() -> ISBN
        = lead() i:isbn_text() tags:tags()* { ISBN {
            text: i,
            tags: tags.into_iter().flatten().collect()
        }}

        // Entry point: zero or more ISBNs separated by `tail_skip`, one more `tail_skip`,
        // and then everything that remains, returned as the tail string.  Every piece
        // may match nothing, so the returned ISBN list can be empty.
        pub rule parse_isbns() -> (Vec<ISBN>, String)
        = v:(isbn() ** tail_skip()) tail_skip() t:$([_]*) { (v, t.into()) }
    }
}
90
/// Crude ISBN cleanup.
///
/// To be used with ISBN strings that are relatively well-formed, but may have invalid
/// characters.
///
/// Keeps ASCII digits and `X`, upper-cases `x` to `X`, turns the letter `O` into the
/// digit `0`, and drops every other character (including all non-ASCII characters).
/// The result therefore only ever contains ASCII digits and `X`.
pub fn clean_isbn_chars(isbn: &str) -> String {
    // The output can never be longer than the input, so one allocation suffices.
    let mut cleaned = String::with_capacity(isbn.len());
    cleaned.extend(isbn.chars().filter_map(|c| match c {
        '0'..='9' | 'X' => Some(c),
        'x' => Some('X'),
        'O' => Some('0'),
        _ => None,
    }));
    cleaned
}
113
/// Crude ASIN cleanup.
///
/// Keeps only the ASCII letters and digits of the input, converting letters to upper
/// case.  Every other character (punctuation, whitespace, non-ASCII) is dropped.
pub fn clean_asin_chars(isbn: &str) -> String {
    // The output can never be longer than the input, so one allocation suffices.
    let mut cleaned = String::with_capacity(isbn.len());
    cleaned.extend(
        isbn.chars()
            .filter(|c| c.is_ascii_alphanumeric())
            .map(|c| c.to_ascii_uppercase()),
    );
    cleaned
}
129
130/// Parse an ISBN string.
131pub fn parse_isbn_string(s: &str) -> ParseResult {
132    // let mut parser = self.create_parser(s);
133    // parser.read_all()
134    if let Ok((isbns, tail)) = isbn_parser::parse_isbns(s) {
135        if isbns.is_empty() {
136            if IGNORE_RE.is_match(s) {
137                ParseResult::Ignored(s.to_owned())
138            } else {
139                ParseResult::Unmatched(s.to_owned())
140            }
141        } else {
142            ParseResult::Valid(isbns, tail)
143        }
144    } else {
145        ParseResult::Unmatched(s.to_owned())
146    }
147}
148
// The empty string holds no ISBN and is classified as ignored.
#[test]
fn test_parse_empty() {
    let expected = ParseResult::Ignored(String::new());
    assert_eq!(parse_isbn_string(""), expected);
}
154
// A whitespace-only string is classified as ignored and returned unchanged.
#[test]
fn test_parse_ws() {
    let blank = "  ";
    let expected = ParseResult::Ignored(String::from(blank));
    assert_eq!(parse_isbn_string(blank), expected);
}
160
// A bare ten-character ISBN ending in `X` parses as one untagged ISBN with no tail.
#[test]
fn test_parse_isbn() {
    let isbn = "349224010X";

    let (found, rest) = match parse_isbn_string(isbn) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert!(first.tags.is_empty());
    assert!(rest.is_empty());
}
176
// A trailing " :" after the ISBN is consumed as separator material, not kept as tail.
#[test]
fn test_parse_isbn_trail() {
    let src = "349224010X :";
    let isbn = "349224010X";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert!(first.tags.is_empty());
    assert!(rest.is_empty());
}
193
// A combining mark inside the digits is accepted by the parser and removed by cleanup.
#[test]
fn test_parse_isbn_caron() {
    let src = "349̌224010X";
    let isbn = "349224010X";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert!(first.tags.is_empty());
    assert!(rest.is_empty());
}
210
// Hyphens between digit groups are accepted and stripped from the resulting text.
#[test]
fn test_parse_hyphen_isbn() {
    let src = "978-03-2948-9391";
    let isbn = "9780329489391";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert!(first.tags.is_empty());
    assert!(rest.is_empty());
}
227
// A single space inside the ISBN is treated as inner junk, yielding one ISBN.
#[test]
fn test_parse_space_isbn() {
    let src = "978 032948-9391";
    let isbn = "9780329489391";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert!(first.tags.is_empty());
    assert!(rest.is_empty());
}
244
// A parenthesized word after the number is captured as its single tag.
#[test]
fn test_parse_isbn_tag() {
    let src = "34922401038 (set)";
    let isbn = "34922401038";
    let tag = "set";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert_eq!(first.tags, [tag]);
    assert!(rest.is_empty());
}
262
// Square brackets delimit a tag just like parentheses do.
#[test]
fn test_parse_isbn_square_tag() {
    let src = "34922401038 [set]";
    let isbn = "34922401038";
    let tag = "set";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert_eq!(first.tags, [tag]);
    assert!(rest.is_empty());
}
280
// A colon inside one tag group splits it into several trimmed tags.
#[test]
fn test_parse_isbn_multi_tag_sep() {
    let src = "34922401038 (set : alk. paper)";
    let isbn = "34922401038";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert_eq!(first.tags, ["set", "alk. paper"]);
    assert!(rest.is_empty());
}
297
// Consecutive tag groups are merged into one tag list, in source order.
#[test]
fn test_parse_isbn_tags() {
    let src = "34922401038 (pbk.) (set)";
    let isbn = "34922401038";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert_eq!(first.tags, ["pbk.", "set"]);
    assert!(rest.is_empty());
}
314
// A single lowercase letter plus whitespace in front of the number is skipped as a prefix.
#[test]
fn test_parse_isbn_leader() {
    let src = "a 970238408138";
    let isbn = "970238408138";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 1);
    let first = &found[0];
    assert_eq!(first.text, isbn);
    assert!(first.tags.is_empty());
    assert!(rest.is_empty());
}
331
// Two numbers separated only by whitespace are returned as two ISBNs.
#[test]
fn test_parse_two_isbns_ws() {
    let src = "970238408138 30148100103";
    let isbn1 = "970238408138";
    let isbn2 = "30148100103";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 2);
    let (first, second) = (&found[0], &found[1]);
    assert_eq!(first.text, isbn1);
    assert!(first.tags.is_empty());
    assert_eq!(second.text, isbn2);
    assert!(second.tags.is_empty());
    assert!(rest.is_empty());
}
351
// A semicolon separator and an `ISBN` prefix before the second number are both skipped.
#[test]
fn test_parse_two_isbns_semi() {
    let src = "970238408138; ISBN 30148100103";
    let isbn1 = "970238408138";
    let isbn2 = "30148100103";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 2);
    let (first, second) = (&found[0], &found[1]);
    assert_eq!(first.text, isbn1);
    assert!(first.tags.is_empty());
    assert_eq!(second.text, isbn2);
    assert!(second.tags.is_empty());
    assert!(rest.is_empty());
}
371
// Period separator, `ISBN` prefix, and a tag on the second ISBN only.
#[test]
fn test_parse_two_isbns_real() {
    // Real example from record 2175696
    let src = "8719359022. ISBN 8719359004 (pbk.)";
    let isbn1 = "8719359022";
    let isbn2 = "8719359004";

    let (found, rest) = match parse_isbn_string(src) {
        ParseResult::Valid(found, rest) => (found, rest),
        other => panic!("bad parse: {:?}", other),
    };

    assert_eq!(found.len(), 2);
    let (first, second) = (&found[0], &found[1]);
    assert_eq!(first.text, isbn1);
    assert!(first.tags.is_empty());
    assert_eq!(second.text, isbn2);
    assert_eq!(second.tags, ["pbk."]);
    assert!(rest.is_empty());
}
392
// A stray letter inside the digits is tolerated as inner junk and removed by cleanup.
#[test]
pub fn test_parse_isbn_junk_colon() {
    let src = "95l3512401 :";
    let isbn = "953512401";

    match parse_isbn_string(src) {
        ParseResult::Valid(found, _) => {
            assert_eq!(found.len(), 1);
            assert_eq!(&found[0].text, isbn);
        }
        other => panic!("failed to parse {}: {:?}", src, other),
    }
}
405
// A leading capital letter `O` is accepted as a digit character and cleaned to `0`.
#[test]
pub fn test_parse_isbn_oh() {
    let src = "O882970208 (pbk.)";
    let isbn = "0882970208";

    match parse_isbn_string(src) {
        ParseResult::Valid(found, _) => {
            assert_eq!(found.len(), 1);
            assert_eq!(&found[0].text, isbn);
        }
        other => panic!("failed to parse {}: {:?}", src, other),
    }
}
418}