1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
// Parsing functions that are common to textual types
use crate::text::parse_result::{
fatal_parse_error, IonParseError, IonParseResult, OrFatalParseError, UpgradeIResult,
};
use nom::branch::alt;
use nom::bytes::streaming::tag;
use nom::character::streaming::{char, satisfy};
use nom::combinator::{map, recognize, value};
use nom::sequence::{preceded, tuple};
use nom::{AsChar, IResult, Parser};
use std::str;
/// The text Ion types each need to be able to read strings that contain escaped characters.
/// This type represents the possible kinds of substring that make up any given piece of text from
/// the parser's perspective: escaped characters that need to be replaced, escaped characters that
/// need to be discarded, and unescaped substrings that are valid as-is.
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum StringFragment<'a> {
/// A substring that contains no escaped characters and which is valid as-is. It borrows
/// directly from the input text rather than copying it.
Substring(&'a str),
/// An escaped character (like '\n' or '\xFF') that maps to a substitute character. The
/// payload is the substitute character, not the escape sequence itself.
EscapedChar(char),
/// An escaped newline, which can be ignored altogether.
EscapedNewline,
}
/// Consumes a leading backslash (`\`) from `input` and then runs `parser` on the text after it.
///
/// * If `input` does not begin with a backslash, this is simply "not an escape": the mismatch
///   from the backslash parser is returned (non-fatal) so the caller can try something else.
/// * If `parser` reports `Incomplete`, that is passed through unchanged so more input can be
///   supplied.
/// * If `parser` fails in any other way, the backslash was not followed by a valid escape and a
///   fatal error is returned. Its message contains `label`, a human-readable description of
///   what was expected.
pub(crate) fn escape_and_then<'a, 'b, P>(
    input: &'a str,
    label: &'b str,
    mut parser: P,
) -> IonParseResult<'a, StringFragment<'a>>
where
    P: Parser<&'a str, StringFragment<'a>, IonParseError<'a>>,
{
    // No backslash means no escape; `?` returns the mismatch to the caller.
    let (after_backslash, _) = char('\\')(input).upgrade()?;

    parser.parse(after_backslash).or_else(|failure| {
        // Running out of input is not a syntax error; let the caller fetch more data.
        if matches!(failure, nom::Err::Incomplete(_)) {
            return Err(failure);
        }
        // Anything else means the text after the backslash is not a legal escape.
        fatal_parse_error(
            after_backslash,
            format!("could not parse {label}: {failure}"),
        )
    })
}
/// Matches an escaped newline, returning [StringFragment::EscapedNewline].
pub(crate) fn escaped_newline(input: &str) -> IonParseResult<StringFragment> {
value(
StringFragment::EscapedNewline,
alt((tag("\\\n"), tag("\\\r\n"), tag("\\\r"))),
)(input)
.upgrade()
}
/// Matches a backslash followed by either a hex/Unicode escape body (`x`, `u`, or `U` plus hex
/// digits) or a single-character escape literal (like `n`), and returns the substitute
/// character wrapped in [StringFragment::EscapedChar].
///
/// The backslash handling and error behavior are those of [escape_and_then].
pub(crate) fn escaped_char(input: &str) -> IonParseResult<StringFragment> {
    // Hex/Unicode escapes are attempted before the single-character literals.
    let substitute_char = alt((escaped_char_unicode, escaped_char_literal));
    escape_and_then(
        input,
        "an escaped character (Unicode, hex, or literal)",
        map(substitute_char, StringFragment::EscapedChar),
    )
}
/// Matches a backslash followed by either a 2-digit hex escape body (`xHH`) or a
/// single-character escape literal (like `n`), and returns the substitute character wrapped in
/// [StringFragment::EscapedChar]. The `u` and `U` Unicode escapes are NOT accepted here.
///
/// The backslash handling and error behavior are those of [escape_and_then].
pub(crate) fn escaped_char_no_unicode(input: &str) -> IonParseResult<StringFragment> {
    // The hex escape is attempted before the single-character literals.
    let substitute_char = alt((escaped_hex_char, escaped_char_literal));
    escape_and_then(
        input,
        "an escaped character (hex or literal)",
        map(substitute_char, StringFragment::EscapedChar),
    )
}
/// Matches the single character that follows the backslash in a literal escape (the backslash
/// itself is not matched here) and returns the character that the escape stands for.
/// See: <https://amazon-ion.github.io/ion-docs/docs/spec.html#escapes>
pub(crate) fn escaped_char_literal(input: &str) -> IonParseResult<char> {
    // Builds a parser that recognizes `escape_code` and yields `substitute` in its place.
    let literal = |escape_code: char, substitute: char| value(substitute, char(escape_code));

    alt((
        literal('n', '\n'),
        literal('r', '\r'),
        literal('t', '\t'),
        literal('\\', '\\'),
        literal('/', '/'),
        literal('"', '"'),
        literal('\'', '\''),
        literal('?', '?'),
        literal('0', '\u{00}'), // NUL
        literal('a', '\u{07}'), // alert BEL
        literal('b', '\u{08}'), // backspace
        literal('v', '\u{0B}'), // vertical tab
        literal('f', '\u{0C}'), // form feed
    ))(input)
    .upgrade()
}
/// Matches the body of a 2-digit hex escape (`xHH`; the backslash is not matched here) and
/// returns the character whose code point is given by the two hex digits.
///
/// If the input is not an `x` followed by two hex digits, the mismatch is returned without
/// being escalated so the caller can try another alternative. Once the digits have been
/// matched, any failure to turn them into a `char` is a fatal error (see
/// [decode_hex_digits_to_char]).
pub(crate) fn escaped_hex_char(input: &str) -> IonParseResult<char> {
    escaped_char_2_digit_hex(input)
        .upgrade()
        .and_then(|(after_escape, digits)| decode_hex_digits_to_char(after_escape, digits))
}
/// Returns `true` if the provided code point lies in the utf-16 surrogate range
/// (`0xD800..=0xDFFF`).
///
/// Note that, despite the name, the range tested here spans every surrogate: the high
/// (leading) surrogates `0xD800..=0xDBFF` as well as the low (trailing) surrogates
/// `0xDC00..=0xDFFF`. None of these values can be stored in a Rust `char` on its own.
///
/// Terse primer: Unicode text is a stream of unsigned integers called 'code points'. What a
/// person thinks of as one 'character' (for example: 'a', '本', or '🥸') may be one or more code
/// points. A code point needs at most 21 bits, and each encoding wraps those bits differently:
/// * utf-8 uses 1 to 4 bytes per code point; some bits of each byte carry the value and the
///   rest signal whether more bytes follow.
/// * utf-16 uses 2 or 4 bytes per code point. A 4-byte code point is written as a high surrogate
///   followed by a low surrogate, each 2 bytes long.
/// * utf-32 always uses 32 bits per code point, which is wasteful but simple to process.
///
/// The parser uses this check to recognize an escape whose value is only half of a utf-16 pair,
/// so that it knows a second escape must follow immediately.
///
/// Further reading:
/// * <https://doc.rust-lang.org/std/primitive.char.html>
/// * <https://www.unicode.org/glossary/#surrogate_code_point>
fn code_point_is_a_high_surrogate(value: u32) -> bool {
    matches!(value, 0xD800..=0xDFFF)
}
/// Matches a Unicode escape (starting with '\x', '\u', or '\U'), returning the appropriate
/// substitute character. If the value represented by the escape is a utf-16 high surrogate,
/// another Unicode escape will be matched from input to produce a Unicode scalar.
pub(crate) fn escaped_char_unicode(input: &str) -> IonParseResult<char> {
// First, try to match the input to a Unicode escape sequence. If successful, extract the hex
// digits that were included in the sequence. If matching fails, this isn't an escape sequence.
// Return early with a non-fatal error.
let (remaining_input, hex_digits) = alt((
escaped_char_2_digit_hex,
escaped_char_unicode_4_digit_hex,
escaped_char_unicode_8_digit_hex,
))(input)
.upgrade()?;
// We matched on a sequence of hex digits of some length; convert it to a `u32`.
let (_, number_value) = u32::from_str_radix(hex_digits, 16)
.or_fatal_parse_error(hex_digits, "could not parse escape hex sequence")?;
// Check to see if this is a high surrogate; if it is, our code point isn't complete. Another
// unicode escape representing the low surrogate has to be next in the input to complete it.
// See the docs for this helper function for details. (Note: this will only ever be true for
// 4- and 8-digit escape sequences. `\x` escapes don't have enough digits to represent a
// high surrogate.)
if code_point_is_a_high_surrogate(number_value) {
// It's a high surrogate. It needs to be followed by a low surrogate to complete the
// codepoint.
return complete_surrogate_pair(input, remaining_input, hex_digits, number_value);
}
// A Rust `char` can represent any Unicode scalar value--a code point that is not part of a
// surrogate pair. If the value we found isn't a high surrogate, then it's a complete scalar
// value. We can safely convert it to a `char`.
let character = char::from_u32(number_value).unwrap();
Ok((remaining_input, character))
}
/// Rust's [`char`](prim@char) type represents any Unicode code point EXCEPT a surrogate. If we
/// encounter a high surrogate in the stream, we cannot convert it to a `char` yet. We must find the
/// (mandatory) low surrogate that follows it in the stream; then we can combine the high and low
/// surrogates into a complete code point. This code point can then be returned as a Rust
/// [`char`](prim@char).
fn complete_surrogate_pair<'a>(
input: &'a str,
input_after_high_surrogate: &'a str,
high_surrogate_hex_digits: &'a str,
high_surrogate_number_value: u32,
) -> IonParseResult<'a, char> {
let (_, (input_after_low_surrogate, low_surrogate_hex_digits)) =
// Look for a `\` followed by a \uXXXX or \UXXXXXXXX escape
preceded(
char('\\'),
alt((
escaped_char_unicode_4_digit_hex,
escaped_char_unicode_8_digit_hex,
)),
)(input_after_high_surrogate)
.or_fatal_parse_error(
input_after_high_surrogate,
"encountered an incomplete surrogate pair",
)?;
// Convert the second set of hex digits to a `u32`.
let low_surrogate_number_value = u32::from_str_radix(low_surrogate_hex_digits, 16)
.or_fatal_parse_error(
high_surrogate_hex_digits,
"could not parse escape hex sequence for trailing surrogate",
)?
.1;
// Convert our pair of surrogate number values into u16s so we can feed them into the utf-16
// decoder. We know the first surrogate number value will fit in a u16 because we checked
// its range above, so we can safely unwrap it.
let high_surrogate: u16 = u16::try_from(high_surrogate_number_value).unwrap();
let low_surrogate: u16 = u16::try_from(low_surrogate_number_value)
.or_fatal_parse_error(
low_surrogate_hex_digits,
"trailing surrogate number value did not fit in a u16",
)?
.1;
let character = char::decode_utf16([high_surrogate, low_surrogate])
.next()
.unwrap() // We provided enough data to produce either a char or an Err
.or_fatal_parse_error(input, "encountered invalid surrogate pair")?
.1;
Ok((input_after_low_surrogate, character))
}
/// Interprets `hex_digits` as a base-16 code point and converts it to a `char`.
///
/// On success, returns `remaining_input` untouched alongside the decoded character.
///
/// Both failure modes (digits that do not parse as a `u32`, or a number that is not a valid
/// `char`) are reported as fatal errors located at `hex_digits`. By the time this is called an
/// escape has already been recognized, so the Ion data is malformed and parsing must stop. A
/// recoverable error would let the parser carry on and treat the text as a string literal
/// without escapes, which would be incorrect.
pub(crate) fn decode_hex_digits_to_char<'a>(
    remaining_input: &'a str,
    hex_digits: &'a str,
) -> IonParseResult<'a, char> {
    // Pair the parsed number with its (possibly absent) `char` so that a single `match` can
    // cover every outcome.
    let decoded = u32::from_str_radix(hex_digits, 16)
        .map(|number_value| (number_value, std::char::from_u32(number_value)));

    match decoded {
        Ok((_, Some(char_value))) => Ok((remaining_input, char_value)),
        Ok((number_value, None)) => fatal_parse_error(
            hex_digits,
            format!("escape value (decimal:'{number_value}') is not a valid character"),
        ),
        Err(parse_int_error) => fatal_parse_error(
            hex_digits,
            format!("could not parse escaped code unit: {parse_int_error}"),
        ),
    }
}
/// Matches the body of a 2-digit hex escape: an `x` followed by exactly two hex digits.
///
/// Returns the two hex digits as a string slice. The `x` is consumed but not included, the
/// leading backslash is not matched here, and no conversion to a character takes place.
pub(crate) fn escaped_char_2_digit_hex(input: &str) -> IResult<&str, &str> {
    let two_hex_digits = recognize(tuple((single_hex_digit, single_hex_digit)));
    preceded(char('x'), two_hex_digits)(input)
}
/// Matches the body of a 4-digit Unicode escape: a `u` followed by exactly four hex digits.
///
/// Returns the four hex digits as a string slice. The `u` is consumed but not included, the
/// leading backslash is not matched here, and no conversion to a character takes place.
pub(crate) fn escaped_char_unicode_4_digit_hex(input: &str) -> IResult<&str, &str> {
    let four_hex_digits = recognize(tuple((
        single_hex_digit,
        single_hex_digit,
        single_hex_digit,
        single_hex_digit,
    )));
    preceded(char('u'), four_hex_digits)(input)
}
/// Matches the body of an 8-digit Unicode escape: a `U` followed by exactly eight hex digits.
///
/// Returns the eight hex digits as a string slice. The `U` is consumed but not included, the
/// leading backslash is not matched here, and no conversion to a character takes place.
pub(crate) fn escaped_char_unicode_8_digit_hex(input: &str) -> IResult<&str, &str> {
    let eight_hex_digits = recognize(tuple((
        single_hex_digit,
        single_hex_digit,
        single_hex_digit,
        single_hex_digit,
        single_hex_digit,
        single_hex_digit,
        single_hex_digit,
        single_hex_digit,
    )));
    preceded(char('U'), eight_hex_digits)(input)
}
/// Matches and returns a single base-16 digit.
pub(crate) fn single_hex_digit(input: &str) -> IResult<&str, char> {
satisfy(<char as AsChar>::is_hex_digit)(input)
}
/// Matches a `\r` or `\r\n` and returns a StringFragment::EscapedChar('\n').
pub(crate) fn normalized_newline(input: &str) -> IonParseResult<StringFragment> {
// In a long string, \r and \r\n are both normalized to `\n`
value(
// Return a newline...
StringFragment::EscapedChar('\n'),
// ...if the input is one of the following:
alt((tag("\r\n"), tag("\r"))),
)(input)
.upgrade()
}
/// If `byte_index` is zero, returns an `Err` signaling that the input was not matched. Otherwise,
/// splits the text at `byte_index` and returns a match on the head with the tail as remaining
/// input.
///
/// This is used by the string and clob parsers to detect non-empty long-string-formatted text
/// fragments. (e.g. '''hello''' ''' world!''')
pub(crate) fn string_fragment_or_mismatch(
input: &str,
byte_index: usize,
) -> IonParseResult<StringFragment> {
if byte_index == 0 {
return Err(nom::Err::Error(IonParseError::new(input)));
}
Ok((
&input[byte_index..],
StringFragment::Substring(&input[0..byte_index]),
))
}