use std::borrow::Cow;
use std::cmp::max;
use std::fmt;
use std::fmt::Formatter;
use logos::{Logos, Span};
use partiql_source_map::line_offset_tracker::LineOffsetTracker;
use partiql_source_map::location::{ByteOffset, BytePosition, ToLocated};
use crate::error::{LexError, ParseError};
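/// A `(start, Tok, end)` triple: a token together with its start (inclusive) and end (exclusive) locations.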
pub type Spanned<Tok, Loc> = (Loc, Tok, Loc);
pub(crate) type SpannedResult<Tok, Loc, Broke> = Result<Spanned<Tok, Loc>, Spanned<Broke, Loc>>;
type CommentStringResult<'input> = SpannedResult<&'input str, ByteOffset, LexError<'input>>;
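/// Tokens used to scan the interior of a `/* ... */` block comment.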
#[derive(Logos, Debug, Clone, PartialEq, Eq)]
enum CommentToken {
#[error]
#[regex(r"[^/*\r\n\u0085\u2028\u2029]+", logos::skip)]
Any,
#[regex(r"(([\r])?[\n])|\u0085|\u2028|\u2029")]
Newline,
#[token("*/")]
End,
#[token("/*")]
Start,
}
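/// A lexer for block comments (delimited by `/*` and `*/`) that yields the comment text,
/// including the delimiters. Nested comments are honored only when `with_nesting` is enabled.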
struct CommentLexer<'input, 'tracker> {
lexer: logos::Lexer<'input, CommentToken>,
comment_nesting: bool,
tracker: &'tracker mut LineOffsetTracker,
}
impl<'input, 'tracker> CommentLexer<'input, 'tracker> {
#[inline]
pub fn new(input: &'input str, tracker: &'tracker mut LineOffsetTracker) -> Self {
CommentLexer {
lexer: CommentToken::lexer(input),
comment_nesting: false,
tracker,
}
}
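/// Enables nested block comments: every `/*` must then be balanced by a matching `*/`.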
#[inline]
fn with_nesting(mut self) -> Self {
self.comment_nesting = true;
self
}
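/// Scans a single block comment, recording newline offsets along the way.
/// Returns the comment text (including delimiters), an error if the comment is unterminated,
/// or an error if a stray `*/` is encountered outside a comment.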
fn next_internal(&mut self) -> Option<CommentStringResult<'input>> {
let Span { start, .. } = self.lexer.span();
let mut nesting = 0;
// With nesting enabled each `/*` increments the depth; otherwise the depth is clamped at 1.
let nesting_inc = i32::from(self.comment_nesting);
'comment: loop {
match self.lexer.next() {
Some(CommentToken::Any) => continue,
Some(CommentToken::Newline) => {
self.tracker.record(self.lexer.span().end.into());
}
Some(CommentToken::Start) => nesting = max(1, nesting + nesting_inc),
Some(CommentToken::End) => {
if nesting == 0 {
let Span { end, .. } = self.lexer.span();
return Some(Err((start.into(), LexError::Unknown, end.into())));
}
nesting -= 1;
if nesting == 0 {
break 'comment;
}
}
None => {
return if nesting != 0 {
let Span { end, .. } = self.lexer.span();
Some(Err((
start.into(),
LexError::UnterminatedComment,
end.into(),
)))
} else {
None
}
}
}
}
let Span { end, .. } = self.lexer.span();
let comment = &self.lexer.source()[start..end];
Some(Ok((start.into(), comment, end.into())))
}
}
impl<'input, 'tracker> Iterator for CommentLexer<'input, 'tracker> {
type Item = CommentStringResult<'input>;
#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
self.next_internal()
}
}
type EmbeddedIonStringResult<'input> = SpannedResult<&'input str, ByteOffset, LexError<'input>>;
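/// Tokens used to scan a backtick-delimited Ion literal: just enough Ion structure to find the
/// closing backtick without being fooled by backticks inside Ion strings, symbols, or comments.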
#[derive(Logos, Debug, Clone, PartialEq)]
enum EmbeddedIonToken {
#[error]
#[regex(r#"[^/*'"`\r\n\u0085\u2028\u2029]+"#, logos::skip)]
Any,
#[regex(r"(([\r])?[\n])|\u0085|\u2028|\u2029")]
Newline,
#[token("`")]
Embed,
#[regex(r"//[^\n]*")]
CommentLine,
#[token("/*")]
CommentBlock,
#[regex(r#""([^"\\]|\\t|\\u|\\")*""#)]
String,
#[regex(r#"'([^'\\]|\\t|\\u|\\')*'"#)]
Symbol,
#[token("'''")]
LongString,
}
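/// A lexer for Ion literals embedded in backticks (`` ` ``); yields the Ion text between the backticks.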
struct EmbeddedIonLexer<'input, 'tracker> {
lexer: logos::Lexer<'input, EmbeddedIonToken>,
tracker: &'tracker mut LineOffsetTracker,
}
impl<'input, 'tracker> EmbeddedIonLexer<'input, 'tracker> {
#[inline]
pub fn new(input: &'input str, tracker: &'tracker mut LineOffsetTracker) -> Self {
EmbeddedIonLexer {
lexer: EmbeddedIonToken::lexer(input),
tracker,
}
}
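/// Scans one backtick-delimited Ion literal. The returned text excludes the backticks, while the
/// returned span covers them; an unterminated literal yields [`LexError::UnterminatedIonLiteral`].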
fn next_internal(&mut self) -> Option<EmbeddedIonStringResult<'input>> {
let next_token = self.lexer.next();
match next_token {
Some(EmbeddedIonToken::Embed) => {
let Span { start, .. } = self.lexer.span();
'ion_value: loop {
let next_tok = self.lexer.next();
match next_tok {
Some(EmbeddedIonToken::Newline) => {
self.tracker.record(self.lexer.span().end.into());
}
Some(EmbeddedIonToken::Embed) => {
break 'ion_value;
}
Some(EmbeddedIonToken::CommentBlock) => {
let embed = self.lexer.span();
let remaining = &self.lexer.source()[embed.start..];
let mut comment_tracker = LineOffsetTracker::default();
let mut comment_lexer =
CommentLexer::new(remaining, &mut comment_tracker);
match comment_lexer.next_internal() {
Some(Ok((s, _c, e))) => {
self.tracker.append(&comment_tracker, embed.start.into());
// Advance this lexer past the comment text consumed by the sub-lexer.
self.lexer.bump((e - s).to_usize() - embed.len())
}
Some(Err((s, err, e))) => {
let offset: ByteOffset = embed.start.into();
return Some(Err((s + offset, err, e + offset)));
}
None => unreachable!(),
}
}
Some(EmbeddedIonToken::LongString) => {
'triple_quote: loop {
let next_tok = self.lexer.next();
match next_tok {
Some(EmbeddedIonToken::LongString) => break 'triple_quote,
Some(_) => (), // contents of the long string; keep consuming
None => continue 'ion_value, // unterminated long string; let the outer loop report the error
}
}
}
Some(_) => {
// any other token is part of the embedded Ion value; keep scanning
}
None => {
let Span { end, .. } = self.lexer.span();
return Some(Err((
start.into(),
LexError::UnterminatedIonLiteral,
end.into(),
)));
}
}
}
let Span { end, .. } = self.lexer.span();
let (str_start, str_end) = (start + 1, end - 1);
let ion_value = &self.lexer.source()[str_start..str_end];
Some(Ok((start.into(), ion_value, end.into())))
}
_ => None,
}
}
}
impl<'input, 'tracker> Iterator for EmbeddedIonLexer<'input, 'tracker> {
type Item = EmbeddedIonStringResult<'input>;
#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
self.next_internal()
}
}
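/// A lexer from PartiQL text to [`Token`]s, recording newline offsets into a [`LineOffsetTracker`].
///
/// A minimal usage sketch (mirroring the tests below):
/// ```ignore
/// let mut tracker = LineOffsetTracker::default();
/// let lexer = PartiqlLexer::new("SELECT g FROM data", &mut tracker);
/// let toks: Vec<_> = lexer.collect::<Result<_, _>>().expect("lex");
/// ```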
pub(crate) struct PartiqlLexer<'input, 'tracker> {
lexer: logos::Lexer<'input, Token<'input>>,
tracker: &'tracker mut LineOffsetTracker,
}
pub(crate) type InternalLexResult<'input> =
SpannedResult<Token<'input>, ByteOffset, LexError<'input>>;
pub(crate) type LexResult<'input> =
Result<Spanned<Token<'input>, ByteOffset>, ParseError<'input, BytePosition>>;
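/// Converts a spanned [`LexError`] into a [`ParseError::LexicalError`] located by byte position.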
impl<'input> From<Spanned<LexError<'input>, ByteOffset>> for ParseError<'input, BytePosition> {
fn from(res: Spanned<LexError<'input>, ByteOffset>) -> Self {
let (start, cause, end) = res;
ParseError::LexicalError(
cause.to_located(BytePosition::from(start)..BytePosition::from(end)),
)
}
}
impl<'input, 'tracker> PartiqlLexer<'input, 'tracker> {
#[inline]
pub fn new(input: &'input str, tracker: &'tracker mut LineOffsetTracker) -> Self {
PartiqlLexer {
lexer: Token::lexer(input),
tracker,
}
}
#[inline]
fn err_here(
&self,
err_ctor: fn(Cow<'input, str>) -> LexError<'input>,
) -> InternalLexResult<'input> {
let region = self.lexer.slice();
let Span { start, end } = self.lexer.span();
Err((start.into(), err_ctor(region.into()), end.into()))
}
pub fn slice(&self) -> &'input str {
self.lexer.slice()
}
#[inline(always)]
fn wrap(&mut self, token: Token<'input>) -> InternalLexResult<'input> {
let Span { start, end } = self.lexer.span();
Ok((start.into(), token, end.into()))
}
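/// Advances to the next token, skipping whitespace, recording newline offsets, and delegating
/// block comments and embedded Ion literals to their dedicated sub-lexers.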
#[inline]
pub(crate) fn next_internal(&mut self) -> Option<InternalLexResult<'input>> {
'next_tok: loop {
return match self.lexer.next() {
None => None,
Some(token) => match token {
Token::Error => Some(self.err_here(LexError::InvalidInput)),
Token::Newline => {
self.tracker.record(self.lexer.span().end.into());
continue 'next_tok;
}
Token::EmbeddedIonQuote => self.parse_embedded_ion(),
Token::CommentBlockStart => self.parse_block_comment(),
_ => Some(self.wrap(token)),
},
};
}
}
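/// Lexes a block comment starting at the current `/*` with a nested [`CommentLexer`], then
/// advances this lexer past the comment.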
fn parse_block_comment(&mut self) -> Option<InternalLexResult<'input>> {
let embed = self.lexer.span();
let remaining = &self.lexer.source()[embed.start..];
let mut comment_tracker = LineOffsetTracker::default();
let mut comment_lexer = CommentLexer::new(remaining, &mut comment_tracker).with_nesting();
comment_lexer.next_internal().map(|res| match res {
Ok((s, comment, e)) => {
let val_len = e - s;
let val_start = embed.start.into(); // the comment token's span starts at the opening `/*`
let val_end = val_start + val_len;
self.tracker.append(&comment_tracker, embed.start.into());
self.lexer.bump(val_len.to_usize() - embed.len());
Ok((val_start, Token::CommentBlock(comment), val_end))
}
Err((s, err, e)) => {
let offset: ByteOffset = embed.start.into();
Err((s + offset, err, e + offset))
}
})
}
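/// Lexes a backtick-embedded Ion literal starting at the current backtick with an
/// [`EmbeddedIonLexer`], then advances this lexer past the literal.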
fn parse_embedded_ion(&mut self) -> Option<InternalLexResult<'input>> {
let embed = self.lexer.span();
let remaining = &self.lexer.source()[embed.start..];
let mut ion_tracker = LineOffsetTracker::default();
let mut ion_lexer = EmbeddedIonLexer::new(remaining, &mut ion_tracker);
ion_lexer.next_internal().map(|res| match res {
Ok((s, ion, e)) => {
let val_len = e - s;
let val_start = embed.end.into(); // skip the opening backtick
let val_end = val_start + val_len - 2; // and exclude both backticks from the token's span
self.tracker.append(&ion_tracker, embed.start.into());
self.lexer.bump(val_len.to_usize() - embed.len());
Ok((val_start, Token::Ion(ion), val_end))
}
Err((s, err, e)) => {
let offset: ByteOffset = embed.start.into();
Err((s + offset, err, e + offset))
}
})
}
}
impl<'input, 'tracker> Iterator for PartiqlLexer<'input, 'tracker> {
type Item = LexResult<'input>;
#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
self.next_internal().map(|res| res.map_err(|e| e.into()))
}
}
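/// Tokens produced by the PartiQL lexer.
///
/// `CommentBlock` and `Ion` carry text produced by the dedicated sub-lexers above; all other
/// variants are matched directly by `logos`.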
#[derive(Logos, Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
pub enum Token<'input> {
#[error]
#[regex(r"[ \t\f]+", logos::skip)]
Error,
#[regex(r"([\r]?[\n])|\u{0085}|\u{2028}|\u{2029}")]
Newline,
#[regex(r"--[^\n]*", |lex| lex.slice())]
CommentLine(&'input str),
#[token("/*")]
CommentBlockStart,
CommentBlock(&'input str),
#[token("[")]
OpenSquare,
#[token("]")]
CloseSquare,
#[token("{")]
OpenCurly,
#[token("}")]
CloseCurly,
#[token("(")]
OpenParen,
#[token(")")]
CloseParen,
#[token("<<")]
OpenDblAngle,
#[token(">>")]
CloseDblAngle,
#[token(",")]
Comma,
#[token(";")]
Semicolon,
#[token(":")]
Colon,
#[token("==")]
EqualEqual,
#[token("!=")]
BangEqual,
#[token("<>")]
LessGreater,
#[token("<=")]
LessEqual,
#[token(">=")]
GreaterEqual,
#[token("=")]
Equal,
#[token("<")]
LessThan,
#[token(">")]
GreaterThan,
#[token("-")]
Minus,
#[token("+")]
Plus,
#[token("*")]
Star,
#[token("?")]
SqlParameter,
#[token("%")]
Percent,
#[token("/")]
Slash,
#[token("^")]
Caret,
#[token(".")]
Period,
#[token("||")]
DblPipe,
#[regex("[a-zA-Z_$][a-zA-Z0-9_$]*", |lex| lex.slice())]
UnquotedIdent(&'input str),
#[regex(r#""([^"\\]|\\t|\\u|\\n|\\")*""#,
|lex| lex.slice().trim_matches('"'))]
QuotedIdent(&'input str),
#[regex("@[a-zA-Z_$][a-zA-Z0-9_$]*", |lex| &lex.slice()[1..])]
UnquotedAtIdentifier(&'input str),
#[regex(r#"@"([^"\\]|\\t|\\u|\\n|\\")*""#,
|lex| lex.slice()[1..].trim_matches('"'))]
QuotedAtIdentifier(&'input str),
#[regex("[0-9]+", |lex| lex.slice())]
Int(&'input str),
#[regex("[0-9]+\\.[0-9]*([eE][-+]?[0-9]+)", |lex| lex.slice())]
#[regex("\\.[0-9]+([eE][-+]?[0-9]+)", |lex| lex.slice())]
#[regex("[0-9]+[eE][-+]?[0-9]+", |lex| lex.slice())]
ExpReal(&'input str),
#[regex("[0-9]+\\.[0-9]*", |lex| lex.slice())]
#[regex("\\.[0-9]+", |lex| lex.slice())]
Real(&'input str),
#[regex(r#"'([^'\\]|\\t|\\u|\\n|\\'|\\|(?:''))*'"#,
|lex| lex.slice().trim_matches('\''))]
String(&'input str),
#[token("`")]
EmbeddedIonQuote,
Ion(&'input str),
#[regex("(?i:All)")]
All,
#[regex("(?i:Asc)")]
Asc,
#[regex("(?i:And)")]
And,
#[regex("(?i:As)")]
As,
#[regex("(?i:At)")]
At,
#[regex("(?i:Between)")]
Between,
#[regex("(?i:By)")]
By,
#[regex("(?i:Case)")]
Case,
#[regex("(?i:Cross)")]
Cross,
#[regex("(?i:Cycle)")]
Cycle,
#[regex("(?i:Date)")]
Date,
#[regex("(?i:Desc)")]
Desc,
#[regex("(?i:Distinct)")]
Distinct,
#[regex("(?i:Else)")]
Else,
#[regex("(?i:End)")]
End,
#[regex("(?i:Escape)")]
Escape,
#[regex("(?i:Except)")]
Except,
#[regex("(?i:False)")]
False,
#[regex("(?i:First)")]
First,
#[regex("(?i:For)")]
For,
#[regex("(?i:Full)")]
Full,
#[regex("(?i:From)")]
From,
#[regex("(?i:Group)")]
Group,
#[regex("(?i:Having)")]
Having,
#[regex("(?i:In)")]
In,
#[regex("(?i:Inner)")]
Inner,
#[regex("(?i:Is)")]
Is,
#[regex("(?i:Intersect)")]
Intersect,
#[regex("(?i:Join)")]
Join,
#[regex("(?i:Last)")]
Last,
#[regex("(?i:Lateral)")]
Lateral,
#[regex("(?i:Left)")]
Left,
#[regex("(?i:Like)")]
Like,
#[regex("(?i:Limit)")]
Limit,
#[regex("(?i:Missing)")]
Missing,
#[regex("(?i:Natural)")]
Natural,
#[regex("(?i:Not)")]
Not,
#[regex("(?i:Null)")]
Null,
#[regex("(?i:Nulls)")]
Nulls,
#[regex("(?i:Offset)")]
Offset,
#[regex("(?i:On)")]
On,
#[regex("(?i:Or)")]
Or,
#[regex("(?i:Order)")]
Order,
#[regex("(?i:Outer)")]
Outer,
#[regex("(?i:Partial)")]
Partial,
#[regex("(?i:Pivot)")]
Pivot,
#[regex("(?i:Preserve)")]
Preserve,
#[regex("(?i:Right)")]
Right,
#[regex("(?i:Recursive)")]
Recursive,
#[regex("(?i:Select)")]
Select,
#[regex("(?i:Search)")]
Search,
#[regex("(?i:Table)")]
Table,
#[regex("(?i:Time)")]
Time,
#[regex("(?i:Timestamp)")]
Timestamp,
#[regex("(?i:Then)")]
Then,
#[regex("(?i:True)")]
True,
#[regex("(?i:Union)")]
Union,
#[regex("(?i:Unpivot)")]
Unpivot,
#[regex("(?i:Using)")]
Using,
#[regex("(?i:Value)")]
Value,
#[regex("(?i:Values)")]
Values,
#[regex("(?i:When)")]
When,
#[regex("(?i:Where)")]
Where,
#[regex("(?i:With)")]
With,
#[regex("(?i:Without)")]
Without,
#[regex("(?i:Zone)")]
Zone,
}
impl<'input> Token<'input> {
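/// Returns `true` if this token is a keyword.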
pub fn is_keyword(&self) -> bool {
matches!(
self,
Token::All
| Token::Asc
| Token::And
| Token::As
| Token::At
| Token::Between
| Token::By
| Token::Case
| Token::Cross
| Token::Cycle
| Token::Date
| Token::Desc
| Token::Distinct
| Token::Escape
| Token::Except
| Token::First
| Token::For
| Token::Full
| Token::From
| Token::Group
| Token::Having
| Token::In
| Token::Inner
| Token::Is
| Token::Intersect
| Token::Join
| Token::Last
| Token::Lateral
| Token::Left
| Token::Like
| Token::Limit
| Token::Missing
| Token::Natural
| Token::Not
| Token::Null
| Token::Nulls
| Token::Offset
| Token::On
| Token::Or
| Token::Order
| Token::Outer
| Token::Partial
| Token::Pivot
| Token::Preserve
| Token::Right
| Token::Recursive
| Token::Search
| Token::Select
| Token::Table
| Token::Time
| Token::Timestamp
| Token::Then
| Token::Union
| Token::Unpivot
| Token::Using
| Token::Value
| Token::Values
| Token::Where
| Token::With
)
}
}
impl<'input> fmt::Display for Token<'input> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Token::Error => write!(f, "<UNKNOWN>"),
Token::Newline => write!(f, "\\n"),
Token::CommentLine(_) => write!(f, "--"),
Token::CommentBlockStart => write!(f, "/*"),
Token::CommentBlock(_) => write!(f, "/**/"),
Token::OpenSquare => write!(f, "["),
Token::CloseSquare => write!(f, "]"),
Token::OpenCurly => write!(f, "{{"),
Token::CloseCurly => write!(f, "}}"),
Token::OpenParen => write!(f, "("),
Token::CloseParen => write!(f, ")"),
Token::OpenDblAngle => write!(f, "<<"),
Token::CloseDblAngle => write!(f, ">>"),
Token::Comma => write!(f, ","),
Token::Semicolon => write!(f, ";"),
Token::Colon => write!(f, ":"),
Token::EqualEqual => write!(f, "=="),
Token::BangEqual => write!(f, "!="),
Token::LessGreater => write!(f, "<>"),
Token::LessEqual => write!(f, "<="),
Token::GreaterEqual => write!(f, ">="),
Token::Equal => write!(f, "="),
Token::LessThan => write!(f, "<"),
Token::GreaterThan => write!(f, ">"),
Token::Minus => write!(f, "-"),
Token::Plus => write!(f, "+"),
Token::Star => write!(f, "*"),
Token::SqlParameter => write!(f, "?"),
Token::Percent => write!(f, "%"),
Token::Slash => write!(f, "/"),
Token::Caret => write!(f, "^"),
Token::Period => write!(f, "."),
Token::DblPipe => write!(f, "||"),
Token::UnquotedIdent(id) => write!(f, "<{id}:UNQUOTED_IDENT>"),
Token::QuotedIdent(id) => write!(f, "<{id}:QUOTED_IDENT>"),
Token::UnquotedAtIdentifier(id) => write!(f, "<{id}:UNQUOTED_ATIDENT>"),
Token::QuotedAtIdentifier(id) => write!(f, "<{id}:QUOTED_ATIDENT>"),
Token::Int(txt) => write!(f, "<{txt}:INT>"),
Token::ExpReal(txt) => write!(f, "<{txt}:REAL>"),
Token::Real(txt) => write!(f, "<{txt}:REAL>"),
Token::String(txt) => write!(f, "<{txt}:STRING>"),
Token::EmbeddedIonQuote => write!(f, "<ION>"),
Token::Ion(txt) => write!(f, "<{txt}:ION>"),
Token::All
| Token::Asc
| Token::And
| Token::As
| Token::At
| Token::Between
| Token::By
| Token::Case
| Token::Cross
| Token::Cycle
| Token::Date
| Token::Desc
| Token::Distinct
| Token::Else
| Token::End
| Token::Escape
| Token::Except
| Token::False
| Token::First
| Token::For
| Token::Full
| Token::From
| Token::Group
| Token::Having
| Token::In
| Token::Inner
| Token::Is
| Token::Intersect
| Token::Join
| Token::Last
| Token::Lateral
| Token::Left
| Token::Like
| Token::Limit
| Token::Missing
| Token::Natural
| Token::Not
| Token::Null
| Token::Nulls
| Token::Offset
| Token::On
| Token::Or
| Token::Order
| Token::Outer
| Token::Partial
| Token::Pivot
| Token::Preserve
| Token::Right
| Token::Recursive
| Token::Search
| Token::Select
| Token::Table
| Token::Time
| Token::Timestamp
| Token::Then
| Token::True
| Token::Union
| Token::Unpivot
| Token::Using
| Token::Value
| Token::Values
| Token::When
| Token::Where
| Token::With
| Token::Without
| Token::Zone => {
write!(f, "{}", format!("{self:?}").to_uppercase())
}
}
}
}
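/// A lexer that wraps another lexer and skips over [`Token::CommentLine`] and [`Token::CommentBlock`] tokens.
///
/// A minimal usage sketch (wrapping a [`PartiqlLexer`]):
/// ```ignore
/// let mut tracker = LineOffsetTracker::default();
/// let skipping = CommentSkippingLexer::new(PartiqlLexer::new("SELECT /*c*/ g", &mut tracker));
/// let toks: Vec<_> = skipping.collect::<Result<_, _>>().expect("lex"); // no comment tokens
/// ```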
pub(crate) struct CommentSkippingLexer<'input, L>
where
L: Iterator<Item = LexResult<'input>>,
{
lexer: L,
}
impl<'input, L> CommentSkippingLexer<'input, L>
where
L: Iterator<Item = LexResult<'input>>,
{
#[inline]
pub fn new(lexer: L) -> Self {
Self { lexer }
}
}
impl<'input, L> Iterator for CommentSkippingLexer<'input, L>
where
L: Iterator<Item = LexResult<'input>>,
{
type Item = LexResult<'input>;
#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
'next_tok: loop {
let next = self.lexer.next();
if matches!(
next,
Some(Ok((_, Token::CommentBlock(_) | Token::CommentLine(_), _)))
) {
continue 'next_tok;
}
return next;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use partiql_source_map::line_offset_tracker::{LineOffsetError, LineOffsetTracker};
use partiql_source_map::location::{
CharOffset, LineAndCharPosition, LineAndColumn, LineOffset, Located, Location,
};
use itertools::Itertools;
#[test]
fn display() -> Result<(), ParseError<'static, BytePosition>> {
let symbols =
"( [ { } ] ) << >> ; , < > <= >= != <> = == - + * ? % / ^ . || : --foo /*block*/";
let primitives = r#"unquoted_ident "quoted_ident" @unquoted_atident @"quoted_atident""#;
let keywords =
"WiTH Where Value uSiNg Unpivot UNION True Select right Preserve pivoT Outer Order Or \
On Offset Nulls Null Not Natural Missing Limit Like Left Lateral Last Join \
Intersect Is Inner In Having Group From For Full First False Except Escape Desc \
Cross Table Time Timestamp Date By Between At As And Asc All Values Case When Then Else End";
let symbols = symbols.split(' ').chain(primitives.split(' '));
let keywords = keywords.split(' ');
let text = symbols.interleave(keywords).join("\n");
let s = text.as_str();
let mut offset_tracker = LineOffsetTracker::default();
let lexer = PartiqlLexer::new(s, &mut offset_tracker);
let toks: Vec<_> = lexer.collect::<Result<_, _>>().unwrap();
#[rustfmt::skip]
let expected = vec![
"(", "WITH", "[", "WHERE", "{", "VALUE", "}", "USING", "]", "UNPIVOT", ")", "UNION",
"<<", "TRUE", ">>", "SELECT", ";", "RIGHT", ",", "PRESERVE", "<", "PIVOT", ">", "OUTER",
"<=", "ORDER", ">=", "OR", "!=", "ON", "<>", "OFFSET", "=", "NULLS", "==", "NULL", "-",
"NOT", "+", "NATURAL", "*", "MISSING", "?", "LIMIT", "%", "LIKE", "/", "LEFT", "^",
"LATERAL", ".", "LAST", "||", "JOIN", ":", "INTERSECT", "--", "IS", "/**/", "INNER",
"<unquoted_ident:UNQUOTED_IDENT>", "IN", "<quoted_ident:QUOTED_IDENT>", "HAVING",
"<unquoted_atident:UNQUOTED_ATIDENT>", "GROUP", "<quoted_atident:QUOTED_ATIDENT>",
"FROM", "FOR", "FULL", "FIRST", "FALSE", "EXCEPT", "ESCAPE", "DESC", "CROSS", "TABLE",
"TIME", "TIMESTAMP", "DATE", "BY", "BETWEEN", "AT", "AS", "AND", "ASC", "ALL", "VALUES",
"CASE", "WHEN", "THEN", "ELSE", "END"
];
let displayed = toks
.into_iter()
.map(|(_s, t, _e)| t.to_string())
.collect::<Vec<_>>();
assert_eq!(expected, displayed);
Ok(())
}
#[test]
fn ion_simple() {
let ion_value = r#" `{'input':1, 'b':1}`--comment "#;
let mut offset_tracker = LineOffsetTracker::default();
let ion_lexer = EmbeddedIonLexer::new(ion_value.trim(), &mut offset_tracker);
assert_eq!(ion_lexer.into_iter().count(), 1);
assert_eq!(offset_tracker.num_lines(), 1);
let mut offset_tracker = LineOffsetTracker::default();
let mut lexer = PartiqlLexer::new(ion_value, &mut offset_tracker);
let tok = lexer.next().unwrap().unwrap();
assert!(
matches!(tok, (ByteOffset(5), Token::Ion(ion_str), ByteOffset(24)) if ion_str == "{'input':1,  'b':1}")
);
let tok = lexer.next().unwrap().unwrap();
assert!(
matches!(tok, (ByteOffset(25), Token::CommentLine(cmt_str), ByteOffset(35)) if cmt_str == "--comment ")
);
}
#[test]
fn ion() {
let ion_value = r#" `{'input' // comment ' "
:1, /*
comment
*/
'b':1}` "#;
let mut offset_tracker = LineOffsetTracker::default();
let ion_lexer = EmbeddedIonLexer::new(ion_value.trim(), &mut offset_tracker);
assert_eq!(ion_lexer.into_iter().count(), 1);
assert_eq!(offset_tracker.num_lines(), 5);
let mut offset_tracker = LineOffsetTracker::default();
let mut lexer = PartiqlLexer::new(ion_value, &mut offset_tracker);
let tok = lexer.next().unwrap().unwrap();
assert!(
matches!(tok, (ByteOffset(2), Token::Ion(ion_str), ByteOffset(158)) if ion_str == ion_value.trim().trim_matches('`'))
);
assert_eq!(offset_tracker.num_lines(), 5);
}
#[test]
fn nested_comments() {
let comments = r#"/*
/* / * * * /
/* ' " ''' `
*/ text
*/ 1 2 3 4 5 6,7,8,9 10.112^5
*/"#;
let mut offset_tracker = LineOffsetTracker::default();
let nested_lex = CommentLexer::new(comments, &mut offset_tracker).with_nesting();
assert_eq!(nested_lex.into_iter().count(), 1);
assert_eq!(offset_tracker.num_lines(), 6);
let mut offset_tracker = LineOffsetTracker::default();
let nonnested_lex = CommentLexer::new(comments, &mut offset_tracker);
let toks: Result<Vec<_>, Spanned<LexError, ByteOffset>> = nonnested_lex.collect();
assert!(toks.is_err());
let error = toks.unwrap_err();
assert!(matches!(
error,
(ByteOffset(142), LexError::Unknown, ByteOffset(189))
));
assert_eq!(error.1.to_string(), "Lexing error: unknown error");
}
#[test]
fn select() -> Result<(), ParseError<'static, BytePosition>> {
let query = r#"SELECT g
FROM "data"
GROUP BY a"#;
let mut offset_tracker = LineOffsetTracker::default();
let lexer = PartiqlLexer::new(query, &mut offset_tracker);
let toks: Vec<_> = lexer.collect::<Result<_, _>>()?;
let mut pre_offset_tracker = LineOffsetTracker::default();
let pre_lexer = PartiqlLexer::new(query, &mut pre_offset_tracker);
let pre_toks: Vec<_> = pre_lexer.collect::<Result<_, _>>()?;
let expected_toks = vec![
Token::Select,
Token::UnquotedIdent("g"),
Token::From,
Token::QuotedIdent("data"),
Token::Group,
Token::By,
Token::UnquotedIdent("a"),
];
assert_eq!(
expected_toks,
toks.into_iter().map(|(_s, t, _e)| t).collect::<Vec<_>>()
);
assert_eq!(
expected_toks,
pre_toks
.into_iter()
.map(|(_s, t, _e)| t)
.collect::<Vec<_>>()
);
assert_eq!(offset_tracker.num_lines(), 3);
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, 0.into()).unwrap()),
LineAndColumn::new(1, 1).unwrap()
);
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, 1.into()).unwrap()),
LineAndColumn::new(1, 2).unwrap()
);
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, 9.into()).unwrap()),
LineAndColumn::new(2, 1).unwrap()
);
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, 19.into()).unwrap()),
LineAndColumn::new(2, 11).unwrap()
);
let offset_r_a = query.rfind('a').unwrap();
let offset_r_n = query.rfind('\n').unwrap();
assert_eq!(
LineAndColumn::from(
offset_tracker
.at(query, BytePosition::from(query.len() - 1))
.unwrap()
),
LineAndColumn::new(3, offset_r_a - offset_r_n).unwrap()
);
Ok(())
}
#[test]
fn select_unicode() -> Result<(), ParseError<'static, BytePosition>> {
let query = "\u{2028}SELECT \"🐈\"\r\nFROM \"❤\u{211D}\"\u{2029}\u{0085}GROUP BY \"🧸\"";
let mut offset_tracker = LineOffsetTracker::default();
let lexer = PartiqlLexer::new(query, &mut offset_tracker);
let toks: Vec<_> = lexer.collect::<Result<_, _>>()?;
assert_eq!(
vec![
Token::Select,
Token::QuotedIdent("🐈"),
Token::From,
Token::QuotedIdent("❤ℝ"),
Token::Group,
Token::By,
Token::QuotedIdent("🧸")
],
toks.into_iter().map(|(_s, t, _e)| t).collect::<Vec<_>>()
);
assert_eq!(offset_tracker.num_lines(), 5);
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, 0.into()).unwrap()),
LineAndColumn::new(1, 1).unwrap()
);
let offset_s = query.find('S').unwrap();
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, offset_s.into()).unwrap()),
LineAndColumn::new(2, 1).unwrap()
);
let offset_f = query.find('F').unwrap();
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, offset_f.into()).unwrap()),
LineAndColumn::new(3, 1).unwrap()
);
let offset_g = query.find('G').unwrap();
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, offset_g.into()).unwrap()),
LineAndColumn::new(5, 1).unwrap()
);
Ok(())
}
#[test]
fn offset_overflow() {
let query = "\u{2028}SELECT \"🐈\"\r\nFROM \"❤\u{211D}\"\u{2029}\u{0085}GROUP BY x";
let mut offset_tracker = LineOffsetTracker::default();
let lexer = PartiqlLexer::new(query, &mut offset_tracker);
lexer.count();
let last = offset_tracker.at(query, ByteOffset(query.len() as u32).into());
assert!(matches!(
last,
Ok(LineAndCharPosition {
line: LineOffset(4),
char: CharOffset(10)
})
));
let overflow = offset_tracker.at(query, ByteOffset(1 + query.len() as u32).into());
assert!(matches!(overflow, Err(LineOffsetError::EndOfInput)));
}
#[test]
fn offset_into_codepoint() {
let query = "\u{2028}SELECT \"🐈\"\r\nFROM \"❤\u{211D}\"\u{2029}\u{0085}GROUP BY \"🧸\"";
let mut offset_tracker = LineOffsetTracker::default();
let lexer = PartiqlLexer::new(query, &mut offset_tracker);
lexer.count();
assert_eq!(
offset_tracker.at(query, ByteOffset(1).into()),
Err(LineOffsetError::InsideUnicodeCodepoint)
);
}
#[test]
fn select_comment_line() -> Result<(), ParseError<'static, BytePosition>> {
let query = "SELECT --comment\n@g from @\"foo\"";
let mut offset_tracker = LineOffsetTracker::default();
let lexer = PartiqlLexer::new(query, &mut offset_tracker);
let toks: Vec<_> = lexer.collect::<Result<_, _>>()?;
assert_eq!(
vec![
Token::Select,
Token::CommentLine("--comment"),
Token::UnquotedAtIdentifier("g"),
Token::From,
Token::QuotedAtIdentifier("foo"),
],
toks.into_iter().map(|(_s, t, _e)| t).collect::<Vec<_>>()
);
assert_eq!(offset_tracker.num_lines(), 2);
Ok(())
}
#[test]
fn select_comment_block() -> Result<(), ParseError<'static, BytePosition>> {
let query = "SELECT /*comment*/ g";
let mut offset_tracker = LineOffsetTracker::default();
let lexer = PartiqlLexer::new(query, &mut offset_tracker);
let toks: Vec<_> = lexer.collect::<Result<_, _>>()?;
assert_eq!(
vec![
Token::Select,
Token::CommentBlock("/*comment*/"),
Token::UnquotedIdent("g"),
],
toks.into_iter().map(|(_s, t, _e)| t).collect::<Vec<_>>()
);
assert_eq!(offset_tracker.num_lines(), 1);
Ok(())
}
#[test]
fn select_non_reserved_keywords() -> Result<(), ParseError<'static, BytePosition>> {
let query =
"SELECT acyclic, BoTh, DOMAIN, SiMpLe, Trail, leading, TRailing, USER\nfrom @\"foo\"";
let mut offset_tracker = LineOffsetTracker::default();
let lexer = PartiqlLexer::new(query, &mut offset_tracker);
let toks: Vec<_> = lexer.collect::<Result<_, _>>()?;
assert_eq!(
vec![
Token::Select,
Token::UnquotedIdent("acyclic"),
Token::Comma,
Token::UnquotedIdent("BoTh"),
Token::Comma,
Token::UnquotedIdent("DOMAIN"),
Token::Comma,
Token::UnquotedIdent("SiMpLe"),
Token::Comma,
Token::UnquotedIdent("Trail"),
Token::Comma,
Token::UnquotedIdent("leading"),
Token::Comma,
Token::UnquotedIdent("TRailing"),
Token::Comma,
Token::UnquotedIdent("USER"),
Token::From,
Token::QuotedAtIdentifier("foo"),
],
toks.into_iter().map(|(_s, t, _e)| t).collect::<Vec<_>>()
);
assert_eq!(offset_tracker.num_lines(), 2);
Ok(())
}
#[test]
fn err_invalid_input() {
let query = "SELECT # FROM data GROUP BY a";
let mut offset_tracker = LineOffsetTracker::default();
let toks: Result<Vec<_>, _> = PartiqlLexer::new(query, &mut offset_tracker).collect();
assert!(toks.is_err());
let error = toks.unwrap_err();
assert_eq!(
error.to_string(),
r##"Lexing error: invalid input `#` at `(b7..b8)`"##
);
assert!(matches!(error,
ParseError::LexicalError(Located {
inner: LexError::InvalidInput(s),
location: Location{start: BytePosition(ByteOffset(7)), end: BytePosition(ByteOffset(8))}
}) if s == "#"));
assert_eq!(offset_tracker.num_lines(), 1);
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, 7.into()).unwrap()),
LineAndColumn::new(1, 8).unwrap()
);
}
#[test]
fn err_unterminated_ion() {
let query = r#" ` "fooo` "#;
let mut offset_tracker = LineOffsetTracker::default();
let toks: Result<Vec<_>, _> = PartiqlLexer::new(query, &mut offset_tracker).collect();
assert!(toks.is_err());
let error = toks.unwrap_err();
assert!(matches!(
error,
ParseError::LexicalError(Located {
inner: LexError::UnterminatedIonLiteral,
location: Location {
start: BytePosition(ByteOffset(1)),
end: BytePosition(ByteOffset(10))
}
})
));
assert_eq!(
error.to_string(),
"Lexing error: unterminated ion literal at `(b1..b10)`"
);
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, BytePosition::from(1)).unwrap()),
LineAndColumn::new(1, 2).unwrap()
);
}
#[test]
fn err_unterminated_comment() {
let query = r#" /*12345678"#;
let mut offset_tracker = LineOffsetTracker::default();
let toks: Result<Vec<_>, _> = PartiqlLexer::new(query, &mut offset_tracker).collect();
assert!(toks.is_err());
let error = toks.unwrap_err();
assert!(matches!(
error,
ParseError::LexicalError(Located {
inner: LexError::UnterminatedComment,
location: Location {
start: BytePosition(ByteOffset(1)),
end: BytePosition(ByteOffset(11))
}
})
));
assert_eq!(
error.to_string(),
"Lexing error: unterminated comment at `(b1..b11)`"
);
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, BytePosition::from(1)).unwrap()),
LineAndColumn::new(1, 2).unwrap()
);
}
#[test]
fn err_unterminated_ion_comment() {
let query = r#" `/*12345678`"#;
let mut offset_tracker = LineOffsetTracker::default();
let ion_lexer = EmbeddedIonLexer::new(query, &mut offset_tracker);
let toks: Result<Vec<_>, Spanned<LexError, ByteOffset>> = ion_lexer.collect();
assert!(toks.is_err());
let error = toks.unwrap_err();
assert!(matches!(
error,
(ByteOffset(2), LexError::UnterminatedComment, ByteOffset(13))
));
assert_eq!(error.1.to_string(), "Lexing error: unterminated comment");
assert_eq!(
LineAndColumn::from(offset_tracker.at(query, BytePosition::from(2)).unwrap()),
LineAndColumn::new(1, 3).unwrap()
);
}
}