src/tokenizer.rs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// https://drafts.csswg.org/css-syntax/#tokenization

use self::Token::*;
use crate::cow_rc_str::CowRcStr;
use crate::parser::ParserState;
use std::char;
use std::ops::Range;

#[cfg(not(feature = "dummy_match_byte"))]
use cssparser_macros::match_byte;

// Fallback definition of `match_byte!`, compiled only when the
// `dummy_match_byte` feature is enabled (otherwise the macro is imported from
// `cssparser_macros`). It expands to a plain `match` on the given value and
// passes the arm tokens through unchanged.
#[cfg(feature = "dummy_match_byte")]
macro_rules! match_byte {
    ($value:expr, $($rest:tt)* ) => {
        match $value {
            $(
                $rest
            )+
        }
    };
}

/// One of the pieces the CSS input is broken into.
///
/// Some components use `Cow` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers.  Note that `url( <string-token> )` is represented by a
    /// `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed &str makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}

impl Token<'_> {
    /// Tell whether this token stands for a parse error.
    ///
    /// Two families of tokens qualify:
    ///
    /// * `BadUrl` and `BadString`, which are errors detected by the tokenizer itself;
    /// * `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket`, which are
    ///   *unmatched* (and hence errors) when returned by one of the `Parser::next*` methods.
    pub fn is_parse_error(&self) -> bool {
        match self {
            // Tokenizer-level errors.
            BadUrl(_) | BadString(_) => true,
            // Unmatched closing delimiters.
            CloseParenthesis | CloseSquareBracket | CloseCurlyBracket => true,
            _ => false,
        }
    }
}

/// Cursor over a CSS input string, together with the line bookkeeping and the
/// source-URL directives gathered from comments while tokenizing.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete input string being tokenized.
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line; but adjusted to
    /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
    current_line_start_position: usize,
    /// Number of the current line, starting at 0; incremented by `consume_newline`.
    current_line_number: u32,
    /// Whether a `var()` or `env()` function name has been reported through
    /// `see_function` since `look_for_var_or_env_functions` was called.
    var_or_env_functions: SeenStatus,
    /// URL taken from the last `sourceMappingURL` comment directive seen, if any.
    source_map_url: Option<&'a str>,
    /// URL taken from the last `sourceURL` comment directive seen, if any.
    source_url: Option<&'a str>,
}

/// State of the tokenizer's search for `var()` / `env()` functions.
#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
    /// Not tracking; function names passed to `see_function` are ignored.
    DontCare,
    /// Tracking was requested and no `var`/`env` function has been seen yet.
    LookingForThem,
    /// Tracking was requested and at least one `var`/`env` function was seen.
    SeenAtLeastOne,
}

impl<'a> Tokenizer<'a> {
    /// Create a tokenizer positioned at the very beginning of `input`.
    ///
    /// The line counter starts at 0, `var()`/`env()` tracking is off, and no
    /// source-map or source URL is recorded yet.
    #[inline]
    pub fn new(input: &str) -> Tokenizer {
        // Both the cursor and the start of the first line sit at byte 0.
        let origin = 0;
        Tokenizer {
            input,
            position: origin,
            current_line_start_position: origin,
            current_line_number: 0,
            var_or_env_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    /// Start tracking `var()` / `env()` functions: from now on `see_function`
    /// records whether one of them is encountered.
    #[inline]
    pub fn look_for_var_or_env_functions(&mut self) {
        self.var_or_env_functions = SeenStatus::LookingForThem;
    }

    /// Report whether a `var()` or `env()` function was seen since tracking
    /// was requested, and turn tracking off again.
    #[inline]
    pub fn seen_var_or_env_functions(&mut self) -> bool {
        // Swap the status back to "don't care" and inspect what was there.
        let previous = std::mem::replace(&mut self.var_or_env_functions, SeenStatus::DontCare);
        previous == SeenStatus::SeenAtLeastOne
    }

    /// Notify the tokenizer that a function called `name` was encountered.
    ///
    /// Has an effect only while tracking is active: a name equal to `var` or
    /// `env` (ASCII case-insensitively) marks the status as "seen".
    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if self.var_or_env_functions != SeenStatus::LookingForThem {
            return;
        }
        let is_var_or_env = name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env");
        if is_var_or_env {
            self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
        }
    }

    /// Consume and return the next token by delegating to `next_token`.
    ///
    /// Returns `Err(())` when the end of the input has been reached.
    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    /// Return the current byte offset into the input as a `SourcePosition`.
    ///
    /// In debug builds, asserts that the offset lies on a UTF-8 character boundary.
    #[inline]
    pub fn position(&self) -> SourcePosition {
        debug_assert!(self.input.is_char_boundary(self.position));
        SourcePosition(self.position)
    }

    /// Return the line and column of the current position.
    ///
    /// The column is 1-based and derived from the distance between the
    /// current position and the (adjusted) start of the current line.
    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        let offset_in_line = self.position - self.current_line_start_position;
        let column = (offset_in_line + 1) as u32;
        SourceLocation {
            line: self.current_line_number,
            column,
        }
    }

    /// Return the URL recorded from a `sourceMappingURL` comment directive,
    /// or `None` if no such directive has been seen.
    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    /// Return the URL recorded from a `sourceURL` comment directive,
    /// or `None` if no such directive has been seen.
    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    /// Snapshot the position and line bookkeeping so that the tokenizer can
    /// later be rewound with `reset`. The snapshot's `at_start_of` is `None`.
    #[inline]
    pub fn state(&self) -> ParserState {
        let position = self.position;
        let current_line_start_position = self.current_line_start_position;
        let current_line_number = self.current_line_number;
        ParserState {
            position,
            current_line_start_position,
            current_line_number,
            at_start_of: None,
        }
    }

    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    /// Return the input between `start_pos` and the current position.
    #[inline]
    pub(crate) fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        self.slice(start_pos..self.position())
    }

    /// Return the part of the input covered by `range`.
    ///
    /// Both ends of the range must be in bounds and on UTF-8 character
    /// boundaries. The boundary requirement is only asserted in debug builds;
    /// the slicing itself is unchecked.
    #[inline]
    pub(crate) fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        debug_assert!(self.input.is_char_boundary(range.start.0));
        debug_assert!(self.input.is_char_boundary(range.end.0));
        // SAFETY: relies on the caller passing positions that are within the
        // input and on character boundaries (see the debug assertions above).
        unsafe { self.input.get_unchecked(range.start.0..range.end.0) }
    }

    /// Return the full text of the line containing the current position,
    /// without its terminating line break.
    ///
    /// A line break is any of `\r`, `\n`, or form feed (`\x0C`).
    pub fn current_source_line(&self) -> &'a str {
        const LINE_BREAKS: [char; 3] = ['\r', '\n', '\x0C'];

        let here = self.position();
        let before = self.slice(SourcePosition(0)..here);
        let after = self.slice(here..SourcePosition(self.input.len()));

        // The line starts just past the last line break before the cursor...
        let line_start = match before.rfind(LINE_BREAKS) {
            Some(index) => index + 1,
            None => 0,
        };
        // ...and ends at the first line break at or after it.
        let line_end = match after.find(LINE_BREAKS) {
            Some(index) => here.0 + index,
            None => self.input.len(),
        };
        self.slice(SourcePosition(line_start)..SourcePosition(line_end))
    }

    /// Peek at the byte under the cursor without consuming it.
    ///
    /// Returns `None` at the end of the input.
    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        self.input.as_bytes().get(self.position).copied()
    }

    // True once the cursor has reached the end of the input.
    // If false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        self.position >= self.input.len()
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.char_at(n)` will not panic.
    // `has_at_least(0)` therefore means "not at end of input".
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    // Advance over N bytes in the input.  This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    //
    // These restrictions exist because this function does not touch the
    // line/column bookkeeping. They are only verified in debug builds.
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected.
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Return the byte under the cursor.
    // Assumes non-EOF: the underlying index panics at the end of the input.
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    // Return the byte `offset` bytes past the cursor.
    // Panics if that index is past the end of the input.
    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        // Wrapping subtraction is used because the line start may be 0 here.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount.  Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        // Moving the line start forward by one cancels this byte out of the
        // column computed in `current_source_location`.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    // Advance over one byte of any kind except a newline, keeping the
    // UTF-16 column bookkeeping up to date.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;

        let is_4byte_leader = byte & 0xF0 == 0xF0;
        let is_continuation = byte & 0xC0 == 0x80;
        let line_start = self.current_line_start_position;
        self.current_line_start_position = if is_4byte_leader {
            // A 4-byte sequence needs two UTF-16 code units, so the column
            // would otherwise be undercounted.
            line_start.wrapping_sub(1)
        } else if is_continuation {
            // Continuation bytes would otherwise be overcounted. Wrapping
            // add is required because of the 4-byte leader case above.
            line_start.wrapping_add(1)
        } else {
            line_start
        };
    }

    // Decode and return the character under the cursor without consuming it.
    // Panics (via `unwrap`) when the cursor is at the end of the input.
    #[inline]
    fn next_char(&self) -> char {
        // SAFETY: relies on the current position being within the input and on
        // a character boundary (`position()` asserts the latter in debug builds).
        unsafe { self.input.get_unchecked(self.position().0..) }
            .chars()
            .next()
            .unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state.
    //
    // A `\r\n` pair is consumed as a single newline. Afterwards the current
    // line starts at the new position and the line number is incremented.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        // Treat CR LF as one line break.
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    // Whether the byte `offset` bytes past the cursor exists and is a
    // newline character (`\n`, `\r`, or form feed).
    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        matches!(
            self.input.as_bytes().get(self.position + offset),
            Some(b'\n' | b'\r' | b'\x0C')
        )
    }

    // Consume and return the character under the cursor, adjusting the line
    // start so that columns keep being counted in UTF-16 code units.
    #[inline]
    fn consume_char(&mut self) -> char {
        let ch = self.next_char();
        let (utf8_len, utf16_len) = (ch.len_utf8(), ch.len_utf16());
        self.position += utf8_len;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        let column_correction = utf8_len - utf16_len;
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(column_correction);
        ch
    }

    // Whether the remaining input (from the cursor on) begins with `needle`.
    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    /// Advance past any run of whitespace and comments.
    ///
    /// Stops at the first byte that is neither whitespace nor the start of a
    /// `/* ... */` comment, or at the end of the input.
    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    // Only `/*` opens a comment; a lone `/` ends the skip.
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => return,
            }
        }
    }

    /// Advance past any run of whitespace, comments, `<!--` (CDO) and `-->` (CDC).
    ///
    /// Stops at the first byte that starts none of these, or at the end of
    /// the input.
    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    // Only `/*` opens a comment; a lone `/` ends the skip.
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}

/// A position from the start of the input, counted in UTF-8 bytes.
///
/// Values are ordered by their byte offset.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourcePosition);

impl SourcePosition {
    /// Returns the byte index in the original input that this position
    /// refers to.
    #[inline]
    pub fn byte_index(&self) -> usize {
        self.0
    }
}

/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy, Default)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}

#[cfg(feature = "malloc_size_of")]
malloc_size_of::malloc_size_of_is_0!(SourceLocation);

// Consume and return the next token, dispatching on the first byte of the
// remaining input.
//
// Returns `Err(())` at the end of the input. Bytes that start no other token
// become `Delim` tokens; non-ASCII bytes start an identifier-like token.
fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
        b'"' => consume_string(tokenizer, false),
        b'#' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
            else if !tokenizer.is_eof() &&
                matches!(tokenizer.next_byte_unchecked(), b'0'..=b'9' | b'-') {
                // Any other valid case here already resulted in IDHash.
                Hash(consume_name(tokenizer))
            }
            else { Delim('#') }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim('$') }
        },
        b'\'' => consume_string(tokenizer, true),
        b'(' => { tokenizer.advance(1); ParenthesisBlock },
        b')' => { tokenizer.advance(1); CloseParenthesis },
        b'*' => {
            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim('*') }
        },
        b'+' => {
            // `+` starts a number when followed by a digit, or by `.` and a digit.
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => { tokenizer.advance(1); Comma },
        b'-' => {
            // In order: a signed number, the `-->` CDC marker, an
            // identifier-like token, or a plain `-` delimiter.
            if (
                tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit()
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && tokenizer.byte_at(2).is_ascii_digit()
            ) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            // `.` starts a number only when directly followed by a digit.
            if tokenizer.has_at_least(1)
                && tokenizer.byte_at(1).is_ascii_digit() {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => consume_numeric(tokenizer),
        b':' => { tokenizer.advance(1); Colon },
        b';' => { tokenizer.advance(1); Semicolon },
        b'<' => {
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },
        b'@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim('@') }
        },
        b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => consume_ident_like(tokenizer),
        b'[' => { tokenizer.advance(1); SquareBracketBlock },
        b'\\' => {
            // A backslash followed by a newline is not an escape.
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim('\\') }
        },
        b']' => { tokenizer.advance(1); CloseSquareBracket },
        b'^' => {
            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim('^') }
        },
        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
        b'|' => {
            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
            else { tokenizer.advance(1); Delim('|') }
        },
        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
        b'~' => {
            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim('~') }
        },
        _ => {
            // Non-ASCII bytes start an identifier; any remaining ASCII byte
            // is a single-character delimiter.
            if !b.is_ascii() {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(b as char)
            }
        },
    };
    Ok(token)
}

// Consume a run of whitespace and return it as a `WhiteSpace` token that
// borrows the consumed text from the input.
//
// `newline` tells whether the first byte (already known to be whitespace)
// is a newline character, which needs line bookkeeping.
fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
    let start_position = tokenizer.position();
    if newline {
        tokenizer.consume_newline();
    } else {
        tokenizer.advance(1);
    }
    // Keep going while the input continues with spaces, tabs or newlines.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b' ' | b'\t' => {
                tokenizer.advance(1);
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            _ => {
                break
            }
        }
    }
    WhiteSpace(tokenizer.slice_from(start_position))
}

// Inspect the contents of a comment for a `sourceMappingURL` or `sourceURL`
// directive and, when one is present, record its URL on the tokenizer.
//
// Both the `#` spelling and the older `@` spelling are recognised. The URL is
// whatever follows the directive, up to the first whitespace character.
fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
    // Everything up to the first whitespace character of `text`.
    fn leading_field(text: &str) -> Option<&str> {
        text.split([' ', '\t', '\x0C', '\r', '\n']).next()
    }

    // Source map directive.
    let after_map_directive = contents
        .strip_prefix("# sourceMappingURL=")
        .or_else(|| contents.strip_prefix("@ sourceMappingURL="));
    if let Some(rest) = after_map_directive {
        tokenizer.source_map_url = leading_field(rest);
    }

    // Source URL directive.
    let after_url_directive = contents
        .strip_prefix("# sourceURL=")
        .or_else(|| contents.strip_prefix("@ sourceURL="));
    if let Some(rest) = after_url_directive {
        tokenizer.source_url = leading_field(rest);
    }
}

// Consume a `/* ... */` comment and return its contents, without the
// `/*` and `*/` markers. The cursor must be at the opening `/*`.
//
// An unterminated comment extends to the end of the input. In both cases the
// contents are checked for source map / source URL directives.
fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
    tokenizer.advance(2); // consume "/*"
    let start_position = tokenizer.position();
    while !tokenizer.is_eof() {
        match_byte! { tokenizer.next_byte_unchecked(),
            b'*' => {
                // Remember where the contents would end if this `*` turns
                // out to be the start of the closing `*/`.
                let end_position = tokenizer.position();
                tokenizer.advance(1);
                if tokenizer.next_byte() == Some(b'/') {
                    tokenizer.advance(1);
                    let contents = tokenizer.slice(start_position..end_position);
                    check_for_source_map(tokenizer, contents);
                    return contents
                }
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }
    // End of input reached without a closing `*/`.
    let contents = tokenizer.slice_from(start_position);
    check_for_source_map(tokenizer, contents);
    contents
}

fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
    match consume_quoted_string(tokenizer, single_quote) {
        Ok(value) => QuotedString(value),
        Err(value) => BadString(value),
    }
}

/// Consume a quoted string; the cursor must be at the opening quote, and
/// `single_quote` tells which quote character delimits the string.
///
/// Returns `Ok` with the string's value when it is closed by the matching
/// quote or runs to the end of the input. Returns `Err` with the value
/// accumulated so far on a syntax error (i.e. an unescaped newline); the
/// newline itself is not consumed.
///
/// The value borrows from the input as long as no escape or NUL byte is
/// encountered; otherwise it is built up in an owned buffer.
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote
    // start_pos is at code point boundary, after " or '
    let start_pos = tokenizer.position();
    let mut string_bytes;
    // Fast path: scan without copying until something requires rewriting.
    loop {
        if tokenizer.is_eof() {
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // * The tokenizer's input is UTF-8 since it's `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
                //
                // So `string_bytes` is well-formed UTF-8.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }

    // Slow path: the value differs from the source text, so it is
    // accumulated byte by byte in `string_bytes`.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    // string_bytes is well-formed UTF-8, see other comments.
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // Escaped newline
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        // This pushes one well-formed code point
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                // else: escaped EOF, do nothing.
                continue;
            }
            b'\0' => {
                // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            },
        }

        // If this byte is part of a multi-byte code point,
        // we'll end up copying the whole code point before this loop does something else.
        string_bytes.push(b);
    }

    Ok(
        // string_bytes is well-formed UTF-8, see other comments.
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}

// Whether the bytes at the cursor would start an identifier.
//
// An identifier starts with:
// * an ASCII letter, `_`, NUL, or any non-ASCII byte;
// * a backslash that is not followed by a newline (a valid escape);
// * `-` followed by `-` or by any of the above.
//
// Returns false at the end of the input. The cursor is not moved.
#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => true,
            b'-' => {
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    // The backslash is at offset 1 here, so the byte that
                    // decides whether it forms a valid escape is at offset 2.
                    b'\\' => !tokenizer.has_newline_at(2),
                    b => !b.is_ascii(),
                }
            },
            // The backslash is at offset 0; the escaped byte is at offset 1.
            b'\\' => !tokenizer.has_newline_at(1),
            b => !b.is_ascii(),
        }
}

// Consume a name and classify what it introduces:
// * not followed by `(`: an `Ident` token;
// * `url(`: whatever `consume_unquoted_url` yields, falling back to a
//   `Function` token;
// * any other name followed by `(`: a `Function` token, after reporting the
//   name to `see_function`.
fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    let name = consume_name(tokenizer);

    if tokenizer.next_byte() != Some(b'(') {
        return Ident(name);
    }
    tokenizer.advance(1);

    if !name.eq_ignore_ascii_case("url") {
        tokenizer.see_function(&name);
        return Function(name);
    }
    consume_unquoted_url(tokenizer).unwrap_or(Function(name))
}

/// Consumes a name starting at the current position and returns its value.
///
/// As long as the name contains neither a backslash nor a NUL byte, the
/// result is a slice of the input. Once one of those is seen, the part read
/// so far is copied into an owned buffer and the rest is appended to it, with
/// escapes resolved and each NUL replaced by U+FFFD.
fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
    // The name begins where the previous token ended, hence on a code point boundary.
    let name_start = tokenizer.position();

    // Phase 1: scan while the name can still be returned as a plain slice.
    let mut unescaped: Vec<u8> = loop {
        if tokenizer.is_eof() {
            return tokenizer.slice_from(name_start).into();
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => tokenizer.advance(1),
            b'\\' | b'\0' => {
                // The input is a `&str`, so it is UTF-8. Both `name_start` and
                // the current position (just before the '\\' or '\0') are code
                // point boundaries, so the copied prefix is well-formed UTF-8.
                break tokenizer.slice_from(name_start).as_bytes().to_owned()
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _other => {
                return tokenizer.slice_from(name_start).into();
            }
        }
    };

    // Phase 2: keep appending to the owned buffer until the name ends.
    while !tokenizer.is_eof() {
        let byte = tokenizer.next_byte_unchecked();
        match_byte! { byte,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-'  => {
                // Plain ASCII name byte.
                unescaped.push(byte);
                tokenizer.advance(1)
            }
            b'\\' => {
                // A backslash followed by a newline is not an escape: the name ends here.
                if tokenizer.has_newline_at(1) { break }
                tokenizer.advance(1);
                // Appends exactly one well-formed code point.
                consume_escape_and_write(tokenizer, &mut unescaped)
            }
            b'\0' => {
                tokenizer.advance(1);
                unescaped.extend_from_slice("\u{FFFD}".as_bytes());
            },
            b'\x80'..=b'\xBF' => {
                // Continuation byte of a multi-byte code point; the remaining
                // bytes of that code point are copied by the following iterations.
                unescaped.push(byte);
                tokenizer.consume_continuation_byte();
            }
            b'\xC0'..=b'\xEF' => {
                // Leading byte of a multi-byte code point; its continuation
                // bytes are copied by the following iterations.
                unescaped.push(byte);
                tokenizer.advance(1);
            }
            b'\xF0'..=b'\xFF' => {
                unescaped.push(byte);
                tokenizer.consume_4byte_intro();
            }
            _ => {
                // Any other ASCII byte terminates the name.
                break;
            }
        }
    }

    // `unescaped` is well-formed UTF-8: it started as a slice of the input cut
    // at code point boundaries and only whole code points were appended.
    unsafe { from_utf8_release_unchecked(unescaped) }.into()
}

/// Maps an ASCII hexadecimal digit (upper or lower case) to its numeric
/// value, or returns `None` for any other byte.
fn byte_to_hex_digit(b: u8) -> Option<u32> {
    let digit = match_byte! { b,
        b'0'..=b'9' => b - b'0',
        b'a'..=b'f' => 10 + (b - b'a'),
        b'A'..=b'F' => 10 + (b - b'A'),
        _ => {
            return None
        }
    };
    Some(u32::from(digit))
}

/// Maps an ASCII decimal digit to its numeric value, or returns `None` for
/// any other byte.
fn byte_to_decimal_digit(b: u8) -> Option<u32> {
    match b {
        b'0'..=b'9' => Some(u32::from(b - b'0')),
        _ => None,
    }
}

/// Consumes a numeric token at the current position.
///
/// The accepted shape is `[+-]?\d*(\.\d+)?([eE][+-]?\d+)?`. Callers only
/// invoke this when `\d*(\.\d+)?` contains at least one digit. The result is
/// a `Percentage` when the number is followed by `%`, a `Dimension` when it
/// is followed by something that starts an identifier, and a `Number`
/// otherwise.
fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    /// Consumes a run of ASCII decimal digits and returns its value.
    ///
    /// The first byte is read without an EOF check, so the tokenizer must not
    /// be at EOF on entry.
    fn consume_digit_run(tokenizer: &mut Tokenizer<'_>) -> f64 {
        let mut total: f64 = 0.;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            total = total * 10. + digit as f64;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
        total
    }

    // Everything is computed in f64 so that huge literals overflow to +/-inf
    // and both i32::MIN and i32::MAX are exactly representable.

    // Optional leading sign.
    let leading = tokenizer.next_byte_unchecked();
    let has_sign = matches!(leading, b'+' | b'-');
    let sign: f64 = if leading == b'-' { -1. } else { 1. };
    if has_sign {
        tokenizer.advance(1);
    }

    let integral_part = consume_digit_run(tokenizer);

    let mut is_integer = true;

    // Fractional part: only when a '.' is directly followed by a digit.
    let mut fractional_part: f64 = 0.;
    let at_fraction = tokenizer.has_at_least(1)
        && tokenizer.next_byte_unchecked() == b'.'
        && tokenizer.byte_at(1).is_ascii_digit();
    if at_fraction {
        is_integer = false;
        tokenizer.advance(1); // Consume '.'
        let mut scale = 0.1;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            fractional_part += digit as f64 * scale;
            scale *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    // Exponent: 'e' or 'E' followed by a digit, or by a sign and then a digit.
    let at_exponent = tokenizer.has_at_least(1)
        && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E')
        && (tokenizer.byte_at(1).is_ascii_digit()
            || (tokenizer.has_at_least(2)
                && matches!(tokenizer.byte_at(1), b'+' | b'-')
                && tokenizer.byte_at(2).is_ascii_digit()));
    if at_exponent {
        is_integer = false;
        tokenizer.advance(1); // Consume 'e' / 'E'
        let exponent_sign: f64 = match tokenizer.next_byte_unchecked() {
            b'-' => {
                tokenizer.advance(1);
                -1.
            }
            b'+' => {
                tokenizer.advance(1);
                1.
            }
            _ => 1.,
        };
        let exponent = consume_digit_run(tokenizer);
        value *= 10f64.powf(exponent_sign * exponent);
    }

    // Integer literals additionally carry their value clamped to the i32 range.
    let int_value = if !is_integer {
        None
    } else if value >= i32::MAX as f64 {
        Some(i32::MAX)
    } else if value <= i32::MIN as f64 {
        Some(i32::MIN)
    } else {
        Some(value as i32)
    };

    let at_percent = !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%';
    if at_percent {
        tokenizer.advance(1);
        return Percentage {
            unit_value: (value / 100.) as f32,
            int_value,
            has_sign,
        };
    }

    let value = value as f32;
    if !is_ident_start(tokenizer) {
        return Number {
            value,
            int_value,
            has_sign,
        };
    }
    Dimension {
        value,
        int_value,
        has_sign,
        unit: consume_name(tokenizer),
    }
}

/// Converts a byte vector into a `String`, validating it only in builds with
/// debug assertions enabled.
///
/// With debug assertions the bytes are checked and invalid UTF-8 panics;
/// without them the bytes are trusted as-is.
///
/// # Safety
///
/// `string_bytes` must be well-formed UTF-8.
#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    if !cfg!(debug_assertions) {
        return String::from_utf8_unchecked(string_bytes);
    }
    String::from_utf8(string_bytes).unwrap()
}

/// Consumes the contents of an unquoted `url(` … `)` token.
///
/// Leading whitespace is skipped first. If the first non-whitespace byte is a
/// quote, `Err(())` is returned and neither the tokenizer position nor its
/// line bookkeeping has been modified. Otherwise the result is `Ok` holding
/// an `UnquotedUrl` or a `BadUrl` token.
fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    // This is only called after "url(", so the current position is a code point boundary.
    let start_position = tokenizer.position;
    let from_start = &tokenizer.input[tokenizer.position..];
    // Number of newlines seen in the leading whitespace, and the offset
    // (relative to `start_position`) of the most recent one.
    let mut newlines = 0;
    let mut last_newline = 0;
    let mut found_printable_char = false;
    let mut iter = from_start.bytes().enumerate();
    // Skip leading whitespace without touching `tokenizer.position` until we
    // know how the URL starts.
    loop {
        let (offset, b) = match iter.next() {
            Some(item) => item,
            None => {
                // Only whitespace up to EOF: consume everything.
                tokenizer.position = tokenizer.input.len();
                break;
            }
        };
        match_byte! { b,
            b' ' | b'\t' => {},
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            b'\r' => {
                // A "\r\n" pair is a single newline; it is counted when the
                // '\n' is reached on the next iteration.
                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                    newlines += 1;
                    last_newline = offset;
                }
            }
            b'"' | b'\'' => return Err(()),  // Do not advance
            b')' => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset + 1;
                break
            }
            _ => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                // The position is left on this byte, which is the first byte of the URL value.
                tokenizer.position += offset;
                found_printable_char = true;
                break
            }
        }
    }

    // Commit the line bookkeeping for the newlines skipped above.
    if newlines > 0 {
        tokenizer.current_line_number += newlines;
        // No need for wrapping_add here, because there's no possible
        // way to wrap.
        tokenizer.current_line_start_position = start_position + last_newline + 1;
    }

    // Explicit `return`s are used here because the nested helper functions
    // are declared after this point.
    if found_printable_char {
        // This function only consumed ASCII (whitespace) bytes,
        // so the current position is a code point boundary.
        return Ok(consume_unquoted_url_internal(tokenizer));
    } else {
        // Only whitespace before the closing ')' or EOF: the URL is empty.
        return Ok(UnquotedUrl("".into()));
    }

    /// Consumes the URL value itself, starting at its first byte.
    ///
    /// The value is returned as a slice of the input until a backslash or a
    /// NUL byte is encountered; from then on it is accumulated in an owned
    /// buffer with escapes resolved and each NUL replaced by U+FFFD.
    fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        // This function is only called with start_pos at a code point boundary.
        let start_pos = tokenizer.position();
        let mut string_bytes: Vec<u8>;
        // First pass: the value can still be borrowed from the input.
        loop {
            if tokenizer.is_eof() {
                return UnquotedUrl(tokenizer.slice_from(start_pos).into());
            }
            match_byte! { tokenizer.next_byte_unchecked(),
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // Whitespace ends the value; only whitespace and ')' may follow.
                    let value = tokenizer.slice_from(start_pos);
                    return consume_url_end(tokenizer, start_pos, value.into())
                }
                b')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return UnquotedUrl(value.into())
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos)
                },
                b'\\' | b'\0' => {
                    // * The tokenizer's input is UTF-8 since it's `&str`.
                    // * start_pos is at a code point boundary
                    // * so is the current position (which is before '\\' or '\0')
                    //
                    // So `string_bytes` is well-formed UTF-8.
                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                    break
                }
                b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
                b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
                _ => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                }
            }
        }
        // Second pass: the value is accumulated in `string_bytes`.
        while !tokenizer.is_eof() {
            let b = tokenizer.next_byte_unchecked();
            match_byte! { b,
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // string_bytes is well-formed UTF-8, see other comments.
                    let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
                    return consume_url_end(tokenizer, start_pos, string)
                }
                b')' => {
                    tokenizer.advance(1);
                    break;
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos);
                }
                b'\\' => {
                    tokenizer.advance(1);
                    // A backslash followed by a newline is not a valid escape.
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer, start_pos)
                    }

                    // This pushes one well-formed code point to string_bytes
                    consume_escape_and_write(tokenizer, &mut string_bytes)
                },
                b'\0' => {
                    tokenizer.advance(1);
                    string_bytes.extend("\u{FFFD}".as_bytes());
                }
                b'\x80'..=b'\xBF' => {
                    // We'll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_continuation_byte();
                    string_bytes.push(b);
                }
                b'\xF0'..=b'\xFF' => {
                    // We'll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_4byte_intro();
                    string_bytes.push(b);
                }
                // If this byte is part of a multi-byte code point,
                // we'll end up copying the whole code point before this loop does something else.
                b => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                    string_bytes.push(b)
                }
            }
        }
        UnquotedUrl(
            // string_bytes is well-formed UTF-8, see other comments.
            unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
        )
    }

    /// Consumes what follows the URL value once whitespace has been reached.
    ///
    /// Whitespace is skipped until the closing `)` or EOF, in which case
    /// `UnquotedUrl(string)` is returned. Any other byte makes the whole
    /// token a `BadUrl`.
    fn consume_url_end<'a>(
        tokenizer: &mut Tokenizer<'a>,
        start_pos: SourcePosition,
        string: CowRcStr<'a>,
    ) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    tokenizer.advance(1);
                    break
                }
                b' ' | b'\t' => { tokenizer.advance(1); }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    // Something other than whitespace or ')' after the value.
                    tokenizer.consume_known_byte(b);
                    return consume_bad_url(tokenizer, start_pos);
                }
            }
        }
        UnquotedUrl(string)
    }

    /// Consumes the rest of a malformed URL and returns it as a `BadUrl` token.
    ///
    /// The token holds the input from `start_pos` up to, but not including,
    /// the closing `)`, or up to EOF when there is none. A `)` or `\` that
    /// directly follows a backslash is skipped and does not end the URL.
    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
        // Consume up to the closing )
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    // Take the slice before consuming ')' so it is not part of the contents.
                    let contents = tokenizer.slice_from(start_pos).into();
                    tokenizer.advance(1);
                    return BadUrl(contents)
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                        tokenizer.advance(1); // Skip an escaped ')' or '\'
                    }
                }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                }
            }
        }
        // EOF reached without a closing ')'.
        BadUrl(tokenizer.slice_from(start_pos).into())
    }
}

/// Consumes up to six hexadecimal digits from the input.
///
/// Returns `(value, digits)`: the number the consumed digits spell and how
/// many digits were consumed (between 0 and 6). Stops early at EOF or at the
/// first byte that is not a hexadecimal digit.
fn consume_hex_digits(tokenizer: &mut Tokenizer<'_>) -> (u32, u32) {
    let mut value = 0;
    let mut digits = 0;
    loop {
        if digits >= 6 || tokenizer.is_eof() {
            break;
        }
        let digit = match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
            Some(digit) => digit,
            None => break,
        };
        tokenizer.advance(1);
        value = value * 16 + digit;
        digits += 1;
    }
    (value, digits)
}

/// Consumes an escape sequence and appends the resulting code point, encoded
/// as UTF-8, to `bytes`.
///
/// The preconditions are those of `consume_escape`; the only difference is
/// that the result is written into `bytes` instead of being returned.
fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
    let mut utf8_buffer = [0u8; 4];
    let encoded = consume_escape(tokenizer).encode_utf8(&mut utf8_buffer);
    bytes.extend_from_slice(encoded.as_bytes());
}

/// Consumes the remainder of an escape sequence and returns the code point it
/// stands for.
///
/// Assumes that the U+005C REVERSE SOLIDUS (`\`) has already been consumed
/// and that the next input character has already been verified not to be a
/// newline. U+FFFD is returned for an escaped EOF, an escaped NUL, a
/// hexadecimal escape equal to zero, and a hexadecimal escape that is not a
/// valid `char`.
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
    const REPLACEMENT_CHAR: char = '\u{FFFD}';

    // Escaped EOF
    if tokenizer.is_eof() {
        return REPLACEMENT_CHAR;
    }
    match_byte! { tokenizer.next_byte_unchecked(),
        b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
            let (code_point, _digit_count) = consume_hex_digits(tokenizer);
            // One whitespace character directly after the digits is consumed
            // together with the escape.
            if !tokenizer.is_eof() {
                match_byte! { tokenizer.next_byte_unchecked(),
                    b' ' | b'\t' => {
                        tokenizer.advance(1)
                    }
                    b'\n' | b'\x0C' | b'\r' => {
                        tokenizer.consume_newline();
                    }
                    _ => {}
                }
            }
            match code_point {
                0 => REPLACEMENT_CHAR,
                scalar => char::from_u32(scalar).unwrap_or(REPLACEMENT_CHAR),
            }
        },
        b'\0' => {
            tokenizer.advance(1);
            REPLACEMENT_CHAR
        }
        _ => tokenizer.consume_char(),
    }
}