| // Copyright (C) 2020-2023 Free Software Foundation, Inc. |
| |
| // This file is part of GCC. |
| |
| // GCC is free software; you can redistribute it and/or modify it under |
| // the terms of the GNU General Public License as published by the Free |
| // Software Foundation; either version 3, or (at your option) any later |
| // version. |
| |
| // GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| // WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| // for more details. |
| |
| // You should have received a copy of the GNU General Public License |
| // along with GCC; see the file COPYING3. If not see |
| // <http://www.gnu.org/licenses/>. |
| |
| #include "rust-system.h" |
| #include "rust-lex.h" |
| #include "rust-diagnostics.h" |
| #include "rust-linemap.h" |
| #include "rust-session-manager.h" |
| #include "safe-ctype.h" |
| |
| namespace Rust { |
| // TODO: move to separate compilation unit? |
| // Overload += on std::string to append the UTF-8 encoding of a 32-bit |
| // codepoint. |
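| // For illustration: U+00E9 falls into the 2-byte branch below and is |
| // appended as 0xC3 0xA9, U+1F600 takes the 4-byte branch and becomes |
| // 0xF0 0x9F 0x98 0x80, and codepoints below 0x80 are appended as-is as a |
| // single byte. |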
| std::string & |
| operator+= (std::string &str, Codepoint char32) |
| { |
| if (char32.value < 0x80) |
| { |
| str += static_cast<char> (char32.value); |
| } |
| else if (char32.value < (0x1F + 1) << (1 * 6)) |
| { |
| str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F)); |
| str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F)); |
| } |
| else if (char32.value < (0x0F + 1) << (2 * 6)) |
| { |
| str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F)); |
| str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F)); |
| str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F)); |
| } |
| else if (char32.value < (0x07 + 1) << (3 * 6)) |
| { |
| str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07)); |
| str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F)); |
| str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F)); |
| str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F)); |
| } |
| else |
| { |
| rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value); |
| } |
| return str; |
| } |
| |
| std::string |
| Codepoint::as_string () |
| { |
| std::string str; |
| |
| // str += Codepoint (value); |
| str += *this; |
| |
| return str; |
| } |
| |
| /* Includes all allowable float digits EXCEPT _ and . as those need |
| * lookahead for handling. */ |
| bool |
| is_float_digit (char number) |
| { |
| return ISDIGIT (number) || number == 'E' || number == 'e'; |
| } |
| |
| /* Basically ISXDIGIT from safe-ctype, kept as a wrapper in case Rust's |
| * requirements ever diverge from it. */ |
| bool |
| is_x_digit (char number) |
| { |
| return ISXDIGIT (number); |
| } |
| |
| bool |
| is_octal_digit (char number) |
| { |
| return number >= '0' && number <= '7'; |
| } |
| |
| bool |
| is_bin_digit (char number) |
| { |
| return number == '0' || number == '1'; |
| } |
| |
| bool |
| check_valid_float_dot_end (char character) |
| { |
| return character != '.' && character != '_' && !ISALPHA (character); |
| } |
| |
| // ISSPACE from safe-ctype but may change in future |
| bool |
| is_whitespace (char character) |
| { |
| return ISSPACE (character); |
| } |
| |
| bool |
| is_non_decimal_int_literal_separator (char character) |
| { |
| return character == 'x' || character == 'o' || character == 'b'; |
| } |
| |
| Lexer::Lexer (const std::string &input) |
| : input (RAIIFile::create_error ()), current_line (1), current_column (1), |
| line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()), |
| raw_input_source (new BufferInputSource (input, 0)), |
| input_queue{*raw_input_source}, token_queue (TokenSource (this)) |
| {} |
| |
| Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap, |
| Optional<std::ofstream &> dump_lex_opt) |
| : input (std::move (file_input)), current_line (1), current_column (1), |
| line_map (linemap), dump_lex_out (dump_lex_opt), |
| raw_input_source (new FileInputSource (input.get_raw ())), |
| input_queue{*raw_input_source}, token_queue (TokenSource (this)) |
| { |
| // inform line_table that file is being entered and is in line 1 |
| if (linemap) |
| line_map->start_file (filename, current_line); |
| } |
| |
| Lexer::~Lexer () |
| { |
| /* A stop () call (the equivalent of the code in the original destructor) is |
| * meant to be made after all files have finished parsing, for cleanup. On |
| * the other hand, the code it calls to leave a certain line map is described |
| * in the GCC docs as being useful for "just leaving an included header" and |
| * similar cases, so this line mapping functionality may need fixing. |
| * FIXME: find out whether this occurs. */ |
| |
| // line_map->stop(); |
| } |
| |
| /* TODO: need to optimise somehow to avoid the virtual function call in the |
| * tight loop. The best idea at the moment is CRTP, but that would require a |
| * template parameter wherever the "base class" is stored, which would make |
| * enclosing classes like Parser also require a type parameter; in practice |
| * it would mostly just look ugly. At this point a macro might be better, or |
| * perhaps constexpr if where possible. */ |
| Location |
| Lexer::get_current_location () |
| { |
| if (line_map) |
| return line_map->get_location (current_column); |
| else |
| // If we have no linemap, we're lexing something without proper locations |
| return Location (); |
| } |
| |
| int |
| Lexer::peek_input (int n) |
| { |
| return input_queue.peek (n); |
| } |
| |
| int |
| Lexer::peek_input () |
| { |
| return peek_input (0); |
| } |
| |
| void |
| Lexer::skip_input (int n) |
| { |
| input_queue.skip (n); |
| } |
| |
| void |
| Lexer::skip_input () |
| { |
| skip_input (0); |
| } |
| |
| void |
| Lexer::skip_token (int n) |
| { |
| // dump tokens if dump-lex option is enabled |
| if (dump_lex_out.is_some ()) |
| dump_and_skip (n); |
| else |
| token_queue.skip (n); |
| } |
| |
| void |
| Lexer::dump_and_skip (int n) |
| { |
| std::ofstream &out = dump_lex_out.get (); |
| bool found_eof = false; |
| const_TokenPtr tok; |
| for (int i = 0; i < n + 1; i++) |
| { |
| if (!found_eof) |
| { |
| tok = peek_token (); |
| found_eof |= tok->get_id () == Rust::END_OF_FILE; |
| |
| Location loc = tok->get_locus (); |
| |
| out << "<id="; |
| out << tok->token_id_to_str (); |
| out << (tok->has_str () ? (std::string (", text=") + tok->get_str () |
| + std::string (", typehint=") |
| + std::string (tok->get_type_hint_str ())) |
| : "") |
| << " "; |
| out << get_line_map ()->to_string (loc) << " "; |
| } |
| |
| token_queue.skip (0); |
| } |
| } |
| |
| void |
| Lexer::replace_current_token (TokenPtr replacement) |
| { |
| token_queue.replace_current_value (replacement); |
| |
| rust_debug ("called 'replace_current_token' - this is deprecated"); |
| } |
| |
| /* Anonymous namespace, accessible only inside this compilation unit - used |
| * for the classify_keyword binary search in a sorted array of keywords |
| * created with x-macros. */ |
| namespace { |
| // TODO: make constexpr when update to c++20 |
| const std::string keyword_index[] = { |
| #define RS_TOKEN(x, y) |
| #define RS_TOKEN_KEYWORD(name, keyword) keyword, |
| RS_TOKEN_LIST |
| #undef RS_TOKEN_KEYWORD |
| #undef RS_TOKEN |
| }; |
| |
| constexpr TokenId keyword_keys[] = { |
| #define RS_TOKEN(x, y) |
| #define RS_TOKEN_KEYWORD(name, keyword) name, |
| RS_TOKEN_LIST |
| #undef RS_TOKEN_KEYWORD |
| #undef RS_TOKEN |
| }; |
| |
| constexpr int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index); |
| } // namespace |
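| // The two arrays above are generated from the same RS_TOKEN_LIST, so they |
| // are index-aligned: each RS_TOKEN_KEYWORD (NAME, "kw") entry contributes |
| // "kw" to keyword_index and NAME to keyword_keys at the same position, |
| // which lets classify_keyword below translate a lower_bound hit in the |
| // string array into a TokenId. |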
| |
| /* Determines whether the string passed in is a keyword or not. If it is, |
| * the corresponding token id is returned; otherwise IDENTIFIER is returned. */ |
| TokenId |
| Lexer::classify_keyword (const std::string &str) |
| { |
| const std::string *last = keyword_index + num_keywords; |
| const std::string *idx = std::lower_bound (keyword_index, last, str); |
| |
| if (idx == last || str != *idx) |
| return IDENTIFIER; |
| |
| // TODO: possibly replace this x-macro system with something like hash map? |
| |
| // We now have the expected token ID of the reserved keyword. However, some |
| // keywords are reserved starting in certain editions. For example, `try` is |
| // only a reserved keyword in editions >=2018. The language might gain new |
| // reserved keywords in the future. |
| // |
| // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords |
| auto id = keyword_keys[idx - keyword_index]; |
| |
| // `try` is not a reserved keyword before 2018 |
| if (Session::get_instance ().options.get_edition () |
| == CompileOptions::Edition::E2015 |
| && id == TRY) |
| return IDENTIFIER; |
| |
| return id; |
| } |
| |
| TokenPtr |
| Lexer::build_token () |
| { |
| // loop to go through multiple characters to build a single token |
| while (true) |
| { |
| Location loc = get_current_location (); |
| current_char = peek_input (); |
| skip_input (); |
| |
| // detect UTF8 bom |
| // |
| // Must be the first thing on the first line. |
| // There might be an optional BOM (Byte Order Mark), which for UTF-8 is |
| // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped. |
| if (current_line == 1 && current_column == 1 && current_char == 0xef |
| && peek_input () == 0xbb && peek_input (1) == 0xbf) |
| { |
| skip_input (1); |
| current_char = peek_input (); |
| skip_input (); |
| } |
| |
| // detect shebang |
| // Must be the first thing on the first line, starting with #! |
| // But an inner attribute can also start with #!, so we don't count it as a |
| // shebang line when, after any whitespace or comments, a [ follows. If it |
| // is a shebang line we simply drop the line. Otherwise we don't consume any |
| // characters and fall through to the real tokenizer. |
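| // For example, a first line of #!/usr/bin/env cargo is dropped as a |
| // shebang, while #![allow(dead_code)] (even with whitespace or plain |
| // comments between the #! and the [) reaches the '[' check below and is |
| // lexed normally as the start of an inner attribute. |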
| if (current_line == 1 && current_column == 1 && current_char == '#' |
| && peek_input () == '!') |
| { |
| int n = 1; |
| while (true) |
| { |
| int next_char = peek_input (n); |
| if (is_whitespace (next_char)) |
| n++; |
| else if ((next_char == '/' && peek_input (n + 1) == '/' |
| && peek_input (n + 2) != '!' |
| && peek_input (n + 2) != '/') |
| || (next_char == '/' && peek_input (n + 1) == '/' |
| && peek_input (n + 2) == '/' |
| && peek_input (n + 3) == '/')) |
| { |
| // two // or four //// |
| // A single line comment |
| // (but not an inner or outer doc comment) |
| n += 2; |
| next_char = peek_input (n); |
| while (next_char != '\n' && next_char != EOF) |
| { |
| n++; |
| next_char = peek_input (n); |
| } |
| if (next_char == '\n') |
| n++; |
| } |
| else if (next_char == '/' && peek_input (n + 1) == '*' |
| && peek_input (n + 2) == '*' |
| && peek_input (n + 3) == '/') |
| { |
| /**/ |
| n += 4; |
| } |
| else if (next_char == '/' && peek_input (n + 1) == '*' |
| && peek_input (n + 2) == '*' && peek_input (n + 3) == '*' |
| && peek_input (n + 4) == '/') |
| { |
| /***/ |
| n += 5; |
| } |
| else if ((next_char == '/' && peek_input (n + 1) == '*' |
| && peek_input (n + 2) != '*' |
| && peek_input (n + 2) != '!') |
| || (next_char == '/' && peek_input (n + 1) == '*' |
| && peek_input (n + 2) == '*' |
| && peek_input (n + 3) == '*')) |
| { |
| // one /* or three /*** |
| // Start of a block comment |
| // (but not an inner or outer doc comment) |
| n += 2; |
| int level = 1; |
| while (level > 0) |
| { |
| if (peek_input (n) == EOF) |
| break; |
| else if (peek_input (n) == '/' |
| && peek_input (n + 1) == '*') |
| { |
| n += 2; |
| level += 1; |
| } |
| else if (peek_input (n) == '*' |
| && peek_input (n + 1) == '/') |
| { |
| n += 2; |
| level -= 1; |
| } |
| else |
| n++; |
| } |
| } |
| else if (next_char != '[') |
| { |
| // definitely shebang, ignore the first line |
| while (current_char != '\n' && current_char != EOF) |
| { |
| current_char = peek_input (); |
| skip_input (); |
| } |
| |
| // newline |
| current_line++; |
| current_column = 1; |
| // tell line_table that new line starts |
| start_line (current_line, max_column_hint); |
| break; |
| } |
| else |
| break; /* Definitely not a shebang line. */ |
| } |
| } |
| |
| // return end of file token if end of file |
| if (current_char == EOF) |
| return Token::make (END_OF_FILE, loc); |
| |
| // if not end of file, start tokenising |
| switch (current_char) |
| { |
| /* ignore whitespace characters for tokens but continue updating |
| * location */ |
| case '\n': // newline |
| current_line++; |
| current_column = 1; |
| // tell line_table that new line starts |
| start_line (current_line, max_column_hint); |
| continue; |
| case '\r': // cr |
| // Ignore, we expect a newline (lf) soon. |
| continue; |
| case ' ': // space |
| current_column++; |
| continue; |
| case '\t': // tab |
| // width of a tab is not well-defined, assume 8 spaces |
| current_column += 8; |
| continue; |
| |
| // punctuation - actual tokens |
| case '=': |
| if (peek_input () == '>') |
| { |
| // match arm arrow |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (MATCH_ARROW, loc); |
| } |
| else if (peek_input () == '=') |
| { |
| // equality operator |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (EQUAL_EQUAL, loc); |
| } |
| else |
| { |
| // assignment operator |
| current_column++; |
| return Token::make (EQUAL, loc); |
| } |
| case '(': |
| current_column++; |
| return Token::make (LEFT_PAREN, loc); |
| case '-': |
| if (peek_input () == '>') |
| { |
| // return type specifier |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (RETURN_TYPE, loc); |
| } |
| else if (peek_input () == '=') |
| { |
| // minus-assign |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (MINUS_EQ, loc); |
| } |
| else |
| { |
| // minus |
| current_column++; |
| return Token::make (MINUS, loc); |
| } |
| case '+': |
| if (peek_input () == '=') |
| { |
| // add-assign |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (PLUS_EQ, loc); |
| } |
| else |
| { |
| // add |
| current_column++; |
| return Token::make (PLUS, loc); |
| } |
| case ')': |
| current_column++; |
| return Token::make (RIGHT_PAREN, loc); |
| case ';': |
| current_column++; |
| return Token::make (SEMICOLON, loc); |
| case '*': |
| if (peek_input () == '=') |
| { |
| // multiplication-assign |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (ASTERISK_EQ, loc); |
| } |
| else |
| { |
| // multiplication |
| current_column++; |
| return Token::make (ASTERISK, loc); |
| } |
| case ',': |
| current_column++; |
| return Token::make (COMMA, loc); |
| case '/': |
| if (peek_input () == '=') |
| { |
| // division-assign |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (DIV_EQ, loc); |
| } |
| else if ((peek_input () == '/' && peek_input (1) != '!' |
| && peek_input (1) != '/') |
| || (peek_input () == '/' && peek_input (1) == '/' |
| && peek_input (2) == '/')) |
| { |
| // two // or four //// |
| // single line comment |
| // (but not an inner or outer doc comment) |
| skip_input (); |
| current_column += 2; |
| current_char = peek_input (); |
| |
| // basically ignore until line finishes |
| while (current_char != '\n' && current_char != EOF) |
| { |
| skip_input (); |
| current_column++; // not used |
| current_char = peek_input (); |
| } |
| continue; |
| } |
| else if (peek_input () == '/' |
| && (peek_input (1) == '!' || peek_input (1) == '/')) |
| { |
| /* single line doc comment, inner or outer. */ |
| bool is_inner = peek_input (1) == '!'; |
| skip_input (1); |
| current_column += 3; |
| |
| std::string str; |
| str.reserve (32); |
| current_char = peek_input (); |
| while (current_char != '\n') |
| { |
| skip_input (); |
| if (current_char == '\r') |
| { |
| char next_char = peek_input (); |
| if (next_char == '\n') |
| { |
| current_char = '\n'; |
| break; |
| } |
| rust_error_at ( |
| loc, "Isolated CR %<\\r%> not allowed in doc comment"); |
| current_char = next_char; |
| continue; |
| } |
| if (current_char == EOF) |
| { |
| rust_error_at ( |
| loc, "unexpected EOF while looking for end of comment"); |
| break; |
| } |
| str += current_char; |
| current_char = peek_input (); |
| } |
| skip_input (); |
| current_line++; |
| current_column = 1; |
| // tell line_table that new line starts |
| start_line (current_line, max_column_hint); |
| |
| str.shrink_to_fit (); |
| |
| loc += str.size () - 1; |
| if (is_inner) |
| return Token::make_inner_doc_comment (loc, std::move (str)); |
| else |
| return Token::make_outer_doc_comment (loc, std::move (str)); |
| } |
| else if (peek_input () == '*' && peek_input (1) == '*' |
| && peek_input (2) == '/') |
| { |
| /**/ |
| skip_input (2); |
| current_column += 4; |
| continue; |
| } |
| else if (peek_input () == '*' && peek_input (1) == '*' |
| && peek_input (2) == '*' && peek_input (3) == '/') |
| { |
| /***/ |
| skip_input (3); |
| current_column += 5; |
| continue; |
| } |
| else if ((peek_input () == '*' && peek_input (1) != '!' |
| && peek_input (1) != '*') |
| || (peek_input () == '*' && peek_input (1) == '*' |
| && peek_input (2) == '*')) |
| { |
| // one /* or three /*** |
| // block comment |
| // (but not an inner or outer doc comment) |
| skip_input (); |
| current_column += 2; |
| |
| int level = 1; |
| while (level > 0) |
| { |
| current_char = peek_input (); |
| |
| if (current_char == EOF) |
| { |
| rust_error_at ( |
| loc, "unexpected EOF while looking for end of comment"); |
| break; |
| } |
| |
| // if /* found |
| if (current_char == '/' && peek_input (1) == '*') |
| { |
| // skip /* characters |
| skip_input (1); |
| |
| current_column += 2; |
| |
| level += 1; |
| continue; |
| } |
| |
| // ignore until */ is found |
| if (current_char == '*' && peek_input (1) == '/') |
| { |
| // skip */ characters |
| skip_input (1); |
| |
| current_column += 2; |
| |
| level -= 1; |
| continue; |
| } |
| |
| if (current_char == '\n') |
| { |
| skip_input (); |
| current_line++; |
| current_column = 1; |
| // tell line_table that new line starts |
| start_line (current_line, max_column_hint); |
| continue; |
| } |
| |
| skip_input (); |
| current_column++; |
| } |
| |
| // refresh new token |
| continue; |
| } |
| else if (peek_input () == '*' |
| && (peek_input (1) == '!' || peek_input (1) == '*')) |
| { |
| // block doc comment, inner /*! or outer /** |
| bool is_inner = peek_input (1) == '!'; |
| skip_input (1); |
| current_column += 3; |
| |
| std::string str; |
| str.reserve (96); |
| |
| int level = 1; |
| while (level > 0) |
| { |
| current_char = peek_input (); |
| |
| if (current_char == EOF) |
| { |
| rust_error_at ( |
| loc, "unexpected EOF while looking for end of comment"); |
| break; |
| } |
| |
| // if /* found |
| if (current_char == '/' && peek_input (1) == '*') |
| { |
| // skip /* characters |
| skip_input (1); |
| current_column += 2; |
| |
| level += 1; |
| str += "/*"; |
| continue; |
| } |
| |
| // ignore until */ is found |
| if (current_char == '*' && peek_input (1) == '/') |
| { |
| // skip */ characters |
| skip_input (1); |
| current_column += 2; |
| |
| level -= 1; |
| if (level > 0) |
| str += "*/"; |
| continue; |
| } |
| |
| if (current_char == '\r' && peek_input (1) != '\n') |
| rust_error_at ( |
| loc, "Isolated CR %<\\r%> not allowed in doc comment"); |
| |
| if (current_char == '\n') |
| { |
| skip_input (); |
| current_line++; |
| current_column = 1; |
| // tell line_table that new line starts |
| start_line (current_line, max_column_hint); |
| str += '\n'; |
| continue; |
| } |
| |
| str += current_char; |
| skip_input (); |
| current_column++; |
| } |
| |
| str.shrink_to_fit (); |
| |
| loc += str.size () - 1; |
| if (is_inner) |
| return Token::make_inner_doc_comment (loc, std::move (str)); |
| else |
| return Token::make_outer_doc_comment (loc, std::move (str)); |
| } |
| else |
| { |
| // division |
| current_column++; |
| return Token::make (DIV, loc); |
| } |
| case '%': |
| if (peek_input () == '=') |
| { |
| // modulo-assign |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (PERCENT_EQ, loc); |
| } |
| else |
| { |
| // modulo |
| current_column++; |
| return Token::make (PERCENT, loc); |
| } |
| case '^': |
| if (peek_input () == '=') |
| { |
| // xor-assign? |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (CARET_EQ, loc); |
| } |
| else |
| { |
| // xor? |
| current_column++; |
| return Token::make (CARET, loc); |
| } |
| case '<': |
| if (peek_input () == '<') |
| { |
| if (peek_input (1) == '=') |
| { |
| // left-shift assign |
| skip_input (1); |
| current_column += 3; |
| loc += 2; |
| |
| return Token::make (LEFT_SHIFT_EQ, loc); |
| } |
| else |
| { |
| // left-shift |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (LEFT_SHIFT, loc); |
| } |
| } |
| else if (peek_input () == '=') |
| { |
| // smaller than or equal to |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (LESS_OR_EQUAL, loc); |
| } |
| else |
| { |
| // smaller than |
| current_column++; |
| return Token::make (LEFT_ANGLE, loc); |
| } |
| break; |
| case '>': |
| if (peek_input () == '>') |
| { |
| if (peek_input (1) == '=') |
| { |
| // right-shift-assign |
| skip_input (1); |
| current_column += 3; |
| loc += 2; |
| |
| return Token::make (RIGHT_SHIFT_EQ, loc); |
| } |
| else |
| { |
| // right-shift |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (RIGHT_SHIFT, loc); |
| } |
| } |
| else if (peek_input () == '=') |
| { |
| // larger than or equal to |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (GREATER_OR_EQUAL, loc); |
| } |
| else |
| { |
| // larger than |
| current_column++; |
| return Token::make (RIGHT_ANGLE, loc); |
| } |
| case ':': |
| if (peek_input () == ':') |
| { |
| // scope resolution :: |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (SCOPE_RESOLUTION, loc); |
| } |
| else |
| { |
| // single colon : |
| current_column++; |
| return Token::make (COLON, loc); |
| } |
| case '!': |
| // no special handling for macros in lexer? |
| if (peek_input () == '=') |
| { |
| // not equal boolean operator |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (NOT_EQUAL, loc); |
| } |
| else |
| { |
| // negation/not unary operator (also used for macro invocations) |
| current_column++; |
| |
| return Token::make (EXCLAM, loc); |
| } |
| case '?': |
| current_column++; |
| return Token::make (QUESTION_MARK, loc); |
| case '#': |
| current_column++; |
| return Token::make (HASH, loc); |
| case '[': |
| current_column++; |
| return Token::make (LEFT_SQUARE, loc); |
| case ']': |
| current_column++; |
| return Token::make (RIGHT_SQUARE, loc); |
| case '{': |
| current_column++; |
| return Token::make (LEFT_CURLY, loc); |
| case '}': |
| current_column++; |
| return Token::make (RIGHT_CURLY, loc); |
| case '@': |
| current_column++; |
| return Token::make (PATTERN_BIND, loc); |
| case '$': |
| current_column++; |
| return Token::make (DOLLAR_SIGN, loc); |
| case '~': |
| current_column++; |
| return Token::make (TILDE, loc); |
| case '\\': |
| current_column++; |
| return Token::make (BACKSLASH, loc); |
| case '`': |
| current_column++; |
| return Token::make (BACKTICK, loc); |
| case '|': |
| if (peek_input () == '=') |
| { |
| // bitwise or-assign? |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (PIPE_EQ, loc); |
| } |
| else if (peek_input () == '|') |
| { |
| // logical or |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (OR, loc); |
| } |
| else |
| { |
| // bitwise or |
| current_column++; |
| |
| return Token::make (PIPE, loc); |
| } |
| case '&': |
| if (peek_input () == '=') |
| { |
| // bitwise and-assign? |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (AMP_EQ, loc); |
| } |
| else if (peek_input () == '&') |
| { |
| // logical and |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (LOGICAL_AND, loc); |
| } |
| else |
| { |
| // bitwise and/reference |
| current_column++; |
| |
| return Token::make (AMP, loc); |
| } |
| case '.': |
| if (peek_input () == '.') |
| { |
| if (peek_input (1) == '.') |
| { |
| // ellipsis |
| skip_input (1); |
| current_column += 3; |
| loc += 2; |
| |
| return Token::make (ELLIPSIS, loc); |
| } |
| else if (peek_input (1) == '=') |
| { |
| // ..= |
| skip_input (1); |
| current_column += 3; |
| loc += 2; |
| |
| return Token::make (DOT_DOT_EQ, loc); |
| } |
| else |
| { |
| // .. |
| skip_input (); |
| current_column += 2; |
| loc += 1; |
| |
| return Token::make (DOT_DOT, loc); |
| } |
| } |
| else /*if (!ISDIGIT (peek_input ()))*/ |
| { |
| // single dot . |
| // Note that a float literal cannot start with '.', so no digit |
| // lookahead is needed here. |
| current_column++; |
| return Token::make (DOT, loc); |
| } |
| } |
| // TODO: special handling of _ in the lexer? instead of being an identifier |
| |
| // byte character, byte string and raw byte string literals |
| if (current_char == 'b') |
| { |
| if (peek_input () == '\'') |
| return parse_byte_char (loc); |
| else if (peek_input () == '"') |
| return parse_byte_string (loc); |
| else if (peek_input () == 'r' |
| && (peek_input (1) == '#' || peek_input (1) == '"')) |
| return parse_raw_byte_string (loc); |
| } |
| |
| // raw identifiers and raw strings |
| if (current_char == 'r') |
| { |
| int peek = peek_input (); |
| int peek1 = peek_input (1); |
| |
| if (peek == '#' && (ISALPHA (peek1) || peek1 == '_')) |
| { |
| TokenPtr raw_ident_ptr = parse_raw_identifier (loc); |
| if (raw_ident_ptr != nullptr) |
| return raw_ident_ptr; |
| else |
| continue; /* input got parsed, it just wasn't valid. An error |
| was produced. */ |
| } |
| else |
| { |
| TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc); |
| if (maybe_raw_string_ptr != nullptr) |
| return maybe_raw_string_ptr; |
| } |
| } |
| |
| // find identifiers and keywords |
| if (ISALPHA (current_char) || current_char == '_') |
| return parse_identifier_or_keyword (loc); |
| |
| // int and float literals |
| if (ISDIGIT (current_char)) |
| { // _ not allowed as first char |
| if (current_char == '0' |
| && is_non_decimal_int_literal_separator (peek_input ())) |
| { |
| // handle binary, octal, hex literals |
| TokenPtr non_dec_int_lit_ptr |
| = parse_non_decimal_int_literals (loc); |
| if (non_dec_int_lit_ptr != nullptr) |
| return non_dec_int_lit_ptr; |
| } |
| else |
| { |
| // handle decimals (integer or float) |
| TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc); |
| if (decimal_or_float_ptr != nullptr) |
| return decimal_or_float_ptr; |
| } |
| } |
| |
| // string literals |
| if (current_char == '"') |
| return parse_string (loc); |
| |
| // char literals and lifetime names |
| if (current_char == '\'') |
| { |
| TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc); |
| if (char_or_lifetime_ptr != nullptr) |
| return char_or_lifetime_ptr; |
| } |
| |
| // DEBUG: check for specific character problems: |
| if (current_char == '0') |
| rust_debug ("'0' uncaught before unexpected character"); |
| else if (current_char == ']') |
| rust_debug ("']' uncaught before unexpected character"); |
| else if (current_char == 0x5d) |
| rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before " |
| "unexpected character"); |
| |
| // didn't match anything so error |
| rust_error_at (loc, "unexpected character %<%x%>", current_char); |
| current_column++; |
| } |
| } |
| |
| // Parses in a type suffix. |
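| // For example, after the digits of 5u8 this returns (CORETYPE_U8, 2), and |
| // after the digits of 1.0f32 it returns (CORETYPE_F32, 3); an unrecognised |
| // suffix such as u7 is reported as an error and yields CORETYPE_UNKNOWN. |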
| std::pair<PrimitiveCoreType, int> |
| Lexer::parse_in_type_suffix () |
| { |
| std::string suffix; |
| suffix.reserve (5); |
| |
| int additional_length_offset = 0; |
| |
| // get suffix |
| while (ISALPHA (current_char) || ISDIGIT (current_char) |
| || current_char == '_') |
| { |
| if (current_char == '_') |
| { |
| // don't add _ to suffix |
| skip_input (); |
| current_char = peek_input (); |
| |
| additional_length_offset++; |
| |
| continue; |
| } |
| |
| additional_length_offset++; |
| |
| suffix += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| } |
| |
| if (suffix.empty ()) |
| { |
| // no type suffix: do nothing but also no error |
| return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset); |
| } |
| else if (suffix == "f32") |
| { |
| return std::make_pair (CORETYPE_F32, additional_length_offset); |
| } |
| else if (suffix == "f64") |
| { |
| return std::make_pair (CORETYPE_F64, additional_length_offset); |
| } |
| else if (suffix == "i8") |
| { |
| return std::make_pair (CORETYPE_I8, additional_length_offset); |
| } |
| else if (suffix == "i16") |
| { |
| return std::make_pair (CORETYPE_I16, additional_length_offset); |
| } |
| else if (suffix == "i32") |
| { |
| return std::make_pair (CORETYPE_I32, additional_length_offset); |
| } |
| else if (suffix == "i64") |
| { |
| return std::make_pair (CORETYPE_I64, additional_length_offset); |
| } |
| else if (suffix == "i128") |
| { |
| return std::make_pair (CORETYPE_I128, additional_length_offset); |
| } |
| else if (suffix == "isize") |
| { |
| return std::make_pair (CORETYPE_ISIZE, additional_length_offset); |
| } |
| else if (suffix == "u8") |
| { |
| return std::make_pair (CORETYPE_U8, additional_length_offset); |
| } |
| else if (suffix == "u16") |
| { |
| return std::make_pair (CORETYPE_U16, additional_length_offset); |
| } |
| else if (suffix == "u32") |
| { |
| return std::make_pair (CORETYPE_U32, additional_length_offset); |
| } |
| else if (suffix == "u64") |
| { |
| return std::make_pair (CORETYPE_U64, additional_length_offset); |
| } |
| else if (suffix == "u128") |
| { |
| return std::make_pair (CORETYPE_U128, additional_length_offset); |
| } |
| else if (suffix == "usize") |
| { |
| return std::make_pair (CORETYPE_USIZE, additional_length_offset); |
| } |
| else |
| { |
| rust_error_at (get_current_location (), "unknown number suffix %qs", |
| suffix.c_str ()); |
| |
| return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset); |
| } |
| } |
| |
| // Parses in the exponent part (if any) of a float literal. |
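| // For example, 3e10 yields ("e10", 3); in 2E+5 the '+' is skipped but not |
| // stored, yielding ("E5", 3); and in 1.5e-3 the sign is kept, yielding |
| // ("e-3", 3). |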
| std::pair<std::string, int> |
| Lexer::parse_in_exponent_part () |
| { |
| int additional_length_offset = 0; |
| std::string str; |
| if (current_char == 'E' || current_char == 'e') |
| { |
| // add exponent to string as strtod works with it |
| str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| |
| additional_length_offset++; |
| |
| // special - and + handling |
| if (current_char == '-') |
| { |
| str += '-'; |
| |
| skip_input (); |
| current_char = peek_input (); |
| |
| additional_length_offset++; |
| } |
| else if (current_char == '+') |
| { |
| // don't add + but still skip input |
| skip_input (); |
| current_char = peek_input (); |
| |
| additional_length_offset++; |
| } |
| |
| // parse another decimal number for exponent |
| auto str_length = parse_in_decimal (); |
| str += std::get<0> (str_length); |
| additional_length_offset += std::get<1> (str_length); |
| } |
| return std::make_pair (str, additional_length_offset); |
| } |
| |
| // Parses a decimal integer. |
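| // For example, given the digit run 1_000 this returns ("1000", 5, false): |
| // the underscore is skipped but still counted in the length, and the run is |
| // no longer considered a pure decimal. A plain 1000 returns ("1000", 4, |
| // true). |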
| std::tuple<std::string, int, bool> |
| Lexer::parse_in_decimal () |
| { |
| /* A pure decimal contains only digits. */ |
| bool pure_decimal = true; |
| int additional_length_offset = 0; |
| std::string str; |
| while (ISDIGIT (current_char) || current_char == '_') |
| { |
| if (current_char == '_') |
| { |
| pure_decimal = false; |
| // don't add _ to number |
| skip_input (); |
| current_char = peek_input (); |
| |
| additional_length_offset++; |
| |
| continue; |
| } |
| |
| additional_length_offset++; |
| |
| str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| } |
| return std::make_tuple (str, additional_length_offset, pure_decimal); |
| } |
| |
| /* Parses escapes (and string continues) in "byte" strings and characters. Does |
| * not support unicode. */ |
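| /* For example, in b'\x41' the escape resolves to the byte 0x41 ('A') and |
| * \n resolves to 0x0A, while a backslash directly followed by a newline |
| * inside b"..." is a string continue: it produces no byte and the following |
| * whitespace is skipped. */ |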
| std::tuple<char, int, bool> |
| Lexer::parse_escape (char opening_char) |
| { |
| int additional_length_offset = 0; |
| char output_char = 0; |
| |
| // skip to actual letter |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| |
| switch (current_char) |
| { |
| case 'x': { |
| auto hex_escape_pair = parse_partial_hex_escape (); |
| long hexLong = hex_escape_pair.first; |
| additional_length_offset += hex_escape_pair.second; |
| |
| if (hexLong > 255 || hexLong < 0) |
| rust_error_at ( |
| get_current_location (), |
| "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>", |
| static_cast<unsigned int> (hexLong)); |
| /* TODO: restore capital for escape output - gcc pretty-printer doesn't |
| * support %X directly */ |
| char hexChar = static_cast<char> (hexLong); |
| |
| output_char = hexChar; |
| } |
| break; |
| case 'n': |
| output_char = '\n'; |
| break; |
| case 'r': |
| output_char = '\r'; |
| break; |
| case 't': |
| output_char = '\t'; |
| break; |
| case '\\': |
| output_char = '\\'; |
| break; |
| case '0': |
| output_char = '\0'; |
| break; |
| case '\'': |
| output_char = '\''; |
| break; |
| case '"': |
| output_char = '"'; |
| break; |
| case 'u': |
| rust_error_at (get_current_location (), |
| "cannot have a unicode escape \\u in a byte %s", |
| opening_char == '\'' ? "character" : "string"); |
| // Try to parse it anyway, just to skip it |
| parse_partial_unicode_escape (); |
| return std::make_tuple (output_char, additional_length_offset, false); |
| case '\r': |
| case '\n': |
| // string continue |
| return std::make_tuple (0, parse_partial_string_continue (), true); |
| default: |
| rust_error_at (get_current_location (), |
| "unknown escape sequence %<\\%c%>", current_char); |
| // returns false if no parsing could be done |
| // return false; |
| return std::make_tuple (output_char, additional_length_offset, false); |
| break; |
| } |
| // all non-special cases (string continue) should skip their used char |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| |
| // returns true if parsing was successful |
| // return true; |
| return std::make_tuple (output_char, additional_length_offset, false); |
| } |
| |
| /* Parses an escape (or string continue) in a string or character. Supports |
| * unicode escapes. */ |
| std::tuple<Codepoint, int, bool> |
| Lexer::parse_utf8_escape () |
| { |
| Codepoint output_char; |
| int additional_length_offset = 0; |
| |
| // skip to actual letter |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| |
| switch (current_char) |
| { |
| case 'x': { |
| auto hex_escape_pair = parse_partial_hex_escape (); |
| long hexLong = hex_escape_pair.first; |
| additional_length_offset += hex_escape_pair.second; |
| |
| if (hexLong > 127 || hexLong < 0) |
| rust_error_at ( |
| get_current_location (), |
| "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>", |
| static_cast<unsigned int> (hexLong)); |
| /* TODO: restore capital for escape output - gcc pretty-printer doesn't |
| * support %X directly */ |
| char hexChar = static_cast<char> (hexLong); |
| |
| output_char = hexChar; |
| } |
| break; |
| case 'n': |
| output_char = '\n'; |
| break; |
| case 'r': |
| output_char = '\r'; |
| break; |
| case 't': |
| output_char = '\t'; |
| break; |
| case '\\': |
| output_char = '\\'; |
| break; |
| case '0': |
| output_char = '\0'; |
| break; |
| case '\'': |
| output_char = '\''; |
| break; |
| case '"': |
| output_char = '"'; |
| break; |
| case 'u': { |
| auto unicode_escape_pair = parse_partial_unicode_escape (); |
| output_char = unicode_escape_pair.first; |
| additional_length_offset += unicode_escape_pair.second; |
| |
| return std::make_tuple (output_char, additional_length_offset, false); |
| } |
| break; |
| case '\r': |
| case '\n': |
| // string continue |
| return std::make_tuple (0, parse_partial_string_continue (), true); |
| default: |
| rust_error_at (get_current_location (), |
| "unknown escape sequence %<\\%c%>", current_char); |
| // returns false if no parsing could be done |
| // return false; |
| return std::make_tuple (output_char, additional_length_offset, false); |
| break; |
| } |
| /* all non-special cases (unicode, string continue) should skip their used |
| * char */ |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| |
| // returns true if parsing was successful |
| // return true; |
| return std::make_tuple (output_char, additional_length_offset, false); |
| } |
| |
| // Parses the body of a string continue that has been found in an escape. |
| int |
| Lexer::parse_partial_string_continue () |
| { |
| int additional_length_offset = 1; |
| |
| // string continue |
| while (is_whitespace (current_char)) |
| { |
| if (current_char == '\n') |
| { |
| current_line++; |
| current_column = 1; |
| // tell line_table that new line starts |
| start_line (current_line, max_column_hint); |
| |
| // reset "length" |
| additional_length_offset = 1; |
| |
| // get next char |
| skip_input (); |
| current_char = peek_input (); |
| |
| continue; |
| } |
| |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| } |
| |
| return additional_length_offset; |
| } |
| |
| /* Parses the body of a '\x' escape. Note that it only reads the two hex |
| * digits; range checks (at most 0x7F or 0xFF) are left to the callers. */ |
| std::pair<long, int> |
| Lexer::parse_partial_hex_escape () |
| { |
| // hex char string (null-terminated) |
| char hexNum[3] = {0, 0, 0}; |
| |
| // first hex char |
| current_char = peek_input (1); |
| int additional_length_offset = 1; |
| |
| if (!is_x_digit (current_char)) |
| { |
| rust_error_at (get_current_location (), |
| "invalid character %<\\x%c%> in \\x sequence", |
| current_char); |
| return std::make_pair (0, 0); |
| } |
| hexNum[0] = current_char; |
| |
| // second hex char |
| skip_input (); |
| current_char = peek_input (1); |
| additional_length_offset++; |
| |
| if (!is_x_digit (current_char)) |
| { |
| rust_error_at (get_current_location (), |
| "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0], |
| current_char); |
| return std::make_pair (0, 1); |
| } |
| skip_input (); |
| hexNum[1] = current_char; |
| |
| long hexLong = std::strtol (hexNum, nullptr, 16); |
| |
| return std::make_pair (hexLong, additional_length_offset); |
| } |
| |
| // Parses the body of a unicode escape. |
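| // For example, the body of \u{1F600} yields Codepoint (0x1F600), and |
| // \u{00_e9} is also accepted since interior underscores are skipped; a |
| // leading underscore, more than 6 or fewer than 1 hex digits, surrogate |
| // values (D800-DFFF) and values above 10FFFF are all rejected. |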
| std::pair<Codepoint, int> |
| Lexer::parse_partial_unicode_escape () |
| { |
| skip_input (); |
| current_char = peek_input (); |
| int additional_length_offset = 0; |
| |
| if (current_char != '{') |
| { |
| rust_error_at (get_current_location (), |
| "unicode escape should start with %<{%>"); |
| /* Skip what should probably have been between brackets. */ |
| while (is_x_digit (current_char) || current_char == '_') |
| { |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| } |
| return std::make_pair (Codepoint (0), additional_length_offset); |
| } |
| |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| |
| if (current_char == '_') |
| { |
| rust_error_at (get_current_location (), |
| "unicode escape cannot start with %<_%>"); |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| // fallthrough and try to parse the rest anyway |
| } |
| |
| // parse unicode escape - 1-6 hex digits |
| std::string num_str; |
| num_str.reserve (6); |
| |
| // loop through to add entire hex number to string |
| while (is_x_digit (current_char) || current_char == '_') |
| { |
| if (current_char == '_') |
| { |
| // don't add _ to number |
| skip_input (); |
| current_char = peek_input (); |
| |
| additional_length_offset++; |
| |
| continue; |
| } |
| |
| additional_length_offset++; |
| |
| // add raw hex numbers |
| num_str += current_char; |
| |
| skip_input (); |
| current_char = peek_input (); |
| } |
| |
| if (current_char == '}') |
| { |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| } |
| else |
| { |
| // actually an error, but allow propagation anyway. Assume that a wrong |
| // bracket, whitespace or single/double quotes indicate wrong termination; |
| // otherwise it is a wrong character, so skip to the actual terminator. |
| if (current_char == '{' || is_whitespace (current_char) |
| || current_char == '\'' || current_char == '"') |
| { |
| rust_error_at (get_current_location (), |
| "expected terminating %<}%> in unicode escape"); |
| return std::make_pair (Codepoint (0), additional_length_offset); |
| } |
| else |
| { |
| rust_error_at (get_current_location (), |
| "invalid character %<%c%> in unicode escape", |
| current_char); |
| while (current_char != '}' && current_char != '{' |
| && !is_whitespace (current_char) && current_char != '\'' |
| && current_char != '"') |
| { |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| } |
| // Consume the actual closing bracket if found |
| if (current_char == '}') |
| { |
| skip_input (); |
| current_char = peek_input (); |
| additional_length_offset++; |
| } |
| return std::make_pair (Codepoint (0), additional_length_offset); |
| } |
| } |
| |
| // ensure 1-6 hex characters |
| if (num_str.length () > 6 || num_str.length () < 1) |
| { |
| rust_error_at (get_current_location (), |
| "unicode escape should be between 1 and 6 hex " |
| "characters; it is %lu", |
| (unsigned long) num_str.length ()); |
| // return false; |
| return std::make_pair (Codepoint (0), additional_length_offset); |
| } |
| |
| unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16); |
| |
| if (hex_num > 0xd7ff && hex_num < 0xe000) |
| { |
| rust_error_at ( |
| get_current_location (), |
| "unicode escape cannot be a surrogate value (D800 to DFFF)"); |
| return std::make_pair (Codepoint (0), additional_length_offset); |
| } |
| |
| if (hex_num > 0x10ffff) |
| { |
| rust_error_at (get_current_location (), |
| "unicode escape cannot be larger than 10FFFF"); |
| return std::make_pair (Codepoint (0), additional_length_offset); |
| } |
| |
| // return true; |
| return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)), |
| additional_length_offset); |
| } |
| |
| // Parses a byte character. |
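| // For example, b'a' produces a byte-char token holding 0x61 and b'\xFF' |
| // one holding 0xFF; unicode escapes are rejected in byte chars and a |
| // missing closing quote is reported as an unclosed byte char. |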
| TokenPtr |
| Lexer::parse_byte_char (Location loc) |
| { |
| skip_input (); |
| current_column++; |
| // make current char the next character |
| current_char = peek_input (); |
| |
| int length = 1; |
| |
| // char to save |
| char byte_char = 0; |
| |
| // detect escapes |
| if (current_char == '\\') |
| { |
| auto escape_length_pair = parse_escape ('\''); |
| byte_char = std::get<0> (escape_length_pair); |
| length += std::get<1> (escape_length_pair); |
| |
| current_char = peek_input (); |
| |
| if (current_char != '\'') |
| { |
| rust_error_at (get_current_location (), "unclosed %<byte char%>"); |
| } |
| |
| skip_input (); |
| current_char = peek_input (); |
| length++; // go to next char |
| } |
| else if (current_char != '\'') |
| { |
| // otherwise, get character from direct input character |
| byte_char = current_char; |
| |
| skip_input (); |
| current_char = peek_input (); |
| length++; |
| |
| if (current_char != '\'') |
| { |
| rust_error_at (get_current_location (), "unclosed %<byte char%>"); |
| } |
| |
| skip_input (); |
| current_char = peek_input (); |
| length++; // go to next char |
| } |
| else |
| { |
| rust_error_at (get_current_location (), |
| "no character inside %<%> for %<byte char%>"); |
| } |
| |
| current_column += length; |
| |
| loc += length - 1; |
| |
| return Token::make_byte_char (loc, byte_char); |
| } |
| |
| // Parses a byte string. |
| TokenPtr |
| Lexer::parse_byte_string (Location loc) |
| { |
| // byte string |
| |
| // skip quote character |
| skip_input (); |
| current_column++; |
| |
| std::string str; |
| str.reserve (16); // some sensible default |
| |
| int length = 1; |
| current_char = peek_input (); |
| |
| while (current_char != '"' && current_char != EOF) |
| { |
| if (current_char == '\\') |
| { |
| auto escape_length_pair = parse_escape ('"'); |
| char output_char = std::get<0> (escape_length_pair); |
| |
| if (output_char == 0 && std::get<2> (escape_length_pair)) |
| length = std::get<1> (escape_length_pair) - 1; |
| else |
| length += std::get<1> (escape_length_pair); |
| |
| if (output_char != 0 || !std::get<2> (escape_length_pair)) |
| str += output_char; |
| |
| continue; |
| } |
| |
| length++; |
| |
| str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| } |
| |
| current_column += length; |
| |
| if (current_char == '"') |
| { |
| current_column++; |
| |
| skip_input (); |
| current_char = peek_input (); |
| } |
| else if (current_char == EOF) |
| { |
| rust_error_at (get_current_location (), "unended byte string literal"); |
| return Token::make (END_OF_FILE, get_current_location ()); |
| } |
| else |
| { |
| gcc_unreachable (); |
| } |
| |
| str.shrink_to_fit (); |
| loc += str.size () - 1; |
| |
| return Token::make_byte_string (loc, std::move (str)); |
| } |
| |
| // Parses a raw byte string. |
| TokenPtr |
| Lexer::parse_raw_byte_string (Location loc) |
| { |
| // raw byte string literals |
| std::string str; |
| str.reserve (16); // some sensible default |
| |
| int length = 1; |
| int hash_count = 0; |
| |
| // get hash count at beginning |
| skip_input (); |
| current_char = peek_input (); |
| length++; |
| while (current_char == '#') |
| { |
| hash_count++; |
| length++; |
| |
| skip_input (); |
| current_char = peek_input (); |
| } |
| |
| if (current_char != '"') |
| { |
| rust_error_at (get_current_location (), |
| "raw byte string has no opening %<\"%>"); |
| } |
| |
| skip_input (); |
| current_char = peek_input (); |
| length++; |
| |
| while (true) |
| { |
| if (current_char == '"') |
| { |
| bool enough_hashes = true; |
| |
| for (int i = 0; i < hash_count; i++) |
| { |
| if (peek_input (i + 1) != '#') |
| { |
| enough_hashes = false; |
| break; |
| } |
| } |
| |
| if (enough_hashes) |
| { |
| // skip enough input and peek enough input |
| skip_input (hash_count); |
| current_char = peek_input (); |
| length += hash_count + 1; |
| break; |
| } |
| } |
| |
| if ((unsigned char) current_char > 127) |
| { |
| rust_error_at (get_current_location (), |
| "character %<%c%> in raw byte string out of range", |
| current_char); |
| current_char = 0; |
| } |
| |
| length++; |
| |
| str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| } |
| |
| current_column += length; |
| |
| loc += length - 1; |
| |
| str.shrink_to_fit (); |
| |
| return Token::make_byte_string (loc, std::move (str)); |
| } |
| |
| // Parses a raw identifier. |
| TokenPtr |
| Lexer::parse_raw_identifier (Location loc) |
| { |
| // raw identifier |
| std::string str; |
| str.reserve (16); // default |
| |
| skip_input (); |
| current_char = peek_input (); |
| |
| current_column += 2; |
| |
| bool first_is_underscore = current_char == '_'; |
| |
| int length = 0; |
| current_char = peek_input (); |
| // loop through entire name |
| while (ISALPHA (current_char) || ISDIGIT (current_char) |
| || current_char == '_') |
| { |
| length++; |
| |
| str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| } |
| |
| current_column += length; |
| |
| // if just a single underscore, not an identifier |
| if (first_is_underscore && length == 1) |
| rust_error_at (get_current_location (), |
| "%<_%> is not a valid raw identifier"); |
| |
| if (str == "crate" || str == "extern" || str == "self" || str == "super" |
| || str == "Self") |
| { |
| rust_error_at (get_current_location (), |
| "%qs is a forbidden raw identifier", str.c_str ()); |
| |
| return nullptr; |
| } |
| else |
| { |
| str.shrink_to_fit (); |
| loc += length - 1; |
| |
| return Token::make_identifier (loc, std::move (str)); |
| } |
| } |
| |
| // skip broken string input (unterminated strings) |
| void |
| Lexer::skip_broken_string_input (int current_char) |
| { |
| while (current_char != '"' && current_char != EOF) |
| { |
| if (current_char == '\n') |
| { |
| current_line++; |
| current_column = 1; |
| } |
| else |
| { |
| current_column++; |
| } |
| skip_input (); |
| current_char = peek_input (); |
| } |
| if (current_char == '"') |
| { |
| current_column++; |
| |
| skip_input (); |
| current_char = peek_input (); |
| } |
| rust_debug ("skipped to %d:%d due to bad quotes", current_line, |
| current_column); |
| } |
| |
| // Parses a string literal (with escape and UTF-8 support). |
| TokenPtr |
| Lexer::parse_string (Location loc) |
| { |
| Codepoint current_char32; |
| |
| std::string str; |
| str.reserve (16); // some sensible default |
| |
| int length = 1; |
| current_char32 = peek_codepoint_input (); |
| |
| // FIXME: This fails if the input ends. How do we check for EOF? |
| while (current_char32.value != '"' && !current_char32.is_eof ()) |
| { |
| if (current_char32.value == '\\') |
| { |
| // parse escape |
| auto utf8_escape_pair = parse_utf8_escape (); |
| current_char32 = std::get<0> (utf8_escape_pair); |
| |
| if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair)) |
| length = std::get<1> (utf8_escape_pair) - 1; |
| else |
| length += std::get<1> (utf8_escape_pair); |
| |
| if (current_char32 != Codepoint (0) |
| || !std::get<2> (utf8_escape_pair)) |
| str += current_char32; |
| |
| // required as parsing utf8 escape only changes current_char |
| current_char32 = peek_codepoint_input (); |
| |
| continue; |
| } |
| |
| length += get_input_codepoint_length (); |
| |
| str += current_char32; |
| skip_codepoint_input (); |
| current_char32 = peek_codepoint_input (); |
| } |
| |
| current_column += length; |
| |
| if (current_char32.value == '"') |
| { |
| current_column++; |
| |
| skip_input (); |
| current_char = peek_input (); |
| } |
| else if (current_char32.is_eof ()) |
| { |
| rust_error_at (get_current_location (), "unended string literal"); |
| return Token::make (END_OF_FILE, get_current_location ()); |
| } |
| else |
| { |
| gcc_unreachable (); |
| } |
| |
| str.shrink_to_fit (); |
| loc += length - 1; |
| |
| return Token::make_string (loc, std::move (str)); |
| } |
| |
| // Parses an identifier or keyword. |
| TokenPtr |
| Lexer::parse_identifier_or_keyword (Location loc) |
| { |
| std::string str; |
| str.reserve (16); // default |
| str += current_char; |
| |
| bool first_is_underscore = current_char == '_'; |
| |
| int length = 1; |
| current_char = peek_input (); |
| // loop through entire name |
| while (ISALPHA (current_char) || ISDIGIT (current_char) |
| || current_char == '_') |
| { |
| length++; |
| |
| str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| } |
| |
| current_column += length; |
| |
| // if just a single underscore, not an identifier |
| if (first_is_underscore && length == 1) |
| return Token::make (UNDERSCORE, loc); |
| |
| str.shrink_to_fit (); |
| |
| loc += length - 1; |
| |
| TokenId keyword = classify_keyword (str); |
| if (keyword == IDENTIFIER) |
| return Token::make_identifier (loc, std::move (str)); |
| else |
| return Token::make (keyword, loc); |
| } |
| |
| // Returns a raw string token if one follows; otherwise returns null. |
| TokenPtr |
| Lexer::maybe_parse_raw_string (Location loc) |
| { |
| int peek_index = 0; |
| while (peek_input (peek_index) == '#') |
| peek_index++; |
| |
| if (peek_input (peek_index) == '"') |
| return parse_raw_string (loc, peek_index); |
| else |
| return nullptr; |
| } |
| |
| // Returns a raw string token. |
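| // For example, r"no escapes \n here" keeps the backslash and 'n' as |
| // literal characters, and r#"she said "hi""# uses one hash on each side so |
| // that an interior '"' does not terminate the literal; the closing quote |
| // must be followed by initial_hash_count hashes. |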
| TokenPtr |
| Lexer::parse_raw_string (Location loc, int initial_hash_count) |
| { |
| // raw string literals |
| std::string str; |
| str.reserve (16); // some sensible default |
| |
| int length = 1 + initial_hash_count; |
| |
| if (initial_hash_count > 0) |
| skip_input (initial_hash_count - 1); |
| |
| current_char = peek_input (); |
| |
| if (current_char != '"') |
| rust_error_at (get_current_location (), "raw string has no opening %<\"%>"); |
| |
| length++; |
| skip_input (); |
| Codepoint current_char32 = peek_codepoint_input (); |
| |
| while (!current_char32.is_eof ()) |
| { |
| if (current_char32.value == '"') |
| { |
| bool enough_hashes = true; |
| |
| for (int i = 0; i < initial_hash_count; i++) |
| { |
| if (peek_input (i + 1) != '#') |
| { |
| enough_hashes = false; |
| break; |
| } |
| } |
| |
| if (enough_hashes) |
| { |
| // skip enough input and peek enough input |
| skip_input (initial_hash_count); |
| current_char = peek_input (); |
| length += initial_hash_count + 1; |
| break; |
| } |
| } |
| |
| length++; |
| |
| str += current_char32; |
| skip_codepoint_input (); |
| current_char32 = peek_codepoint_input (); |
| } |
| |
| current_column += length; |
| |
| loc += length - 1; |
| |
| str.shrink_to_fit (); |
| |
| return Token::make_string (loc, std::move (str)); |
| } |
| |
| template <typename IsDigitFunc> |
| TokenPtr |
| Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func, |
| std::string existent_str, int base) |
| { |
| int length = 1; |
| |
| skip_input (); |
| current_char = peek_input (); |
| |
| length++; |
| |
| // loop through to add entire number to string |
| while (is_digit_func (current_char) || current_char == '_') |
| { |
| if (current_char == '_') |
| { |
| // don't add _ to number |
| skip_input (); |
| current_char = peek_input (); |
| |
| length++; |
| |
| continue; |
| } |
| |
| length++; |
| |
| // add raw numbers |
| existent_str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| } |
| |
| // convert value to decimal representation |
| long dec_num = std::strtol (existent_str.c_str (), nullptr, base); |
| |
| existent_str = std::to_string (dec_num); |
| |
| // parse in type suffix if it exists |
| auto type_suffix_pair = parse_in_type_suffix (); |
| PrimitiveCoreType type_hint = type_suffix_pair.first; |
| length += type_suffix_pair.second; |
| |
| current_column += length; |
| |
| if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) |
| { |
| rust_error_at (get_current_location (), |
| "invalid type suffix %qs for integer (%s) literal", |
| get_type_hint_string (type_hint), |
| base == 16 |
| ? "hex" |
| : (base == 8 ? "octal" |
| : (base == 2 ? "binary" |
| : "<insert unknown base>"))); |
| return nullptr; |
| } |
| |
| loc += length - 1; |
| |
| return Token::make_int (loc, std::move (existent_str), type_hint); |
| } |
| |
| // Parses a hex, binary or octal int literal. |
| TokenPtr |
| Lexer::parse_non_decimal_int_literals (Location loc) |
| { |
| std::string str; |
| str.reserve (16); // some sensible default |
| str += current_char; |
| |
| current_char = peek_input (); |
| |
| if (current_char == 'x') |
| { |
| // hex (integer only) |
| return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16); |
| } |
| else if (current_char == 'o') |
| { |
| // octal (integer only) |
| return parse_non_decimal_int_literal (loc, is_octal_digit, |
| std::move (str), 8); |
| } |
| else if (current_char == 'b') |
| { |
| // binary (integer only) |
| return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str), |
| 2); |
| } |
| else |
| { |
| return nullptr; |
| } |
| } |
| |
| // Parses a decimal-based int literal or float literal. |
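| // For example, 123 becomes an integer token with the CORETYPE_PURE_DECIMAL |
| // hint, 0755 stays CORETYPE_UNKNOWN because of the leading zero, 1.5f32 and |
| // 2e10 become float tokens, and a trailing dot as in 3. (with no '.', '_' |
| // or letter after the dot) is stored as the float string "3.0". |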
| TokenPtr |
| Lexer::parse_decimal_int_or_float (Location loc) |
| { |
| std::string str; |
| str.reserve (16); // some sensible default |
| str += current_char; |
| |
| int length = 1; |
| bool first_zero = current_char == '0'; |
| |
| current_char = peek_input (); |
| |
| // parse initial decimal integer (or first integer part of float) literal |
| auto initial_decimal = parse_in_decimal (); |
| str += std::get<0> (initial_decimal); |
| length += std::get<1> (initial_decimal); |
| |
| // detect float literal |
| if (current_char == '.' && is_float_digit (peek_input (1))) |
| { |
| // float with a '.', parse another decimal into it |
| |
| // add . to str |
| str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| length++; |
| |
| // parse another decimal number for float |
| auto second_decimal = parse_in_decimal (); |
| str += std::get<0> (second_decimal); |
| length += std::get<1> (second_decimal); |
| |
| // parse in exponent part if it exists |
| auto exponent_pair = parse_in_exponent_part (); |
| str += exponent_pair.first; |
| length += exponent_pair.second; |
| |
| // parse in type suffix if it exists |
| auto type_suffix_pair = parse_in_type_suffix (); |
| PrimitiveCoreType type_hint = type_suffix_pair.first; |
| length += type_suffix_pair.second; |
| |
| if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64 |
| && type_hint != CORETYPE_UNKNOWN) |
| { |
| rust_error_at (get_current_location (), |
| "invalid type suffix %qs for floating-point literal", |
| get_type_hint_string (type_hint)); |
| // ignore invalid type suffix as everything else seems fine |
| type_hint = CORETYPE_UNKNOWN; |
| } |
| |
| current_column += length; |
| |
| loc += length - 1; |
| |
| str.shrink_to_fit (); |
| return Token::make_float (loc, std::move (str), type_hint); |
| } |
| else if (current_char == '.' && check_valid_float_dot_end (peek_input (1))) |
| { |
| // float that is just an integer with a terminating '.' character |
| |
| // add . to str |
| str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| length++; |
| |
| // add a '0' after the . to prevent ambiguity |
| str += '0'; |
| |
| // a type suffix is not permitted after a bare trailing '.' |
| |
| current_column += length; |
| |
| loc += length - 1; |
| |
| str.shrink_to_fit (); |
| return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN); |
| } |
| else if (current_char == 'E' || current_char == 'e') |
| { |
| // exponent float with no '.' character |
| |
| // parse exponent part |
| auto exponent_pair = parse_in_exponent_part (); |
| str += exponent_pair.first; |
| length += exponent_pair.second; |
| |
| // parse in type suffix if it exists |
| auto type_suffix_pair = parse_in_type_suffix (); |
| PrimitiveCoreType type_hint = type_suffix_pair.first; |
| length += type_suffix_pair.second; |
| |
| if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64 |
| && type_hint != CORETYPE_UNKNOWN) |
| { |
| rust_error_at (get_current_location (), |
| "invalid type suffix %qs for floating-point literal", |
| get_type_hint_string (type_hint)); |
| // ignore invalid type suffix as everything else seems fine |
| type_hint = CORETYPE_UNKNOWN; |
| } |
| |
| current_column += length; |
| |
| loc += length - 1; |
| |
| str.shrink_to_fit (); |
| return Token::make_float (loc, std::move (str), type_hint); |
| } |
| else |
| { |
| // is an integer |
| |
| // parse in type suffix if it exists |
| auto type_suffix_pair = parse_in_type_suffix (); |
| PrimitiveCoreType type_hint = type_suffix_pair.first; |
| /* A "real" pure decimal doesn't have a suffix and no zero prefix. */ |
| if (type_hint == CORETYPE_UNKNOWN) |
| { |
| bool pure_decimal = std::get<2> (initial_decimal); |
| if (pure_decimal && (!first_zero || str.size () == 1)) |
| type_hint = CORETYPE_PURE_DECIMAL; |
| } |
| length += type_suffix_pair.second; |
| |
| current_column += length; |
| |
| loc += length - 1; |
| |
| str.shrink_to_fit (); |
| return Token::make_int (loc, std::move (str), type_hint); |
| } |
| } |
| |
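| // Parses either a character literal or a lifetime, both of which are |
| // introduced by a single quote: e.g. 'a' yields a char literal token, while |
| // 'static (no closing quote) yields the lifetime "static". |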
| TokenPtr |
| Lexer::parse_char_or_lifetime (Location loc) |
| { |
| Codepoint current_char32; |
| |
| int length = 1; |
| |
| current_char32 = peek_codepoint_input (); |
| if (current_char32.is_eof ()) |
| return nullptr; |
| |
| // parse escaped char literal |
| if (current_char32.value == '\\') |
| { |
| // parse escape |
| auto utf8_escape_pair = parse_utf8_escape (); |
| current_char32 = std::get<0> (utf8_escape_pair); |
| length += std::get<1> (utf8_escape_pair); |
| |
| if (peek_codepoint_input ().value != '\'') |
| { |
| rust_error_at (get_current_location (), "unended character literal"); |
| } |
| else |
| { |
| skip_codepoint_input (); |
| current_char = peek_input (); |
| length++; |
| } |
| |
| current_column += length; |
| |
| loc += length - 1; |
| |
| return Token::make_char (loc, current_char32); |
| } |
| else |
| { |
| skip_codepoint_input (); |
| |
| if (peek_codepoint_input ().value == '\'') |
| { |
| // parse non-escaped char literal |
| |
| // skip the ' character |
| skip_input (); |
| current_char = peek_input (); |
| |
| // TODO fix due to different widths of utf-8 chars? |
| current_column += 3; |
| |
| loc += 2; |
| |
| return Token::make_char (loc, current_char32); |
| } |
| else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value) |
| || current_char32.value == '_') |
| { |
| // parse lifetime name |
| std::string str; |
| str += current_char32; |
| length++; |
| |
| current_char = peek_input (); |
| while (ISDIGIT (current_char) || ISALPHA (current_char) |
| || current_char == '_') |
| { |
| str += current_char; |
| skip_input (); |
| current_char = peek_input (); |
| length++; |
| } |
| |
| current_column += length; |
| |
| loc += length - 1; |
| |
| str.shrink_to_fit (); |
| return Token::make_lifetime (loc, std::move (str)); |
| } |
| else |
| { |
| rust_error_at ( |
| get_current_location (), |
| "expected %' after character constant in character literal"); |
| return nullptr; |
| } |
| } |
| } |
| |
| // Returns the length in bytes of the UTF-8-encoded codepoint at the current |
| // position, or 0 on EOF or invalid input. |
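| // The length is determined by the lead byte: 0xxxxxxx is 1 byte (ASCII), |
| // 110xxxxx is 2 bytes, 1110xxxx is 3 bytes and 11110xxx is 4 bytes; every |
| // following byte must be a 10xxxxxx continuation byte. |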
| int |
| Lexer::get_input_codepoint_length () |
| { |
| uint8_t input = peek_input (); |
| |
| if ((int8_t) input == EOF) |
| return 0; |
| |
| if (input < 128) |
| { |
| // ascii -- 1 byte |
| // return input; |
| |
| return 1; |
| } |
| else if ((input & 0xC0) == 0x80) |
| { |
| // invalid (continuation; can't be first char) |
| // return 0xFFFE; |
| |
| return 0; |
| } |
| else if ((input & 0xE0) == 0xC0) |
| { |
| // 2 bytes |
| uint8_t input2 = peek_input (1); |
| if ((input2 & 0xC0) != 0x80) |
| return 0; |
| // return 0xFFFE; |
| |
| return 2; |
| } |
| else if ((input & 0xF0) == 0xE0) |
| { |
| // 3 bytes |
| uint8_t input2 = peek_input (1); |
| if ((input2 & 0xC0) != 0x80) |
| return 0; |
| // return 0xFFFE; |
| |
| uint8_t input3 = peek_input (2); |
| if ((input3 & 0xC0) != 0x80) |
| return 0; |
| // return 0xFFFE; |
| |
| return 3; |
| } |
| else if ((input & 0xF8) == 0xF0) |
| { |
| // 4 bytes |
| uint8_t input2 = peek_input (1); |
| if ((input2 & 0xC0) != 0x80) |
| return 0; |
| // return 0xFFFE; |
| |
| uint8_t input3 = peek_input (2); |
| if ((input3 & 0xC0) != 0x80) |
| return 0; |
| // return 0xFFFE; |
| |
| uint8_t input4 = peek_input (3); |
| if ((input4 & 0xC0) != 0x80) |
| return 0; |
| // return 0xFFFE; |
| |
| return 4; |
| } |
| else |
| { |
| rust_error_at (get_current_location (), |
| "invalid UTF-8 [FIRST] (too long)"); |
| return 0; |
| } |
| } |
| |
| // Returns the codepoint at the current position, decoded from UTF-8, without |
| // advancing the input. |
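| // Worked example: U+20AC (the euro sign) is encoded as 0xE2 0x82 0xAC, so the |
| // three-byte branch computes ((0xE2 & 0x0F) << 12) | ((0x82 & 0x3F) << 6) |
| // | ((0xAC & 0x3F) << 0) = 0x2000 | 0x80 | 0x2C = 0x20AC. |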
| Codepoint |
| Lexer::peek_codepoint_input () |
| { |
| uint8_t input = peek_input (); |
| |
| if ((int8_t) input == EOF) |
| return Codepoint::eof (); |
| |
| if (input < 128) |
| { |
| // ascii -- 1 byte |
| return {input}; |
| } |
| else if ((input & 0xC0) == 0x80) |
| { |
| // invalid (continuation; can't be first char) |
| return {0xFFFE}; |
| } |
| else if ((input & 0xE0) == 0xC0) |
| { |
| // 2 bytes |
| uint8_t input2 = peek_input (1); |
| if ((input2 & 0xC0) != 0x80) |
| return {0xFFFE}; |
| |
| uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); |
| return {output}; |
| } |
| else if ((input & 0xF0) == 0xE0) |
| { |
| // 3 bytes |
| uint8_t input2 = peek_input (1); |
| if ((input2 & 0xC0) != 0x80) |
| return {0xFFFE}; |
| |
| uint8_t input3 = peek_input (2); |
| if ((input3 & 0xC0) != 0x80) |
| return {0xFFFE}; |
| |
| uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) |
| | ((input3 & 0x3F) << 0); |
| return {output}; |
| } |
| else if ((input & 0xF8) == 0xF0) |
| { |
| // 4 bytes |
| uint8_t input2 = peek_input (1); |
| if ((input2 & 0xC0) != 0x80) |
| return {0xFFFE}; |
| |
| uint8_t input3 = peek_input (2); |
| if ((input3 & 0xC0) != 0x80) |
| return {0xFFFE}; |
| |
| uint8_t input4 = peek_input (3); |
| if ((input4 & 0xC0) != 0x80) |
| return {0xFFFE}; |
| |
| uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) |
| | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); |
| return {output}; |
| } |
| else |
| { |
| rust_error_at (get_current_location (), |
| "invalid UTF-8 [SECND] (too long)"); |
| return {0xFFFE}; |
| } |
| } |
| |
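| // Skips all bytes of the UTF-8-encoded codepoint at the current position. |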
| void |
| Lexer::skip_codepoint_input () |
| { |
| int toSkip = get_input_codepoint_length (); |
| gcc_assert (toSkip >= 1); |
| |
| skip_input (toSkip - 1); |
| } |
| |
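| // As get_input_codepoint_length, but measures the codepoint starting |
| // n_start_offset bytes ahead of the current position; used by |
| // test_peek_codepoint_input below. |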
| int |
| Lexer::test_get_input_codepoint_n_length (int n_start_offset) |
| { |
| uint8_t input = peek_input (n_start_offset); |
| |
| if (input < 128) |
| { |
| // ascii -- 1 byte |
| // return input; |
| return 1; |
| } |
| else if ((input & 0xC0) == 0x80) |
| { |
| // invalid (continuation; can't be first char) |
| // return 0xFFFE; |
| return 0; |
| } |
| else if ((input & 0xE0) == 0xC0) |
| { |
| // 2 bytes |
| uint8_t input2 = peek_input (n_start_offset + 1); |
| if ((input2 & 0xC0) != 0x80) |
| // return 0xFFFE; |
| return 0; |
| |
| return 2; |
| } |
| else if ((input & 0xF0) == 0xE0) |
| { |
| // 3 bytes |
| uint8_t input2 = peek_input (n_start_offset + 1); |
| if ((input2 & 0xC0) != 0x80) |
| // return 0xFFFE; |
| return 0; |
| |
| uint8_t input3 = peek_input (n_start_offset + 2); |
| if ((input3 & 0xC0) != 0x80) |
| // return 0xFFFE; |
| return 0; |
| |
| return 3; |
| } |
| else if ((input & 0xF8) == 0xF0) |
| { |
| // 4 bytes |
| uint8_t input2 = peek_input (n_start_offset + 1); |
| if ((input2 & 0xC0) != 0x80) |
| // return 0xFFFE; |
| return 0; |
| |
| uint8_t input3 = peek_input (n_start_offset + 2); |
| if ((input3 & 0xC0) != 0x80) |
| // return 0xFFFE; |
| return 0; |
| |
| uint8_t input4 = peek_input (n_start_offset + 3); |
| if ((input4 & 0xC0) != 0x80) |
| // return 0xFFFE; |
| return 0; |
| |
| return 4; |
| } |
| else |
| { |
| rust_error_at (get_current_location (), |
| "invalid UTF-8 [THIRD] (too long)"); |
| return 0; |
| } |
| } |
| |
| // Peeks the codepoint n codepoints ahead of the current position. Currently |
| // unimplemented, so avoid using it. |
| Codepoint |
| Lexer::test_peek_codepoint_input (int n) |
| { |
| int totalOffset = 0; |
| |
| // accumulate the byte length of each of the n codepoints to find the byte |
| // offset of the target codepoint |
| for (int i = 0; i < n; i++) |
| { |
| totalOffset += test_get_input_codepoint_n_length (totalOffset); |
| } |
| // Note that this lookup is O(n) in the number of codepoints peeked past, |
| // unlike the O(1) byte-level peeks. |
| |
| // TODO: implement if still needed |
| |
| // not implemented, so abort if this is ever reached |
| gcc_unreachable (); |
| return {0}; |
| } |
| |
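| // The parser uses this, for example, to split a ">>" token into two ">" |
| // tokens when it closes nested generic argument lists. A sketch of a typical |
| // call site (the token ids shown are illustrative, not necessarily the exact |
| // enumerator names): |
| // |
| //   lexer.split_current_token (GREATER_THAN /* ">" */, GREATER_THAN /* ">" */); |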
| void |
| Lexer::split_current_token (TokenId new_left, TokenId new_right) |
| { |
| /* TODO: assert that this TokenId is a "simple token" like punctuation and not |
| * like "IDENTIFIER"? */ |
| Location current_loc = peek_token ()->get_locus (); |
| TokenPtr new_left_tok = Token::make (new_left, current_loc); |
| TokenPtr new_right_tok = Token::make (new_right, current_loc + 1); |
| |
| token_queue.replace_current_value (std::move (new_left_tok)); |
| token_queue.insert (1, std::move (new_right_tok)); |
| } |
| |
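| // Registers the start of a new line with the line map, when one is in use. |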
| void |
| Lexer::start_line (int current_line, int current_column) |
| { |
| if (line_map) |
| line_map->start_line (current_line, current_column); |
| } |
| |
| } // namespace Rust |