| |
| /* Compiler implementation of the D programming language |
| * Copyright (C) 1999-2019 by The D Language Foundation, All Rights Reserved |
| * written by Walter Bright |
| * http://www.digitalmars.com |
| * Distributed under the Boost Software License, Version 1.0. |
| * http://www.boost.org/LICENSE_1_0.txt |
| * https://github.com/D-Programming-Language/dmd/blob/master/src/lexer.c |
| */ |
| |
| /* Lexical Analyzer */ |
| |
| #include "root/dsystem.h" // for time() and ctime() |
| #include "root/rmem.h" |
| |
| #include "mars.h" |
| #include "lexer.h" |
| #include "utf.h" |
| #include "identifier.h" |
| #include "id.h" |
| |
| extern int HtmlNamedEntity(const utf8_t *p, size_t length); |
| |
| #define LS 0x2028 // UTF line separator |
| #define PS 0x2029 // UTF paragraph separator |
| |
| /******************************************** |
| * Do our own char maps |
| */ |
| |
| static unsigned char cmtable[256]; |
| |
| const int CMoctal = 0x1; |
| const int CMhex = 0x2; |
| const int CMidchar = 0x4; |
| |
| inline bool isoctal (utf8_t c) { return (cmtable[c] & CMoctal) != 0; } |
| inline bool ishex (utf8_t c) { return (cmtable[c] & CMhex) != 0; } |
| inline bool isidchar(utf8_t c) { return (cmtable[c] & CMidchar) != 0; } |
| |
| struct CMTableInitializer |
| { |
| CMTableInitializer(); |
| }; |
| |
| static CMTableInitializer cmtableinitializer; |
| |
| CMTableInitializer::CMTableInitializer() |
| { |
| for (unsigned c = 0; c < 256; c++) |
| { |
| if ('0' <= c && c <= '7') |
| cmtable[c] |= CMoctal; |
| if (isxdigit(c)) |
| cmtable[c] |= CMhex; |
| if (isalnum(c) || c == '_') |
| cmtable[c] |= CMidchar; |
| } |
| } |
| |
| /*************************** Lexer ********************************************/ |
| |
| OutBuffer Lexer::stringbuffer; |
| |
| Lexer::Lexer(const char *filename, |
| const utf8_t *base, size_t begoffset, size_t endoffset, |
| bool doDocComment, bool commentToken) |
| { |
| scanloc = Loc(filename, 1, 1); |
| //printf("Lexer::Lexer(%p,%d)\n",base,length); |
| //printf("lexer.filename = %s\n", filename); |
| this->token = Token(); |
| this->token.ptr = NULL; |
| this->token.value = TOKreserved; |
| this->token.blockComment = NULL; |
| this->token.lineComment = NULL; |
| this->base = base; |
| this->end = base + endoffset; |
| p = base + begoffset; |
| line = p; |
| this->doDocComment = doDocComment; |
| this->anyToken = 0; |
| this->commentToken = commentToken; |
| this->errors = false; |
| //initKeywords(); |
| |
| /* If first line starts with '#!', ignore the line |
| */ |
| |
| if (p[0] == '#' && p[1] =='!') |
| { |
| p += 2; |
| while (1) |
| { |
| utf8_t c = *p++; |
| switch (c) |
| { |
| case 0: |
| case 0x1A: |
| p--; |
| /* fall through */ |
| |
| case '\n': |
| break; |
| |
| default: |
| continue; |
| } |
| break; |
| } |
| endOfLine(); |
| } |
| } |
| |
| |
| void Lexer::endOfLine() |
| { |
| scanloc.linnum++; |
| line = p; |
| } |
| |
| |
| void Lexer::error(const char *format, ...) |
| { |
| va_list ap; |
| va_start(ap, format); |
| ::verror(token.loc, format, ap); |
| va_end(ap); |
| errors = true; |
| } |
| |
| void Lexer::error(Loc loc, const char *format, ...) |
| { |
| va_list ap; |
| va_start(ap, format); |
| ::verror(loc, format, ap); |
| va_end(ap); |
| errors = true; |
| } |
| |
| void Lexer::deprecation(const char *format, ...) |
| { |
| va_list ap; |
| va_start(ap, format); |
| ::vdeprecation(token.loc, format, ap); |
| va_end(ap); |
| if (global.params.useDeprecated == DIAGNOSTICerror) |
| errors = true; |
| } |
| |
| TOK Lexer::nextToken() |
| { |
| if (token.next) |
| { |
| Token *t = token.next; |
| memcpy(&token,t,sizeof(Token)); |
| t->free(); |
| } |
| else |
| { |
| scan(&token); |
| } |
| //token.print(); |
| return token.value; |
| } |
| |
| Token *Lexer::peek(Token *ct) |
| { |
| Token *t; |
| if (ct->next) |
| t = ct->next; |
| else |
| { |
| t = Token::alloc(); |
| scan(t); |
| ct->next = t; |
| } |
| return t; |
| } |
| |
| /*********************** |
| * Look ahead at next token's value. |
| */ |
| |
| TOK Lexer::peekNext() |
| { |
| return peek(&token)->value; |
| } |
| |
| /*********************** |
| * Look 2 tokens ahead at value. |
| */ |
| |
| TOK Lexer::peekNext2() |
| { |
| Token *t = peek(&token); |
| return peek(t)->value; |
| } |
| |
| /********************************* |
| * tk is on the opening (. |
| * Look ahead and return token that is past the closing ). |
| */ |
| |
| Token *Lexer::peekPastParen(Token *tk) |
| { |
| //printf("peekPastParen()\n"); |
| int parens = 1; |
| int curlynest = 0; |
| while (1) |
| { |
| tk = peek(tk); |
| //tk->print(); |
| switch (tk->value) |
| { |
| case TOKlparen: |
| parens++; |
| continue; |
| |
| case TOKrparen: |
| --parens; |
| if (parens) |
| continue; |
| tk = peek(tk); |
| break; |
| |
| case TOKlcurly: |
| curlynest++; |
| continue; |
| |
| case TOKrcurly: |
| if (--curlynest >= 0) |
| continue; |
| break; |
| |
| case TOKsemicolon: |
| if (curlynest) |
| continue; |
| break; |
| |
| case TOKeof: |
| break; |
| |
| default: |
| continue; |
| } |
| return tk; |
| } |
| } |
| |
| /**************************** |
| * Turn next token in buffer into a token. |
| */ |
| |
| void Lexer::scan(Token *t) |
| { |
| unsigned lastLine = scanloc.linnum; |
| Loc startLoc; |
| |
| t->blockComment = NULL; |
| t->lineComment = NULL; |
| while (1) |
| { |
| t->ptr = p; |
| //printf("p = %p, *p = '%c'\n",p,*p); |
| t->loc = loc(); |
| switch (*p) |
| { |
| case 0: |
| case 0x1A: |
| t->value = TOKeof; // end of file |
| return; |
| |
| case ' ': |
| case '\t': |
| case '\v': |
| case '\f': |
| p++; |
| continue; // skip white space |
| |
| case '\r': |
| p++; |
| if (*p != '\n') // if CR stands by itself |
| endOfLine(); |
| continue; // skip white space |
| |
| case '\n': |
| p++; |
| endOfLine(); |
| continue; // skip white space |
| |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| t->value = number(t); |
| return; |
| |
| case '\'': |
| t->value = charConstant(t); |
| return; |
| |
| case 'r': |
| if (p[1] != '"') |
| goto case_ident; |
| p++; |
| /* fall through */ |
| case '`': |
| t->value = wysiwygStringConstant(t, *p); |
| return; |
| |
| case 'x': |
| if (p[1] != '"') |
| goto case_ident; |
| p++; |
| t->value = hexStringConstant(t); |
| return; |
| |
| case 'q': |
| if (p[1] == '"') |
| { |
| p++; |
| t->value = delimitedStringConstant(t); |
| return; |
| } |
| else if (p[1] == '{') |
| { |
| p++; |
| t->value = tokenStringConstant(t); |
| return; |
| } |
| else |
| goto case_ident; |
| |
| case '"': |
| t->value = escapeStringConstant(t); |
| return; |
| |
| case 'a': case 'b': case 'c': case 'd': case 'e': |
| case 'f': case 'g': case 'h': case 'i': case 'j': |
| case 'k': case 'l': case 'm': case 'n': case 'o': |
| case 'p': /*case 'q': case 'r':*/ case 's': case 't': |
| case 'u': case 'v': case 'w': /*case 'x':*/ case 'y': |
| case 'z': |
| case 'A': case 'B': case 'C': case 'D': case 'E': |
| case 'F': case 'G': case 'H': case 'I': case 'J': |
| case 'K': case 'L': case 'M': case 'N': case 'O': |
| case 'P': case 'Q': case 'R': case 'S': case 'T': |
| case 'U': case 'V': case 'W': case 'X': case 'Y': |
| case 'Z': |
| case '_': |
| case_ident: |
| { utf8_t c; |
| |
| while (1) |
| { |
| c = *++p; |
| if (isidchar(c)) |
| continue; |
| else if (c & 0x80) |
| { const utf8_t *s = p; |
| unsigned u = decodeUTF(); |
| if (isUniAlpha(u)) |
| continue; |
| error("char 0x%04x not allowed in identifier", u); |
| p = s; |
| } |
| break; |
| } |
| |
| Identifier *id = Identifier::idPool((const char *)t->ptr, p - t->ptr); |
| t->ident = id; |
| t->value = (TOK) id->getValue(); |
| anyToken = 1; |
| if (*t->ptr == '_') // if special identifier token |
| { |
| static bool initdone = false; |
| static char date[11+1]; |
| static char time[8+1]; |
| static char timestamp[24+1]; |
| |
| if (!initdone) // lazy evaluation |
| { |
| initdone = true; |
| time_t ct; |
| ::time(&ct); |
| char *p = ctime(&ct); |
| assert(p); |
| sprintf(&date[0], "%.6s %.4s", p + 4, p + 20); |
| sprintf(&time[0], "%.8s", p + 11); |
| sprintf(×tamp[0], "%.24s", p); |
| } |
| |
| if (id == Id::DATE) |
| { |
| t->ustring = (utf8_t *)date; |
| goto Lstr; |
| } |
| else if (id == Id::TIME) |
| { |
| t->ustring = (utf8_t *)time; |
| goto Lstr; |
| } |
| else if (id == Id::VENDOR) |
| { |
| t->ustring = (utf8_t *)const_cast<char *>(global.vendor); |
| goto Lstr; |
| } |
| else if (id == Id::TIMESTAMP) |
| { |
| t->ustring = (utf8_t *)timestamp; |
| Lstr: |
| t->value = TOKstring; |
| t->postfix = 0; |
| t->len = (unsigned)strlen((char *)t->ustring); |
| } |
| else if (id == Id::VERSIONX) |
| { unsigned major = 0; |
| unsigned minor = 0; |
| bool point = false; |
| |
| for (const char *p = global.version + 1; 1; p++) |
| { |
| c = *p; |
| if (isdigit((utf8_t)c)) |
| minor = minor * 10 + c - '0'; |
| else if (c == '.') |
| { |
| if (point) |
| break; // ignore everything after second '.' |
| point = true; |
| major = minor; |
| minor = 0; |
| } |
| else |
| break; |
| } |
| t->value = TOKint64v; |
| t->uns64value = major * 1000 + minor; |
| } |
| else if (id == Id::EOFX) |
| { |
| t->value = TOKeof; |
| // Advance scanner to end of file |
| while (!(*p == 0 || *p == 0x1A)) |
| p++; |
| } |
| } |
| //printf("t->value = %d\n",t->value); |
| return; |
| } |
| |
| case '/': |
| p++; |
| switch (*p) |
| { |
| case '=': |
| p++; |
| t->value = TOKdivass; |
| return; |
| |
| case '*': |
| p++; |
| startLoc = loc(); |
| while (1) |
| { |
| while (1) |
| { utf8_t c = *p; |
| switch (c) |
| { |
| case '/': |
| break; |
| |
| case '\n': |
| endOfLine(); |
| p++; |
| continue; |
| |
| case '\r': |
| p++; |
| if (*p != '\n') |
| endOfLine(); |
| continue; |
| |
| case 0: |
| case 0x1A: |
| error("unterminated /* */ comment"); |
| p = end; |
| t->loc = loc(); |
| t->value = TOKeof; |
| return; |
| |
| default: |
| if (c & 0x80) |
| { unsigned u = decodeUTF(); |
| if (u == PS || u == LS) |
| endOfLine(); |
| } |
| p++; |
| continue; |
| } |
| break; |
| } |
| p++; |
| if (p[-2] == '*' && p - 3 != t->ptr) |
| break; |
| } |
| if (commentToken) |
| { |
| t->loc = startLoc; |
| t->value = TOKcomment; |
| return; |
| } |
| else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr) |
| { // if /** but not /**/ |
| getDocComment(t, lastLine == startLoc.linnum); |
| } |
| continue; |
| |
| case '/': // do // style comments |
| startLoc = loc(); |
| while (1) |
| { utf8_t c = *++p; |
| switch (c) |
| { |
| case '\n': |
| break; |
| |
| case '\r': |
| if (p[1] == '\n') |
| p++; |
| break; |
| |
| case 0: |
| case 0x1A: |
| if (commentToken) |
| { |
| p = end; |
| t->loc = startLoc; |
| t->value = TOKcomment; |
| return; |
| } |
| if (doDocComment && t->ptr[2] == '/') |
| getDocComment(t, lastLine == startLoc.linnum); |
| p = end; |
| t->loc = loc(); |
| t->value = TOKeof; |
| return; |
| |
| default: |
| if (c & 0x80) |
| { unsigned u = decodeUTF(); |
| if (u == PS || u == LS) |
| break; |
| } |
| continue; |
| } |
| break; |
| } |
| |
| if (commentToken) |
| { |
| p++; |
| endOfLine(); |
| t->loc = startLoc; |
| t->value = TOKcomment; |
| return; |
| } |
| if (doDocComment && t->ptr[2] == '/') |
| getDocComment(t, lastLine == startLoc.linnum); |
| |
| p++; |
| endOfLine(); |
| continue; |
| |
| case '+': |
| { int nest; |
| |
| startLoc = loc(); |
| p++; |
| nest = 1; |
| while (1) |
| { utf8_t c = *p; |
| switch (c) |
| { |
| case '/': |
| p++; |
| if (*p == '+') |
| { |
| p++; |
| nest++; |
| } |
| continue; |
| |
| case '+': |
| p++; |
| if (*p == '/') |
| { |
| p++; |
| if (--nest == 0) |
| break; |
| } |
| continue; |
| |
| case '\r': |
| p++; |
| if (*p != '\n') |
| endOfLine(); |
| continue; |
| |
| case '\n': |
| endOfLine(); |
| p++; |
| continue; |
| |
| case 0: |
| case 0x1A: |
| error("unterminated /+ +/ comment"); |
| p = end; |
| t->loc = loc(); |
| t->value = TOKeof; |
| return; |
| |
| default: |
| if (c & 0x80) |
| { unsigned u = decodeUTF(); |
| if (u == PS || u == LS) |
| endOfLine(); |
| } |
| p++; |
| continue; |
| } |
| break; |
| } |
| if (commentToken) |
| { |
| t->loc = startLoc; |
| t->value = TOKcomment; |
| return; |
| } |
| if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr) |
| { // if /++ but not /++/ |
| getDocComment(t, lastLine == startLoc.linnum); |
| } |
| continue; |
| } |
| default: |
| break; |
| } |
| t->value = TOKdiv; |
| return; |
| |
| case '.': |
| p++; |
| if (isdigit(*p)) |
| { /* Note that we don't allow ._1 and ._ as being |
| * valid floating point numbers. |
| */ |
| p--; |
| t->value = inreal(t); |
| } |
| else if (p[0] == '.') |
| { |
| if (p[1] == '.') |
| { p += 2; |
| t->value = TOKdotdotdot; |
| } |
| else |
| { p++; |
| t->value = TOKslice; |
| } |
| } |
| else |
| t->value = TOKdot; |
| return; |
| |
| case '&': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKandass; |
| } |
| else if (*p == '&') |
| { p++; |
| t->value = TOKandand; |
| } |
| else |
| t->value = TOKand; |
| return; |
| |
| case '|': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKorass; |
| } |
| else if (*p == '|') |
| { p++; |
| t->value = TOKoror; |
| } |
| else |
| t->value = TOKor; |
| return; |
| |
| case '-': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKminass; |
| } |
| else if (*p == '-') |
| { p++; |
| t->value = TOKminusminus; |
| } |
| else |
| t->value = TOKmin; |
| return; |
| |
| case '+': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKaddass; |
| } |
| else if (*p == '+') |
| { p++; |
| t->value = TOKplusplus; |
| } |
| else |
| t->value = TOKadd; |
| return; |
| |
| case '<': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKle; // <= |
| } |
| else if (*p == '<') |
| { p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKshlass; // <<= |
| } |
| else |
| t->value = TOKshl; // << |
| } |
| else if (*p == '>') |
| { p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKleg; // <>= |
| } |
| else |
| t->value = TOKlg; // <> |
| } |
| else |
| t->value = TOKlt; // < |
| return; |
| |
| case '>': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKge; // >= |
| } |
| else if (*p == '>') |
| { p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKshrass; // >>= |
| } |
| else if (*p == '>') |
| { p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKushrass; // >>>= |
| } |
| else |
| t->value = TOKushr; // >>> |
| } |
| else |
| t->value = TOKshr; // >> |
| } |
| else |
| t->value = TOKgt; // > |
| return; |
| |
| case '!': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKnotequal; // != |
| } |
| else if (*p == '<') |
| { p++; |
| if (*p == '>') |
| { p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKunord; // !<>= |
| } |
| else |
| t->value = TOKue; // !<> |
| } |
| else if (*p == '=') |
| { p++; |
| t->value = TOKug; // !<= |
| } |
| else |
| t->value = TOKuge; // !< |
| } |
| else if (*p == '>') |
| { p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKul; // !>= |
| } |
| else |
| t->value = TOKule; // !> |
| } |
| else |
| t->value = TOKnot; // ! |
| return; |
| |
| case '=': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKequal; // == |
| } |
| else if (*p == '>') |
| { p++; |
| t->value = TOKgoesto; // => |
| } |
| else |
| t->value = TOKassign; // = |
| return; |
| |
| case '~': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKcatass; // ~= |
| } |
| else |
| t->value = TOKtilde; // ~ |
| return; |
| |
| case '^': |
| p++; |
| if (*p == '^') |
| { p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKpowass; // ^^= |
| } |
| else |
| t->value = TOKpow; // ^^ |
| } |
| else if (*p == '=') |
| { p++; |
| t->value = TOKxorass; // ^= |
| } |
| else |
| t->value = TOKxor; // ^ |
| return; |
| |
| case '(': p++; t->value = TOKlparen; return; |
| case ')': p++; t->value = TOKrparen; return; |
| case '[': p++; t->value = TOKlbracket; return; |
| case ']': p++; t->value = TOKrbracket; return; |
| case '{': p++; t->value = TOKlcurly; return; |
| case '}': p++; t->value = TOKrcurly; return; |
| case '?': p++; t->value = TOKquestion; return; |
| case ',': p++; t->value = TOKcomma; return; |
| case ';': p++; t->value = TOKsemicolon; return; |
| case ':': p++; t->value = TOKcolon; return; |
| case '$': p++; t->value = TOKdollar; return; |
| case '@': p++; t->value = TOKat; return; |
| |
| case '*': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKmulass; |
| } |
| else |
| t->value = TOKmul; |
| return; |
| case '%': |
| p++; |
| if (*p == '=') |
| { p++; |
| t->value = TOKmodass; |
| } |
| else |
| t->value = TOKmod; |
| return; |
| |
| case '#': |
| { |
| p++; |
| Token n; |
| scan(&n); |
| if (n.value == TOKidentifier) |
| { |
| if (n.ident == Id::line) |
| { |
| poundLine(); |
| continue; |
| } |
| else |
| { |
| const Loc locx = loc(); |
| warning(locx, "C preprocessor directive `#%s` is not supported", n.ident->toChars()); |
| } |
| } |
| else if (n.value == TOKif) |
| { |
| error("C preprocessor directive `#if` is not supported, use `version` or `static if`"); |
| } |
| t->value = TOKpound; |
| return; |
| } |
| |
| default: |
| { unsigned c = *p; |
| |
| if (c & 0x80) |
| { c = decodeUTF(); |
| |
| // Check for start of unicode identifier |
| if (isUniAlpha(c)) |
| goto case_ident; |
| |
| if (c == PS || c == LS) |
| { |
| endOfLine(); |
| p++; |
| continue; |
| } |
| } |
| if (c < 0x80 && isprint(c)) |
| error("character '%c' is not a valid token", c); |
| else |
| error("character 0x%02x is not a valid token", c); |
| p++; |
| continue; |
| } |
| } |
| } |
| } |
| |
| /******************************************* |
| * Parse escape sequence. |
| */ |
| |
| unsigned Lexer::escapeSequence() |
| { unsigned c = *p; |
| |
| int n; |
| int ndigits; |
| |
| switch (c) |
| { |
| case '\'': |
| case '"': |
| case '?': |
| case '\\': |
| Lconsume: |
| p++; |
| break; |
| |
| case 'a': c = 7; goto Lconsume; |
| case 'b': c = 8; goto Lconsume; |
| case 'f': c = 12; goto Lconsume; |
| case 'n': c = 10; goto Lconsume; |
| case 'r': c = 13; goto Lconsume; |
| case 't': c = 9; goto Lconsume; |
| case 'v': c = 11; goto Lconsume; |
| |
| case 'u': |
| ndigits = 4; |
| goto Lhex; |
| case 'U': |
| ndigits = 8; |
| goto Lhex; |
| case 'x': |
| ndigits = 2; |
| Lhex: |
| p++; |
| c = *p; |
| if (ishex((utf8_t)c)) |
| { unsigned v; |
| |
| n = 0; |
| v = 0; |
| while (1) |
| { |
| if (isdigit((utf8_t)c)) |
| c -= '0'; |
| else if (islower(c)) |
| c -= 'a' - 10; |
| else |
| c -= 'A' - 10; |
| v = v * 16 + c; |
| c = *++p; |
| if (++n == ndigits) |
| break; |
| if (!ishex((utf8_t)c)) |
| { error("escape hex sequence has %d hex digits instead of %d", n, ndigits); |
| break; |
| } |
| } |
| if (ndigits != 2 && !utf_isValidDchar(v)) |
| { error("invalid UTF character \\U%08x", v); |
| v = '?'; // recover with valid UTF character |
| } |
| c = v; |
| } |
| else |
| error("undefined escape hex sequence \\%c",c); |
| break; |
| |
| case '&': // named character entity |
| for (const utf8_t *idstart = ++p; 1; p++) |
| { |
| switch (*p) |
| { |
| case ';': |
| c = HtmlNamedEntity(idstart, p - idstart); |
| if (c == ~0U) |
| { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart); |
| c = ' '; |
| } |
| p++; |
| break; |
| |
| default: |
| if (isalpha(*p) || |
| (p != idstart && isdigit(*p))) |
| continue; |
| error("unterminated named entity &%.*s;", (int)(p - idstart + 1), idstart); |
| break; |
| } |
| break; |
| } |
| break; |
| |
| case 0: |
| case 0x1A: // end of file |
| c = '\\'; |
| break; |
| |
| default: |
| if (isoctal((utf8_t)c)) |
| { unsigned v; |
| |
| n = 0; |
| v = 0; |
| do |
| { |
| v = v * 8 + (c - '0'); |
| c = *++p; |
| } while (++n < 3 && isoctal((utf8_t)c)); |
| c = v; |
| if (c > 0xFF) |
| error("escape octal sequence \\%03o is larger than \\377", c); |
| } |
| else |
| error("undefined escape sequence \\%c",c); |
| break; |
| } |
| return c; |
| } |
| |
| /************************************** |
| */ |
| |
| TOK Lexer::wysiwygStringConstant(Token *t, int tc) |
| { |
| int c; |
| Loc start = loc(); |
| |
| p++; |
| stringbuffer.reset(); |
| while (1) |
| { |
| c = *p++; |
| switch (c) |
| { |
| case '\n': |
| endOfLine(); |
| break; |
| |
| case '\r': |
| if (*p == '\n') |
| continue; // ignore |
| c = '\n'; // treat EndOfLine as \n character |
| endOfLine(); |
| break; |
| |
| case 0: |
| case 0x1A: |
| error("unterminated string constant starting at %s", start.toChars()); |
| t->ustring = (utf8_t *)const_cast<char *>(""); |
| t->len = 0; |
| t->postfix = 0; |
| return TOKstring; |
| |
| case '"': |
| case '`': |
| if (c == tc) |
| { |
| t->len = (unsigned)stringbuffer.offset; |
| stringbuffer.writeByte(0); |
| t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); |
| memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); |
| stringPostfix(t); |
| return TOKstring; |
| } |
| break; |
| |
| default: |
| if (c & 0x80) |
| { p--; |
| unsigned u = decodeUTF(); |
| p++; |
| if (u == PS || u == LS) |
| endOfLine(); |
| stringbuffer.writeUTF8(u); |
| continue; |
| } |
| break; |
| } |
| stringbuffer.writeByte(c); |
| } |
| } |
| |
| /************************************** |
| * Lex hex strings: |
| * x"0A ae 34FE BD" |
| */ |
| |
| TOK Lexer::hexStringConstant(Token *t) |
| { |
| unsigned c; |
| Loc start = loc(); |
| unsigned n = 0; |
| unsigned v = ~0; // dead assignment, needed to suppress warning |
| |
| p++; |
| stringbuffer.reset(); |
| while (1) |
| { |
| c = *p++; |
| switch (c) |
| { |
| case ' ': |
| case '\t': |
| case '\v': |
| case '\f': |
| continue; // skip white space |
| |
| case '\r': |
| if (*p == '\n') |
| continue; // ignore |
| // Treat isolated '\r' as if it were a '\n' |
| /* fall through */ |
| case '\n': |
| endOfLine(); |
| continue; |
| |
| case 0: |
| case 0x1A: |
| error("unterminated string constant starting at %s", start.toChars()); |
| t->ustring = (utf8_t *)const_cast<char *>(""); |
| t->len = 0; |
| t->postfix = 0; |
| return TOKxstring; |
| |
| case '"': |
| if (n & 1) |
| { error("odd number (%d) of hex characters in hex string", n); |
| stringbuffer.writeByte(v); |
| } |
| t->len = (unsigned)stringbuffer.offset; |
| stringbuffer.writeByte(0); |
| t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); |
| memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); |
| stringPostfix(t); |
| return TOKxstring; |
| |
| default: |
| if (c >= '0' && c <= '9') |
| c -= '0'; |
| else if (c >= 'a' && c <= 'f') |
| c -= 'a' - 10; |
| else if (c >= 'A' && c <= 'F') |
| c -= 'A' - 10; |
| else if (c & 0x80) |
| { p--; |
| unsigned u = decodeUTF(); |
| p++; |
| if (u == PS || u == LS) |
| endOfLine(); |
| else |
| error("non-hex character \\u%04x in hex string", u); |
| } |
| else |
| error("non-hex character '%c' in hex string", c); |
| if (n & 1) |
| { v = (v << 4) | c; |
| stringbuffer.writeByte(v); |
| } |
| else |
| v = c; |
| n++; |
| break; |
| } |
| } |
| } |
| |
| |
| /************************************** |
| * Lex delimited strings: |
| * q"(foo(xxx))" // "foo(xxx)" |
| * q"[foo(]" // "foo(" |
| * q"/foo]/" // "foo]" |
| * q"HERE |
| * foo |
| * HERE" // "foo\n" |
| * Input: |
| * p is on the " |
| */ |
| |
| TOK Lexer::delimitedStringConstant(Token *t) |
| { |
| unsigned c; |
| Loc start = loc(); |
| unsigned delimleft = 0; |
| unsigned delimright = 0; |
| unsigned nest = 1; |
| unsigned nestcount = ~0; // dead assignment, needed to suppress warning |
| Identifier *hereid = NULL; |
| unsigned blankrol = 0; |
| unsigned startline = 0; |
| |
| p++; |
| stringbuffer.reset(); |
| while (1) |
| { |
| c = *p++; |
| //printf("c = '%c'\n", c); |
| switch (c) |
| { |
| case '\n': |
| Lnextline: |
| endOfLine(); |
| startline = 1; |
| if (blankrol) |
| { blankrol = 0; |
| continue; |
| } |
| if (hereid) |
| { |
| stringbuffer.writeUTF8(c); |
| continue; |
| } |
| break; |
| |
| case '\r': |
| if (*p == '\n') |
| continue; // ignore |
| c = '\n'; // treat EndOfLine as \n character |
| goto Lnextline; |
| |
| case 0: |
| case 0x1A: |
| error("unterminated delimited string constant starting at %s", start.toChars()); |
| t->ustring = (utf8_t *)const_cast<char *>(""); |
| t->len = 0; |
| t->postfix = 0; |
| return TOKstring; |
| |
| default: |
| if (c & 0x80) |
| { p--; |
| c = decodeUTF(); |
| p++; |
| if (c == PS || c == LS) |
| goto Lnextline; |
| } |
| break; |
| } |
| if (delimleft == 0) |
| { delimleft = c; |
| nest = 1; |
| nestcount = 1; |
| if (c == '(') |
| delimright = ')'; |
| else if (c == '{') |
| delimright = '}'; |
| else if (c == '[') |
| delimright = ']'; |
| else if (c == '<') |
| delimright = '>'; |
| else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) |
| { // Start of identifier; must be a heredoc |
| Token tok; |
| p--; |
| scan(&tok); // read in heredoc identifier |
| if (tok.value != TOKidentifier) |
| { error("identifier expected for heredoc, not %s", tok.toChars()); |
| delimright = c; |
| } |
| else |
| { hereid = tok.ident; |
| //printf("hereid = '%s'\n", hereid->toChars()); |
| blankrol = 1; |
| } |
| nest = 0; |
| } |
| else |
| { delimright = c; |
| nest = 0; |
| if (isspace(c)) |
| error("delimiter cannot be whitespace"); |
| } |
| } |
| else |
| { |
| if (blankrol) |
| { error("heredoc rest of line should be blank"); |
| blankrol = 0; |
| continue; |
| } |
| if (nest == 1) |
| { |
| if (c == delimleft) |
| nestcount++; |
| else if (c == delimright) |
| { nestcount--; |
| if (nestcount == 0) |
| goto Ldone; |
| } |
| } |
| else if (c == delimright) |
| goto Ldone; |
| if (startline && isalpha(c) && hereid) |
| { Token tok; |
| const utf8_t *psave = p; |
| p--; |
| scan(&tok); // read in possible heredoc identifier |
| //printf("endid = '%s'\n", tok.ident->toChars()); |
| if (tok.value == TOKidentifier && tok.ident->equals(hereid)) |
| { /* should check that rest of line is blank |
| */ |
| goto Ldone; |
| } |
| p = psave; |
| } |
| stringbuffer.writeUTF8(c); |
| startline = 0; |
| } |
| } |
| |
| Ldone: |
| if (*p == '"') |
| p++; |
| else if (hereid) |
| error("delimited string must end in %s\"", hereid->toChars()); |
| else |
| error("delimited string must end in %c\"", delimright); |
| t->len = (unsigned)stringbuffer.offset; |
| stringbuffer.writeByte(0); |
| t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); |
| memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); |
| stringPostfix(t); |
| return TOKstring; |
| } |
| |
| /************************************** |
| * Lex delimited strings: |
| * q{ foo(xxx) } // " foo(xxx) " |
| * q{foo(} // "foo(" |
| * q{{foo}"}"} // "{foo}"}"" |
| * Input: |
| * p is on the q |
| */ |
| |
| TOK Lexer::tokenStringConstant(Token *t) |
| { |
| unsigned nest = 1; |
| Loc start = loc(); |
| const utf8_t *pstart = ++p; |
| |
| while (1) |
| { Token tok; |
| |
| scan(&tok); |
| switch (tok.value) |
| { |
| case TOKlcurly: |
| nest++; |
| continue; |
| |
| case TOKrcurly: |
| if (--nest == 0) |
| { |
| t->len = (unsigned)(p - 1 - pstart); |
| t->ustring = (utf8_t *)mem.xmalloc(t->len + 1); |
| memcpy(t->ustring, pstart, t->len); |
| t->ustring[t->len] = 0; |
| stringPostfix(t); |
| return TOKstring; |
| } |
| continue; |
| |
| case TOKeof: |
| error("unterminated token string constant starting at %s", start.toChars()); |
| t->ustring = (utf8_t *)const_cast<char *>(""); |
| t->len = 0; |
| t->postfix = 0; |
| return TOKstring; |
| |
| default: |
| continue; |
| } |
| } |
| } |
| |
| |
| |
| /************************************** |
| */ |
| |
| TOK Lexer::escapeStringConstant(Token *t) |
| { |
| unsigned c; |
| Loc start = loc(); |
| |
| p++; |
| stringbuffer.reset(); |
| while (1) |
| { |
| c = *p++; |
| switch (c) |
| { |
| case '\\': |
| switch (*p) |
| { |
| case 'u': |
| case 'U': |
| case '&': |
| c = escapeSequence(); |
| stringbuffer.writeUTF8(c); |
| continue; |
| |
| default: |
| c = escapeSequence(); |
| break; |
| } |
| break; |
| case '\n': |
| endOfLine(); |
| break; |
| |
| case '\r': |
| if (*p == '\n') |
| continue; // ignore |
| c = '\n'; // treat EndOfLine as \n character |
| endOfLine(); |
| break; |
| |
| case '"': |
| t->len = (unsigned)stringbuffer.offset; |
| stringbuffer.writeByte(0); |
| t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); |
| memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); |
| stringPostfix(t); |
| return TOKstring; |
| |
| case 0: |
| case 0x1A: |
| p--; |
| error("unterminated string constant starting at %s", start.toChars()); |
| t->ustring = (utf8_t *)const_cast<char *>(""); |
| t->len = 0; |
| t->postfix = 0; |
| return TOKstring; |
| |
| default: |
| if (c & 0x80) |
| { |
| p--; |
| c = decodeUTF(); |
| if (c == LS || c == PS) |
| { c = '\n'; |
| endOfLine(); |
| } |
| p++; |
| stringbuffer.writeUTF8(c); |
| continue; |
| } |
| break; |
| } |
| stringbuffer.writeByte(c); |
| } |
| } |
| |
| /************************************** |
| */ |
| |
| TOK Lexer::charConstant(Token *t) |
| { |
| unsigned c; |
| TOK tk = TOKcharv; |
| |
| //printf("Lexer::charConstant\n"); |
| p++; |
| c = *p++; |
| switch (c) |
| { |
| case '\\': |
| switch (*p) |
| { |
| case 'u': |
| t->uns64value = escapeSequence(); |
| tk = TOKwcharv; |
| break; |
| |
| case 'U': |
| case '&': |
| t->uns64value = escapeSequence(); |
| tk = TOKdcharv; |
| break; |
| |
| default: |
| t->uns64value = escapeSequence(); |
| break; |
| } |
| break; |
| case '\n': |
| L1: |
| endOfLine(); |
| /* fall through */ |
| case '\r': |
| case 0: |
| case 0x1A: |
| case '\'': |
| error("unterminated character constant"); |
| t->uns64value = '?'; |
| return tk; |
| |
| default: |
| if (c & 0x80) |
| { |
| p--; |
| c = decodeUTF(); |
| p++; |
| if (c == LS || c == PS) |
| goto L1; |
| if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) |
| tk = TOKwcharv; |
| else |
| tk = TOKdcharv; |
| } |
| t->uns64value = c; |
| break; |
| } |
| |
| if (*p != '\'') |
| { |
| error("unterminated character constant"); |
| t->uns64value = '?'; |
| return tk; |
| } |
| p++; |
| return tk; |
| } |
| |
| /*************************************** |
| * Get postfix of string literal. |
| */ |
| |
| void Lexer::stringPostfix(Token *t) |
| { |
| switch (*p) |
| { |
| case 'c': |
| case 'w': |
| case 'd': |
| t->postfix = *p; |
| p++; |
| break; |
| |
| default: |
| t->postfix = 0; |
| break; |
| } |
| } |
| |
| /************************************** |
| * Read in a number. |
| * If it's an integer, store it in tok.TKutok.Vlong. |
| * integers can be decimal, octal or hex |
| * Handle the suffixes U, UL, LU, L, etc. |
| * If it's double, store it in tok.TKutok.Vdouble. |
| * Returns: |
| * TKnum |
| * TKdouble,... |
| */ |
| |
| TOK Lexer::number(Token *t) |
| { |
| int base = 10; |
| const utf8_t *start = p; |
| unsigned c; |
| uinteger_t n = 0; // unsigned >=64 bit integer type |
| int d; |
| bool err = false; |
| bool overflow = false; |
| |
| c = *p; |
| if (c == '0') |
| { |
| ++p; |
| c = *p; |
| switch (c) |
| { |
| case '0': case '1': case '2': case '3': |
| case '4': case '5': case '6': case '7': |
| n = c - '0'; |
| ++p; |
| base = 8; |
| break; |
| |
| case 'x': |
| case 'X': |
| ++p; |
| base = 16; |
| break; |
| |
| case 'b': |
| case 'B': |
| ++p; |
| base = 2; |
| break; |
| |
| case '.': |
| if (p[1] == '.') |
| goto Ldone; // if ".." |
| if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80) |
| goto Ldone; // if ".identifier" or ".unicode" |
| goto Lreal; // '.' is part of current token |
| |
| case 'i': |
| case 'f': |
| case 'F': |
| goto Lreal; |
| |
| case '_': |
| ++p; |
| base = 8; |
| break; |
| |
| case 'L': |
| if (p[1] == 'i') |
| goto Lreal; |
| break; |
| |
| default: |
| break; |
| } |
| } |
| |
| while (1) |
| { |
| c = *p; |
| switch (c) |
| { |
| case '0': case '1': |
| ++p; |
| d = c - '0'; |
| break; |
| |
| case '2': case '3': |
| case '4': case '5': case '6': case '7': |
| if (base == 2 && !err) |
| { |
| error("binary digit expected"); |
| err = true; |
| } |
| ++p; |
| d = c - '0'; |
| break; |
| |
| case '8': case '9': |
| ++p; |
| if (base < 10 && !err) |
| { |
| error("radix %d digit expected, not '%c'", base, c); |
| err = true; |
| } |
| d = c - '0'; |
| break; |
| |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| ++p; |
| if (base != 16) |
| { |
| if (c == 'e' || c == 'E' || c == 'f' || c == 'F') |
| goto Lreal; |
| if (!err) |
| { |
| error("radix %d digit expected, not '%c'", base, c); |
| err = true; |
| } |
| } |
| if (c >= 'a') |
| d = c + 10 - 'a'; |
| else |
| d = c + 10 - 'A'; |
| break; |
| |
| case 'L': |
| if (p[1] == 'i') |
| goto Lreal; |
| goto Ldone; |
| |
| case '.': |
| if (p[1] == '.') |
| goto Ldone; // if ".." |
| if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)) |
| goto Ldone; // if ".identifier" or ".unicode" |
| goto Lreal; // otherwise as part of a floating point literal |
| |
| case 'p': |
| case 'P': |
| case 'i': |
| Lreal: |
| p = start; |
| return inreal(t); |
| |
| case '_': |
| ++p; |
| continue; |
| |
| default: |
| goto Ldone; |
| } |
| |
| uinteger_t n2 = n * base; |
| if ((n2 / base != n || n2 + d < n)) |
| { |
| overflow = true; |
| } |
| n = n2 + d; |
| |
| // if n needs more than 64 bits |
| if (sizeof(n) > 8 && |
| n > 0xFFFFFFFFFFFFFFFFULL) |
| { |
| overflow = true; |
| } |
| } |
| |
| Ldone: |
| |
| if (overflow && !err) |
| { |
| error("integer overflow"); |
| err = true; |
| } |
| |
| enum FLAGS |
| { |
| FLAGS_none = 0, |
| FLAGS_decimal = 1, // decimal |
| FLAGS_unsigned = 2, // u or U suffix |
| FLAGS_long = 4, // L suffix |
| }; |
| |
| unsigned flags = (base == 10) ? FLAGS_decimal : FLAGS_none; |
| |
| // Parse trailing 'u', 'U', 'l' or 'L' in any combination |
| const utf8_t *psuffix = p; |
| while (1) |
| { |
| utf8_t f; |
| switch (*p) |
| { |
| case 'U': |
| case 'u': |
| f = FLAGS_unsigned; |
| goto L1; |
| |
| case 'l': |
| f = FLAGS_long; |
| error("lower case integer suffix 'l' is not allowed. Please use 'L' instead"); |
| goto L1; |
| |
| case 'L': |
| f = FLAGS_long; |
| L1: |
| p++; |
| if ((flags & f) && !err) |
| { |
| error("unrecognized token"); |
| err = true; |
| } |
| flags = (FLAGS) (flags | f); |
| continue; |
| default: |
| break; |
| } |
| break; |
| } |
| |
| if (base == 8 && n >= 8) |
| error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", |
| n, p - psuffix, psuffix, n, p - psuffix, psuffix); |
| |
| TOK result; |
| switch (flags) |
| { |
| case FLAGS_none: |
| /* Octal or Hexadecimal constant. |
| * First that fits: int, uint, long, ulong |
| */ |
| if (n & 0x8000000000000000LL) |
| result = TOKuns64v; |
| else if (n & 0xFFFFFFFF00000000LL) |
| result = TOKint64v; |
| else if (n & 0x80000000) |
| result = TOKuns32v; |
| else |
| result = TOKint32v; |
| break; |
| |
| case FLAGS_decimal: |
| /* First that fits: int, long, long long |
| */ |
| if (n & 0x8000000000000000LL) |
| { |
| if (!err) |
| { |
| error("signed integer overflow"); |
| err = true; |
| } |
| result = TOKuns64v; |
| } |
| else if (n & 0xFFFFFFFF80000000LL) |
| result = TOKint64v; |
| else |
| result = TOKint32v; |
| break; |
| |
| case FLAGS_unsigned: |
| case FLAGS_decimal | FLAGS_unsigned: |
| /* First that fits: uint, ulong |
| */ |
| if (n & 0xFFFFFFFF00000000LL) |
| result = TOKuns64v; |
| else |
| result = TOKuns32v; |
| break; |
| |
| case FLAGS_decimal | FLAGS_long: |
| if (n & 0x8000000000000000LL) |
| { |
| if (!err) |
| { |
| error("signed integer overflow"); |
| err = true; |
| } |
| result = TOKuns64v; |
| } |
| else |
| result = TOKint64v; |
| break; |
| |
| case FLAGS_long: |
| if (n & 0x8000000000000000LL) |
| result = TOKuns64v; |
| else |
| result = TOKint64v; |
| break; |
| |
| case FLAGS_unsigned | FLAGS_long: |
| case FLAGS_decimal | FLAGS_unsigned | FLAGS_long: |
| result = TOKuns64v; |
| break; |
| |
| default: |
| assert(0); |
| } |
| t->uns64value = n; |
| return result; |
| } |
| |
| /************************************** |
| * Read in characters, converting them to real. |
| * Bugs: |
| * Exponent overflow not detected. |
| * Too much requested precision is not detected. |
| */ |
| |
| TOK Lexer::inreal(Token *t) |
| { |
| //printf("Lexer::inreal()\n"); |
| bool isWellformedString = true; |
| stringbuffer.reset(); |
| const utf8_t *pstart = p; |
| char hex = 0; |
| unsigned c = *p++; |
| |
| // Leading '0x' |
| if (c == '0') |
| { |
| c = *p++; |
| if (c == 'x' || c == 'X') |
| { |
| hex = true; |
| c = *p++; |
| } |
| } |
| |
| // Digits to left of '.' |
| while (1) |
| { |
| if (c == '.') |
| { |
| c = *p++; |
| break; |
| } |
| if (isdigit(c) || (hex && isxdigit(c)) || c == '_') |
| { |
| c = *p++; |
| continue; |
| } |
| break; |
| } |
| |
| // Digits to right of '.' |
| while (1) |
| { |
| if (isdigit(c) || (hex && isxdigit(c)) || c == '_') |
| { |
| c = *p++; |
| continue; |
| } |
| break; |
| } |
| |
| if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) |
| { |
| c = *p++; |
| if (c == '-' || c == '+') |
| { |
| c = *p++; |
| } |
| bool anyexp = false; |
| while (1) |
| { |
| if (isdigit(c)) |
| { |
| anyexp = true; |
| c = *p++; |
| continue; |
| } |
| if (c == '_') |
| { |
| c = *p++; |
| continue; |
| } |
| if (!anyexp) |
| { |
| error("missing exponent"); |
| isWellformedString = false; |
| } |
| break; |
| } |
| } |
| else if (hex) |
| { |
| error("exponent required for hex float"); |
| isWellformedString = false; |
| } |
| --p; |
| while (pstart < p) |
| { |
| if (*pstart != '_') |
| stringbuffer.writeByte(*pstart); |
| ++pstart; |
| } |
| |
| stringbuffer.writeByte(0); |
| const char *sbufptr = (char *)stringbuffer.data; |
| TOK result; |
| bool isOutOfRange = false; |
| t->floatvalue = (isWellformedString ? CTFloat::parse(sbufptr, &isOutOfRange) : CTFloat::zero); |
| errno = 0; |
| switch (*p) |
| { |
| case 'F': |
| case 'f': |
| if (isWellformedString && !isOutOfRange) |
| isOutOfRange = Port::isFloat32LiteralOutOfRange(sbufptr); |
| result = TOKfloat32v; |
| p++; |
| break; |
| |
| default: |
| if (isWellformedString && !isOutOfRange) |
| isOutOfRange = Port::isFloat64LiteralOutOfRange(sbufptr); |
| result = TOKfloat64v; |
| break; |
| |
| case 'l': |
| error("use 'L' suffix instead of 'l'"); |
| /* fall through */ |
| case 'L': |
| result = TOKfloat80v; |
| p++; |
| break; |
| } |
| if (*p == 'i' || *p == 'I') |
| { |
| if (*p == 'I') |
| error("use 'i' suffix instead of 'I'"); |
| p++; |
| switch (result) |
| { |
| case TOKfloat32v: |
| result = TOKimaginary32v; |
| break; |
| case TOKfloat64v: |
| result = TOKimaginary64v; |
| break; |
| case TOKfloat80v: |
| result = TOKimaginary80v; |
| break; |
| default: break; |
| } |
| } |
| const bool isLong = (result == TOKfloat80v || result == TOKimaginary80v); |
| if (isOutOfRange && !isLong) |
| { |
| const char *suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : ""; |
| error(scanloc, "number '%s%s' is not representable", (char *)stringbuffer.data, suffix); |
| } |
| return result; |
| } |
| |
| /********************************************* |
| * parse: |
| * #line linnum [filespec] |
| * also allow __LINE__ for linnum, and __FILE__ for filespec |
| */ |
| |
| void Lexer::poundLine() |
| { |
| Token tok; |
| int linnum = this->scanloc.linnum; |
| char *filespec = NULL; |
| Loc loc = this->loc(); |
| |
| scan(&tok); |
| if (tok.value == TOKint32v || tok.value == TOKint64v) |
| { |
| int lin = (int)(tok.uns64value - 1); |
| if ((unsigned)lin != tok.uns64value - 1) |
| error("line number %lld out of range", (unsigned long long)tok.uns64value); |
| else |
| linnum = lin; |
| } |
| else if (tok.value == TOKline) |
| { |
| } |
| else |
| goto Lerr; |
| |
| while (1) |
| { |
| switch (*p) |
| { |
| case 0: |
| case 0x1A: |
| case '\n': |
| Lnewline: |
| this->scanloc.linnum = linnum; |
| if (filespec) |
| this->scanloc.filename = filespec; |
| return; |
| |
| case '\r': |
| p++; |
| if (*p != '\n') |
| { p--; |
| goto Lnewline; |
| } |
| continue; |
| |
| case ' ': |
| case '\t': |
| case '\v': |
| case '\f': |
| p++; |
| continue; // skip white space |
| |
| case '_': |
| if (memcmp(p, "__FILE__", 8) == 0) |
| { |
| p += 8; |
| filespec = mem.xstrdup(scanloc.filename); |
| continue; |
| } |
| goto Lerr; |
| |
| case '"': |
| if (filespec) |
| goto Lerr; |
| stringbuffer.reset(); |
| p++; |
| while (1) |
| { unsigned c; |
| |
| c = *p; |
| switch (c) |
| { |
| case '\n': |
| case '\r': |
| case 0: |
| case 0x1A: |
| goto Lerr; |
| |
| case '"': |
| stringbuffer.writeByte(0); |
| filespec = mem.xstrdup((char *)stringbuffer.data); |
| p++; |
| break; |
| |
| default: |
| if (c & 0x80) |
| { unsigned u = decodeUTF(); |
| if (u == PS || u == LS) |
| goto Lerr; |
| } |
| stringbuffer.writeByte(c); |
| p++; |
| continue; |
| } |
| break; |
| } |
| continue; |
| |
| default: |
| if (*p & 0x80) |
| { unsigned u = decodeUTF(); |
| if (u == PS || u == LS) |
| goto Lnewline; |
| } |
| goto Lerr; |
| } |
| } |
| |
| Lerr: |
| error(loc, "#line integer [\"filespec\"]\\n expected"); |
| } |
| |
| |
| /******************************************** |
| * Decode UTF character. |
| * Issue error messages for invalid sequences. |
| * Return decoded character, advance p to last character in UTF sequence. |
| */ |
| |
| unsigned Lexer::decodeUTF() |
| { |
| dchar_t u; |
| utf8_t c; |
| const utf8_t *s = p; |
| size_t len; |
| size_t idx; |
| const char *msg; |
| |
| c = *s; |
| assert(c & 0x80); |
| |
| // Check length of remaining string up to 6 UTF-8 characters |
| for (len = 1; len < 6 && s[len]; len++) |
| ; |
| |
| idx = 0; |
| msg = utf_decodeChar(s, len, &idx, &u); |
| p += idx - 1; |
| if (msg) |
| { |
| error("%s", msg); |
| } |
| return u; |
| } |
| |
| |
| /*************************************************** |
| * Parse doc comment embedded between t->ptr and p. |
| * Remove trailing blanks and tabs from lines. |
| * Replace all newlines with \n. |
| * Remove leading comment character from each line. |
| * Decide if it's a lineComment or a blockComment. |
| * Append to previous one for this token. |
| */ |
| |
| void Lexer::getDocComment(Token *t, unsigned lineComment) |
| { |
| /* ct tells us which kind of comment it is: '/', '*', or '+' |
| */ |
| utf8_t ct = t->ptr[2]; |
| |
| /* Start of comment text skips over / * *, / + +, or / / / |
| */ |
| const utf8_t *q = t->ptr + 3; // start of comment text |
| |
| const utf8_t *qend = p; |
| if (ct == '*' || ct == '+') |
| qend -= 2; |
| |
| /* Scan over initial row of ****'s or ++++'s or ////'s |
| */ |
| for (; q < qend; q++) |
| { |
| if (*q != ct) |
| break; |
| } |
| |
| /* Remove leading spaces until start of the comment |
| */ |
| int linestart = 0; |
| if (ct == '/') |
| { |
| while (q < qend && (*q == ' ' || *q == '\t')) |
| ++q; |
| } |
| else if (q < qend) |
| { |
| if (*q == '\r') |
| { |
| ++q; |
| if (q < qend && *q == '\n') |
| ++q; |
| linestart = 1; |
| } |
| else if (*q == '\n') |
| { |
| ++q; |
| linestart = 1; |
| } |
| } |
| |
| /* Remove trailing row of ****'s or ++++'s |
| */ |
| if (ct != '/') |
| { |
| for (; q < qend; qend--) |
| { |
| if (qend[-1] != ct) |
| break; |
| } |
| } |
| |
| /* Comment is now [q .. qend]. |
| * Canonicalize it into buf[]. |
| */ |
| OutBuffer buf; |
| |
| for (; q < qend; q++) |
| { |
| utf8_t c = *q; |
| |
| switch (c) |
| { |
| case '*': |
| case '+': |
| if (linestart && c == ct) |
| { linestart = 0; |
| /* Trim preceding whitespace up to preceding \n |
| */ |
| while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) |
| buf.offset--; |
| continue; |
| } |
| break; |
| |
| case ' ': |
| case '\t': |
| break; |
| |
| case '\r': |
| if (q[1] == '\n') |
| continue; // skip the \r |
| goto Lnewline; |
| |
| default: |
| if (c == 226) |
| { |
| // If LS or PS |
| if (q[1] == 128 && |
| (q[2] == 168 || q[2] == 169)) |
| { |
| q += 2; |
| goto Lnewline; |
| } |
| } |
| linestart = 0; |
| break; |
| |
| Lnewline: |
| c = '\n'; // replace all newlines with \n |
| /* fall through */ |
| case '\n': |
| linestart = 1; |
| |
| /* Trim trailing whitespace |
| */ |
| while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) |
| buf.offset--; |
| |
| break; |
| } |
| buf.writeByte(c); |
| } |
| |
| /* Trim trailing whitespace (if the last line does not have newline) |
| */ |
| if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) |
| { |
| while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) |
| buf.offset--; |
| } |
| |
| // Always end with a newline |
| if (!buf.offset || buf.data[buf.offset - 1] != '\n') |
| buf.writeByte('\n'); |
| |
| buf.writeByte(0); |
| |
| // It's a line comment if the start of the doc comment comes |
| // after other non-whitespace on the same line. |
| const utf8_t** dc = (lineComment && anyToken) |
| ? &t->lineComment |
| : &t->blockComment; |
| |
| // Combine with previous doc comment, if any |
| if (*dc) |
| *dc = combineComments(*dc, (utf8_t *)buf.data); |
| else |
| *dc = (utf8_t *)buf.extractData(); |
| } |
| |
| /******************************************** |
| * Combine two document comments into one, |
| * separated by a newline. |
| */ |
| |
| const utf8_t *Lexer::combineComments(const utf8_t *c1, const utf8_t *c2) |
| { |
| //printf("Lexer::combineComments('%s', '%s')\n", c1, c2); |
| |
| const utf8_t *c = c2; |
| |
| if (c1) |
| { |
| c = c1; |
| if (c2) |
| { |
| size_t len1 = strlen((const char *)c1); |
| size_t len2 = strlen((const char *)c2); |
| |
| int insertNewLine = 0; |
| if (len1 && c1[len1 - 1] != '\n') |
| { |
| ++len1; |
| insertNewLine = 1; |
| } |
| |
| utf8_t *p = (utf8_t *)mem.xmalloc(len1 + 1 + len2 + 1); |
| memcpy(p, c1, len1 - insertNewLine); |
| if (insertNewLine) |
| p[len1 - 1] = '\n'; |
| |
| p[len1] = '\n'; |
| |
| memcpy(p + len1 + 1, c2, len2); |
| p[len1 + 1 + len2] = 0; |
| c = p; |
| } |
| } |
| return c; |
| } |