blob: 5945da32259dfdc3715eaa8da0411eb18c97edb7 [file] [log] [blame]
/**
* Implements the lexical analyzer, which converts source code into lexical tokens.
*
* Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
*
* Copyright: Copyright (C) 1999-2022 by The D Language Foundation, All Rights Reserved
* Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
* License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
* Documentation: https://dlang.org/phobos/dmd_lexer.html
* Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
*/
module dmd.lexer;
import core.stdc.ctype;
import core.stdc.errno;
import core.stdc.stdarg;
import core.stdc.stdio;
import core.stdc.stdlib : getenv;
import core.stdc.string;
import core.stdc.time;
import dmd.entity;
import dmd.errors;
import dmd.globals;
import dmd.id;
import dmd.identifier;
import dmd.root.array;
import dmd.root.ctfloat;
import dmd.common.outbuffer;
import dmd.root.port;
import dmd.root.rmem;
import dmd.root.string;
import dmd.root.utf;
import dmd.tokens;
import dmd.utils;
nothrow:
version (DMDLIB)
{
version = LocOffset;
}
/***********************************************************
*/
class Lexer
{
private __gshared OutBuffer stringbuffer;
Loc scanloc; // for error messages
Loc prevloc; // location of token before current
const(char)* p; // current character
Token token;
// For ImportC
bool Ccompile; /// true if compiling ImportC
// The following are valid only if (Ccompile == true)
ubyte boolsize; /// size of a C _Bool, default 1
ubyte shortsize; /// size of a C short, default 2
ubyte intsize; /// size of a C int, default 4
ubyte longsize; /// size of C long, 4 or 8
ubyte long_longsize; /// size of a C long long, default 8
ubyte long_doublesize; /// size of C long double, 8 or D real.sizeof
ubyte wchar_tsize; /// size of C wchar_t, 2 or 4
private
{
const(char)* base; // pointer to start of buffer
const(char)* end; // pointer to last element of buffer
const(char)* line; // start of current line
bool doDocComment; // collect doc comment information
bool anyToken; // seen at least one token
bool commentToken; // comments are TOK.comment's
bool tokenizeNewlines; // newlines are turned into TOK.endOfLine's
version (DMDLIB)
{
bool whitespaceToken; // tokenize whitespaces
}
int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
int lastDocLine; // last line of previous doc comment
Token* tokenFreelist;
}
nothrow:
/*********************
 * Creates a Lexer for the source code base[begoffset..endoffset+1].
 * The last character, base[endoffset], must be null (0) or EOF (0x1A).
 *
 * Params:
 *  filename = used for error messages
 *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
 *  begoffset = starting offset into base[]
 *  endoffset = the last offset to read into base[]
 *  doDocComment = handle documentation comments
 *  commentToken = comments become TOK.comment's
 */
this(const(char)* filename, const(char)* base, size_t begoffset,
    size_t endoffset, bool doDocComment, bool commentToken) pure
{
    scanloc = Loc(filename, 1, 1);
    token = Token.init;
    this.base = base;
    this.end = base + endoffset;
    this.p = base + begoffset;
    this.line = p;
    this.doDocComment = doDocComment;
    this.commentToken = commentToken;
    this.tokenizeNewlines = false;
    this.inTokenStringConstant = 0;
    this.lastDocLine = 0;
    /* A leading '#!' (shebang) line is not part of the token stream;
     * consume everything up to and including the newline.
     */
    if (p && p[0] == '#' && p[1] == '!')
    {
        p += 2;
        for (char c = *p++; c != '\n'; c = *p++)
        {
            if (c == 0 || c == 0x1A)
            {
                // leave p on the terminator so the next scan yields TOK.endOfFile
                p--;
                break;
            }
        }
        endOfLine();
    }
}
version (DMDLIB)
{
    /// Library constructor: same as the main constructor, but can additionally
    /// report runs of whitespace as TOK.whitespace tokens.
    this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset,
        bool doDocComment, bool commentToken, bool whitespaceToken)
    {
        this(filename, base, begoffset, endoffset, doDocComment, commentToken);
        this.whitespaceToken = whitespaceToken;
    }
    /// InputRange interface: true when the current token is TOK.endOfFile.
    bool empty() const pure @property @nogc @safe
    {
        return front() == TOK.endOfFile;
    }
    /// InputRange interface: the value of the current token (does not advance).
    TOK front() const pure @property @nogc @safe
    {
        return token.value;
    }
    /// InputRange interface: advance to the next token.
    void popFront()
    {
        nextToken();
    }
}
/// Returns: a newly allocated `Token`, reusing one from the freelist when available.
Token* allocateToken() pure nothrow @safe
{
    if (tokenFreelist is null)
        return new Token();
    // Pop the head of the freelist and detach it.
    auto head = tokenFreelist;
    tokenFreelist = head.next;
    head.next = null;
    return head;
}
/// Frees the given token by returning it to the freelist.
private void releaseToken(Token* token) pure nothrow @nogc @safe
{
    // Clear the token when the GC is enabled so references held inside it
    // don't keep garbage alive while the token sits on the freelist.
    if (mem.isGCEnabled)
        *token = Token.init;
    token.next = tokenFreelist;
    tokenFreelist = token;
}
/********************
 * Advance to the next token, either consuming a token already scanned
 * ahead by peek() or scanning a fresh one.
 * Returns: the value of the new current token
 */
final TOK nextToken()
{
    prevloc = token.loc;
    if (token.next)
    {
        // A lookahead token already exists; copy it into place and
        // recycle its storage on the freelist.
        Token* t = token.next;
        memcpy(&token, t, Token.sizeof);
        releaseToken(t);
    }
    else
    {
        scan(&token);
    }
    //printf(token.toChars());
    return token.value;
}
/***********************
 * Look ahead at next token's value.
 */
final TOK peekNext()
{
    auto ahead = peek(&token);
    return ahead.value;
}
/***********************
 * Look 2 tokens ahead at value.
 */
final TOK peekNext2()
{
    return peek(peek(&token)).value;
}
/****************************
 * Turn next token in buffer into a token.
 * This is the core tokenizer: it skips whitespace and comments (unless
 * they are requested as tokens), then dispatches on the first character
 * of the next lexeme.
 * Params:
 *      t = the token to set the resulting Token to
 */
final void scan(Token* t)
{
    const lastLine = scanloc.linnum;
    Loc startLoc;
    t.blockComment = null;
    t.lineComment = null;
    while (1)
    {
        t.ptr = p;
        //printf("p = %p, *p = '%c'\n",p,*p);
        t.loc = loc();
        switch (*p)
        {
        case 0:
        case 0x1A:
            t.value = TOK.endOfFile; // end of file
            // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
            return;
        case ' ':
            // Skip 4 spaces at a time after aligning 'p' to a 4-byte boundary.
            while ((cast(size_t)p) % uint.sizeof)
            {
                if (*p != ' ')
                    goto LendSkipFourSpaces;
                p++;
            }
            while (*(cast(uint*)p) == 0x20202020) // ' ' == 0x20
                p += 4;
            // Skip over any remaining space on the line.
            while (*p == ' ')
                p++;
        LendSkipFourSpaces:
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\t':
        case '\v':
        case '\f':
            p++;
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\r':
            p++;
            if (*p != '\n') // if CR stands by itself
            {
                endOfLine();
                if (tokenizeNewlines)
                {
                    t.value = TOK.endOfLine;
                    tokenizeNewlines = false;
                    return;
                }
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '\n':
            p++;
            endOfLine();
            if (tokenizeNewlines)
            {
                t.value = TOK.endOfLine;
                tokenizeNewlines = false;
                return;
            }
            version (DMDLIB)
            {
                if (whitespaceToken)
                {
                    t.value = TOK.whitespace;
                    return;
                }
            }
            continue; // skip white space
        case '0':
            if (!isZeroSecond(p[1])) // if numeric literal does not continue
            {
                ++p;
                t.unsvalue = 0;
                t.value = TOK.int32Literal;
                return;
            }
            goto Lnumber;
        case '1': .. case '9':
            if (!isDigitSecond(p[1])) // if numeric literal does not continue
            {
                t.unsvalue = *p - '0';
                ++p;
                t.value = TOK.int32Literal;
                return;
            }
        Lnumber:
            t.value = number(t);
            return;
        case '\'':
            if (issinglechar(p[1]) && p[2] == '\'')
            {
                t.unsvalue = p[1]; // simple one character literal
                t.value = TOK.charLiteral;
                p += 3;
            }
            else if (Ccompile)
            {
                clexerCharConstant(*t, 0);
            }
            else
            {
                t.value = charConstant(t);
            }
            return;
        case 'u':
        case 'U':
        case 'L':
            // In ImportC mode these may prefix wide character/string literals;
            // in D they are just identifier starts.
            if (!Ccompile)
                goto case_ident;
            if (p[1] == '\'') // C wide character constant
            {
                char c = *p;
                if (c == 'L') // convert L to u or U
                    c = (wchar_tsize == 4) ? 'u' : 'U';
                ++p;
                clexerCharConstant(*t, c);
                return;
            }
            else if (p[1] == '\"') // C wide string literal
            {
                const c = *p;
                ++p;
                escapeStringConstant(t);
                t.postfix = c == 'L' ? (wchar_tsize == 2 ? 'w' : 'd') :
                            c == 'u' ? 'w' :
                            'd';
                return;
            }
            else if (p[1] == '8' && p[2] == '\"') // C UTF-8 string literal
            {
                p += 2;
                escapeStringConstant(t);
                return;
            }
            goto case_ident;
        case 'r':
            if (Ccompile || p[1] != '"')
                goto case_ident;
            p++;
            goto case '`';
        case '`':
            if (Ccompile)
                goto default;
            wysiwygStringConstant(t);
            return;
        case 'q':
            if (Ccompile)
                goto case_ident;
            if (p[1] == '"')
            {
                p++;
                delimitedStringConstant(t);
                return;
            }
            else if (p[1] == '{')
            {
                p++;
                tokenStringConstant(t);
                return;
            }
            else
                goto case_ident;
        case '"':
            escapeStringConstant(t);
            return;
        case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'j':
        case 'k':
        case 'l':
        case 'm':
        case 'n':
        case 'o':
        case 'p':
            /*case 'q': case 'r':*/
        case 's':
        case 't':
            //case 'u':
        case 'v':
        case 'w':
        case 'x':
        case 'y':
        case 'z':
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
        case 'G':
        case 'H':
        case 'I':
        case 'J':
        case 'K':
            //case 'L':
        case 'M':
        case 'N':
        case 'O':
        case 'P':
        case 'Q':
        case 'R':
        case 'S':
        case 'T':
            //case 'U':
        case 'V':
        case 'W':
        case 'X':
        case 'Y':
        case 'Z':
        case '_':
        case_ident:
            {
                // Consume the rest of the identifier, allowing Unicode
                // alphabetic characters past 0x7F.
                while (1)
                {
                    const c = *++p;
                    if (isidchar(c))
                        continue;
                    else if (c & 0x80)
                    {
                        const s = p;
                        const u = decodeUTF();
                        if (isUniAlpha(u))
                            continue;
                        error("char 0x%04x not allowed in identifier", u);
                        p = s;
                    }
                    break;
                }
                Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
                t.ident = id;
                t.value = cast(TOK)id.getValue();
                anyToken = 1;
                /* Different keywords for C and D
                 */
                if (Ccompile)
                {
                    if (t.value != TOK.identifier)
                    {
                        t.value = Ckeywords[t.value]; // filter out D keywords
                    }
                }
                else if (t.value >= FirstCKeyword)
                    t.value = TOK.identifier; // filter out C keywords
                else if (*t.ptr == '_') // if special identifier token
                {
                    // Lazy initialization
                    TimeStampInfo.initialize(t.loc);
                    if (id == Id.DATE)
                    {
                        t.ustring = TimeStampInfo.date.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIME)
                    {
                        t.ustring = TimeStampInfo.time.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.VENDOR)
                    {
                        t.ustring = global.vendor.xarraydup.ptr;
                        goto Lstr;
                    }
                    else if (id == Id.TIMESTAMP)
                    {
                        t.ustring = TimeStampInfo.timestamp.ptr;
                    Lstr:
                        t.value = TOK.string_;
                        t.postfix = 0;
                        t.len = cast(uint)strlen(t.ustring);
                    }
                    else if (id == Id.VERSIONX)
                    {
                        t.value = TOK.int64Literal;
                        t.unsvalue = global.versionNumber();
                    }
                    else if (id == Id.EOFX)
                    {
                        t.value = TOK.endOfFile;
                        // Advance scanner to end of file
                        while (!(*p == 0 || *p == 0x1A))
                            p++;
                    }
                }
                //printf("t.value = %d\n",t.value);
                return;
            }
        case '/':
            p++;
            switch (*p)
            {
            case '=':
                p++;
                t.value = TOK.divAssign;
                return;
            case '*':
                // Block comment /* ... */
                p++;
                startLoc = loc();
                while (1)
                {
                    while (1)
                    {
                        const c = *p;
                        switch (c)
                        {
                        case '/':
                            break;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case 0:
                        case 0x1A:
                            error("unterminated /* */ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                const u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    p++;
                    // Closing */ found, and it isn't the opening /* reread as "*/"
                    if (p[-2] == '*' && p - 3 != t.ptr)
                        break;
                }
                if (commentToken)
                {
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                {
                    // if /** but not /**/
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                continue;
            case '/': // do // style comments
                startLoc = loc();
                while (1)
                {
                    const c = *++p;
                    switch (c)
                    {
                    case '\n':
                        break;
                    case '\r':
                        if (p[1] == '\n')
                            p++;
                        break;
                    case 0:
                    case 0x1A:
                        if (commentToken)
                        {
                            p = end;
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '/')
                        {
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        p = end;
                        t.loc = loc();
                        t.value = TOK.endOfFile;
                        return;
                    default:
                        if (c & 0x80)
                        {
                            const u = decodeUTF();
                            if (u == PS || u == LS)
                                break;
                        }
                        continue;
                    }
                    break;
                }
                if (commentToken)
                {
                    version (DMDLIB) {}
                    else
                    {
                        p++;
                        endOfLine();
                    }
                    t.loc = startLoc;
                    t.value = TOK.comment;
                    return;
                }
                if (doDocComment && t.ptr[2] == '/')
                {
                    getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                    lastDocLine = scanloc.linnum;
                }
                p++;
                endOfLine();
                continue;
            case '+':
                // Nesting comment /+ ... +/ (D only)
                if (!Ccompile)
                {
                    int nest;
                    startLoc = loc();
                    p++;
                    nest = 1;
                    while (1)
                    {
                        char c = *p;
                        switch (c)
                        {
                        case '/':
                            p++;
                            if (*p == '+')
                            {
                                p++;
                                nest++;
                            }
                            continue;
                        case '+':
                            p++;
                            if (*p == '/')
                            {
                                p++;
                                if (--nest == 0)
                                    break;
                            }
                            continue;
                        case '\r':
                            p++;
                            if (*p != '\n')
                                endOfLine();
                            continue;
                        case '\n':
                            endOfLine();
                            p++;
                            continue;
                        case 0:
                        case 0x1A:
                            error("unterminated /+ +/ comment");
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                uint u = decodeUTF();
                                if (u == PS || u == LS)
                                    endOfLine();
                            }
                            p++;
                            continue;
                        }
                        break;
                    }
                    if (commentToken)
                    {
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                    {
                        // if /++ but not /++/
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    continue;
                }
                break;
            default:
                break;
            }
            t.value = TOK.div;
            return;
        case '.':
            p++;
            if (isdigit(*p))
            {
                /* Note that we don't allow ._1 and ._ as being
                 * valid floating point numbers.
                 */
                p--;
                t.value = inreal(t);
            }
            else if (p[0] == '.')
            {
                if (p[1] == '.')
                {
                    p += 2;
                    t.value = TOK.dotDotDot;
                }
                else
                {
                    p++;
                    t.value = TOK.slice;
                }
            }
            else
                t.value = TOK.dot;
            return;
        case '&':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.andAssign;
            }
            else if (*p == '&')
            {
                p++;
                t.value = TOK.andAnd;
            }
            else
                t.value = TOK.and;
            return;
        case '|':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.orAssign;
            }
            else if (*p == '|')
            {
                p++;
                t.value = TOK.orOr;
            }
            else
                t.value = TOK.or;
            return;
        case '-':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.minAssign;
            }
            else if (*p == '-')
            {
                p++;
                t.value = TOK.minusMinus;
            }
            else if (*p == '>')
            {
                ++p;
                t.value = TOK.arrow;
            }
            else
                t.value = TOK.min;
            return;
        case '+':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.addAssign;
            }
            else if (*p == '+')
            {
                p++;
                t.value = TOK.plusPlus;
            }
            else
                t.value = TOK.add;
            return;
        case '<':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.lessOrEqual; // <=
            }
            else if (*p == '<')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.leftShiftAssign; // <<=
                }
                else
                    t.value = TOK.leftShift; // <<
            }
            else if (*p == ':' && Ccompile)
            {
                ++p;
                t.value = TOK.leftBracket; // <:  C digraph for [
            }
            else if (*p == '%' && Ccompile)
            {
                ++p;
                t.value = TOK.leftCurly; // <%  C digraph for {
            }
            else
                t.value = TOK.lessThan; // <
            return;
        case '>':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.greaterOrEqual; // >=
            }
            else if (*p == '>')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.rightShiftAssign; // >>=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.unsignedRightShiftAssign; // >>>=
                    }
                    else
                        t.value = TOK.unsignedRightShift; // >>>
                }
                else
                    t.value = TOK.rightShift; // >>
            }
            else
                t.value = TOK.greaterThan; // >
            return;
        case '!':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.notEqual; // !=
            }
            else
                t.value = TOK.not; // !
            return;
        case '=':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.equal; // ==
            }
            else if (*p == '>')
            {
                p++;
                t.value = TOK.goesTo; // =>
            }
            else
                t.value = TOK.assign; // =
            return;
        case '~':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.concatenateAssign; // ~=
            }
            else
                t.value = TOK.tilde; // ~
            return;
        case '^':
            p++;
            if (*p == '^')
            {
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.powAssign; // ^^=
                }
                else
                    t.value = TOK.pow; // ^^
            }
            else if (*p == '=')
            {
                p++;
                t.value = TOK.xorAssign; // ^=
            }
            else
                t.value = TOK.xor; // ^
            return;
        case '(':
            p++;
            t.value = TOK.leftParenthesis;
            return;
        case ')':
            p++;
            t.value = TOK.rightParenthesis;
            return;
        case '[':
            p++;
            t.value = TOK.leftBracket;
            return;
        case ']':
            p++;
            t.value = TOK.rightBracket;
            return;
        case '{':
            p++;
            t.value = TOK.leftCurly;
            return;
        case '}':
            p++;
            t.value = TOK.rightCurly;
            return;
        case '?':
            p++;
            t.value = TOK.question;
            return;
        case ',':
            p++;
            t.value = TOK.comma;
            return;
        case ';':
            p++;
            t.value = TOK.semicolon;
            return;
        case ':':
            p++;
            if (*p == ':')
            {
                ++p;
                t.value = TOK.colonColon;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightBracket; // :>  C digraph for ]
            }
            else
                t.value = TOK.colon;
            return;
        case '$':
            p++;
            t.value = TOK.dollar;
            return;
        case '@':
            p++;
            t.value = TOK.at;
            return;
        case '*':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.mulAssign;
            }
            else
                t.value = TOK.mul;
            return;
        case '%':
            p++;
            if (*p == '=')
            {
                p++;
                t.value = TOK.modAssign;
            }
            else if (*p == '>' && Ccompile)
            {
                ++p;
                t.value = TOK.rightCurly; // %>  C digraph for }
            }
            else if (*p == ':' && Ccompile)
            {
                goto case '#'; // %: means #
            }
            else
                t.value = TOK.mod;
            return;
        case '#':
            {
                // https://issues.dlang.org/show_bug.cgi?id=22825
                // Special token sequences are terminated by newlines,
                // and should not be skipped over.
                this.tokenizeNewlines = true;
                p++;
                if (parseSpecialTokenSequence())
                    continue;
                t.value = TOK.pound;
                return;
            }
        default:
            {
                dchar c = *p;
                if (c & 0x80)
                {
                    c = decodeUTF();
                    // Check for start of unicode identifier
                    if (isUniAlpha(c))
                        goto case_ident;
                    if (c == PS || c == LS)
                    {
                        endOfLine();
                        p++;
                        if (tokenizeNewlines)
                        {
                            t.value = TOK.endOfLine;
                            tokenizeNewlines = false;
                            return;
                        }
                        continue;
                    }
                }
                if (c < 0x80 && isprint(c))
                    error("character '%c' is not a valid token", c);
                else
                    error("character 0x%02x is not a valid token", c);
                p++;
                continue;
            }
        }
    }
}
/********************
 * Return the token following `ct`, scanning it on demand and caching it
 * on `ct.next` so repeated peeks are cheap.
 */
final Token* peek(Token* ct)
{
    // Reuse a token we already scanned ahead, if any.
    if (auto ahead = ct.next)
        return ahead;
    auto t = allocateToken();
    scan(t);
    ct.next = t;
    return t;
}
/*********************************
 * tk is on the opening (.
 * Look ahead and return token that is past the closing ).
 * Stops early (returning the current token) on an unbalanced '}' or on a
 * ';' outside any braces, so a missing ')' cannot run away to end of file.
 */
final Token* peekPastParen(Token* tk)
{
    //printf("peekPastParen()\n");
    int parens = 1;
    int curlynest = 0;
    while (1)
    {
        tk = peek(tk);
        //tk.print();
        switch (tk.value)
        {
        case TOK.leftParenthesis:
            parens++;
            continue;
        case TOK.rightParenthesis:
            --parens;
            if (parens)
                continue;
            // matched the opening ( : return the token after it
            tk = peek(tk);
            break;
        case TOK.leftCurly:
            curlynest++;
            continue;
        case TOK.rightCurly:
            if (--curlynest >= 0)
                continue;
            break; // unbalanced } : give up
        case TOK.semicolon:
            if (curlynest)
                continue;
            break; // ; at paren nesting level : give up
        case TOK.endOfFile:
            break;
        default:
            continue;
        }
        return tk;
    }
}
/*******************************************
 * Parse escape sequence.
 * Convenience wrapper over the static overload, using the current token's
 * location and advancing `p` past the sequence.
 */
private uint escapeSequence()
{
    return Lexer.escapeSequence(token.loc, p, Ccompile);
}
/********
 * Parse the given string literal escape sequence into a single character.
 * D https://dlang.org/spec/lex.html#escape_sequences
 * C11 6.4.4.4
 * Params:
 *  loc = location to use for error messages
 *  sequence = pointer to string with escape sequence to parse. Updated to
 *             point past the end of the escape sequence
 *  Ccompile = true for compile C11 escape sequences
 * Returns:
 *  the escape sequence as a single character
 */
private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence, bool Ccompile)
{
    const(char)* p = sequence; // cache sequence reference on stack
    scope(exit) sequence = p;  // write the advanced pointer back on every exit path
    uint c = *p;
    int ndigits;
    switch (c)
    {
    case '\'':
    case '"':
    case '?':
    case '\\':
    Lconsume:
        p++;
        break;
    // Simple single-character escapes map straight to their ASCII codes.
    case 'a':
        c = 7;
        goto Lconsume;
    case 'b':
        c = 8;
        goto Lconsume;
    case 'f':
        c = 12;
        goto Lconsume;
    case 'n':
        c = 10;
        goto Lconsume;
    case 'r':
        c = 13;
        goto Lconsume;
    case 't':
        c = 9;
        goto Lconsume;
    case 'v':
        c = 11;
        goto Lconsume;
    case 'u':
        ndigits = 4;
        goto Lhex;
    case 'U':
        ndigits = 8;
        goto Lhex;
    case 'x':
        ndigits = 2;
    Lhex:
        p++;
        c = *p;
        if (ishex(cast(char)c))
        {
            uint v = 0;
            int n = 0;
            if (Ccompile && ndigits == 2)
            {
                /* C11 6.4.4.4-7 one to infinity hex digits
                 */
                do
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                } while (ishex(cast(char)c));
            }
            else
            {
                // D requires exactly ndigits hex digits.
                while (1)
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                    if (++n == ndigits)
                        break;
                    if (!ishex(cast(char)c))
                    {
                        .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                        break;
                    }
                }
                if (ndigits != 2 && !utf_isValidDchar(v))
                {
                    .error(loc, "invalid UTF character \\U%08x", v);
                    v = '?'; // recover with valid UTF character
                }
            }
            c = v;
        }
        else
        {
            .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
            p++;
        }
        break;
    case '&':
        if (Ccompile)
            goto default;
        // named character entity, e.g. \&amp;
        for (const idstart = ++p; 1; p++)
        {
            switch (*p)
            {
            case ';':
                c = HtmlNamedEntity(idstart, p - idstart);
                if (c == ~0)
                {
                    .error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                    c = '?';
                }
                p++;
                break;
            default:
                // entity names are alphanumeric but must start with a letter
                if (isalpha(*p) || (p != idstart && isdigit(*p)))
                    continue;
                .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                c = '?';
                break;
            }
            break;
        }
        break;
    case 0:
    case 0x1A:
        // end of file
        c = '\\';
        break;
    default:
        if (isoctal(cast(char)c))
        {
            // up to 3 octal digits
            uint v = 0;
            int n = 0;
            do
            {
                v = v * 8 + (c - '0');
                c = *++p;
            }
            while (++n < 3 && isoctal(cast(char)c));
            c = v;
            if (c > 0xFF)
                .error(loc, "escape octal sequence \\%03o is larger than \\377", c);
        }
        else
        {
            .error(loc, "undefined escape sequence \\%c", c);
            p++;
        }
        break;
    }
    return c;
}
/**
Lex a wysiwyg string. `p` must be pointing to the first character before the
contents of the string literal. The character pointed to by `p` will be used as
the terminating character (i.e. backtick or double-quote).
No escape sequences are recognized; only line endings are normalized to '\n'.
Params:
    result = pointer to the token that accepts the result
*/
private void wysiwygStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    auto terminator = p[0];
    p++;
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = p[0];
        p++;
        switch (c)
        {
        case '\n':
            endOfLine();
            break;
        case '\r':
            if (p[0] == '\n')
                continue; // ignore; the following '\n' is handled instead
            c = '\n'; // treat EndOfLine as \n character
            endOfLine();
            break;
        case 0:
        case 0x1A:
            error("unterminated string constant starting at %s", start.toChars());
            result.setString();
            // rewind `p` so it points to the EOF character
            p--;
            return;
        default:
            if (c == terminator)
            {
                result.setString(stringbuffer);
                stringPostfix(result);
                return;
            }
            else if (c & 0x80)
            {
                // multi-byte UTF-8: decode, check for PS/LS line separators
                p--;
                const u = decodeUTF();
                p++;
                if (u == PS || u == LS)
                    endOfLine();
                stringbuffer.writeUTF8(u);
                continue;
            }
            break;
        }
        stringbuffer.writeByte(c);
    }
}
/**
Lex a delimited string. Some examples of delimited strings are:
---
q"(foo(xxx))"   // "foo(xxx)"
q"[foo$(LPAREN)]"       // "foo$(LPAREN)"
q"/foo]/"       // "foo]"
q"HERE
foo
HERE"           // "foo\n"
---
It is assumed that `p` points to the opening double-quote '"'.
The first content character selects the delimiter kind: a bracketing
character nests, an identifier starts a heredoc, anything else is a
literal single-character delimiter.
Params:
    result = pointer to the token that accepts the result
*/
private void delimitedStringConstant(Token* result)
{
    result.value = TOK.string_;
    Loc start = loc();
    dchar delimleft = 0;
    dchar delimright = 0;
    uint nest = 1;
    uint nestcount = ~0; // dead assignment, needed to suppress warning
    Identifier hereid = null;
    uint blankrol = 0;   // heredoc: rest of opening line must be blank
    uint startline = 0;  // at start of a line (heredoc terminator may appear)
    p++;
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = *p++;
        //printf("c = '%c'\n", c);
        switch (c)
        {
        case '\n':
        Lnextline:
            endOfLine();
            startline = 1;
            if (blankrol)
            {
                blankrol = 0;
                continue;
            }
            if (hereid)
            {
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            goto Lnextline;
        case 0:
        case 0x1A:
            error("unterminated delimited string constant starting at %s", start.toChars());
            result.setString();
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            return;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                p++;
                if (c == PS || c == LS)
                    goto Lnextline;
            }
            break;
        }
        if (delimleft == 0)
        {
            // First content character: establish the delimiter.
            delimleft = c;
            nest = 1;
            nestcount = 1;
            if (c == '(')
                delimright = ')';
            else if (c == '{')
                delimright = '}';
            else if (c == '[')
                delimright = ']';
            else if (c == '<')
                delimright = '>';
            else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
            {
                // Start of identifier; must be a heredoc
                Token tok;
                p--;
                scan(&tok); // read in heredoc identifier
                if (tok.value != TOK.identifier)
                {
                    error("identifier expected for heredoc, not %s", tok.toChars());
                    delimright = c;
                }
                else
                {
                    hereid = tok.ident;
                    //printf("hereid = '%s'\n", hereid.toChars());
                    blankrol = 1;
                }
                nest = 0;
            }
            else
            {
                delimright = c;
                nest = 0;
                if (isspace(c))
                    error("delimiter cannot be whitespace");
            }
        }
        else
        {
            // Inside the string body.
            if (blankrol)
            {
                error("heredoc rest of line should be blank");
                blankrol = 0;
                continue;
            }
            if (nest == 1)
            {
                // bracketing delimiters nest
                if (c == delimleft)
                    nestcount++;
                else if (c == delimright)
                {
                    nestcount--;
                    if (nestcount == 0)
                        goto Ldone;
                }
            }
            else if (c == delimright)
                goto Ldone;
            // A heredoc terminates when its identifier reappears at line start.
            if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
            {
                Token tok;
                auto psave = p;
                p--;
                scan(&tok); // read in possible heredoc identifier
                //printf("endid = '%s'\n", tok.ident.toChars());
                if (tok.value == TOK.identifier && tok.ident is hereid)
                {
                    /* should check that rest of line is blank
                     */
                    goto Ldone;
                }
                p = psave;
            }
            stringbuffer.writeUTF8(c);
            startline = 0;
        }
    }
Ldone:
    if (*p == '"')
        p++;
    else if (hereid)
        error("delimited string must end in `%s\"`", hereid.toChars());
    else if (isspace(delimright))
        error("delimited string must end in `\"`");
    else
        error("delimited string must end in `%c\"`", delimright);
    result.setString(stringbuffer);
    stringPostfix(result);
}
/**
Lex a token string. Some examples of token strings are:
---
q{ foo(xxx) }    // " foo(xxx) "
q{foo$(LPAREN)}  // "foo$(LPAREN)"
q{{foo}"}"}      // "{foo}"}""
---
It is assumed that `p` points to the opening curly-brace.
The contents are scanned as tokens (so braces inside string literals do
not count), and the raw source text between the braces becomes the value.
Params:
    result = pointer to the token that accepts the result
*/
private void tokenStringConstant(Token* result)
{
    result.value = TOK.string_;
    uint nest = 1;
    const start = loc();
    const pstart = ++p;
    // track nesting depth so nested q{} strings lex correctly
    inTokenStringConstant++;
    scope(exit) inTokenStringConstant--;
    while (1)
    {
        Token tok;
        scan(&tok);
        switch (tok.value)
        {
        case TOK.leftCurly:
            nest++;
            continue;
        case TOK.rightCurly:
            if (--nest == 0)
            {
                // value is the raw text, excluding the closing '}'
                result.setString(pstart, p - 1 - pstart);
                stringPostfix(result);
                return;
            }
            continue;
        case TOK.endOfFile:
            error("unterminated token string constant starting at %s", start.toChars());
            result.setString();
            return;
        default:
            continue;
        }
    }
}
/**
Scan a quoted string while building the processed string value by
handling escape sequences. The result is returned in the given `t` token.
This function assumes that `p` currently points to the opening quote
of the string.
In ImportC mode (`Ccompile`), a bare newline inside the literal is an
error rather than part of the string.
Params:
    t = the token to set the resulting string to
* References:
*    D https://dlang.org/spec/lex.html#double_quoted_strings
*    ImportC C11 6.4.5
*/
private void escapeStringConstant(Token* t)
{
    t.value = TOK.string_;
    const start = loc();
    const tc = *p++; // opening quote
    stringbuffer.setsize(0);
    while (1)
    {
        dchar c = *p++;
        switch (c)
        {
        case '\\':
            switch (*p)
            {
            case '&':
                if (Ccompile)
                    goto default;
                goto case;
            case 'u':
            case 'U':
                // may produce a code point > 0xFF; encode as UTF-8
                c = escapeSequence();
                stringbuffer.writeUTF8(c);
                continue;
            default:
                c = escapeSequence();
                break;
            }
            break;
        case '\n':
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\r':
            if (*p == '\n')
                continue; // ignore
            c = '\n'; // treat EndOfLine as \n character
            endOfLine();
            if (Ccompile)
                goto Lunterminated;
            break;
        case '\'':
        case '"':
            if (c != tc)
                goto default; // the other quote kind is just a literal character
            t.setString(stringbuffer);
            if (!Ccompile)
                stringPostfix(t);
            return;
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
        Lunterminated:
            error("unterminated string constant starting at %s", start.toChars());
            t.setString();
            return;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                if (c == LS || c == PS)
                {
                    c = '\n';
                    endOfLine();
                    if (Ccompile)
                        goto Lunterminated;
                }
                p++;
                stringbuffer.writeUTF8(c);
                continue;
            }
            break;
        }
        stringbuffer.writeByte(c);
    }
}
/**************************************
 * Lex a D character literal, with the parser on the opening quote.
 * Returns TOK.charLiteral, TOK.wcharLiteral or TOK.dcharLiteral depending
 * on the smallest character type that can hold the value.
 * Reference:
 *  https://dlang.org/spec/lex.html#characterliteral
 */
private TOK charConstant(Token* t)
{
    TOK tk = TOK.charLiteral;
    //printf("Lexer::charConstant\n");
    p++;
    dchar c = *p++;
    switch (c)
    {
    case '\\':
        switch (*p)
        {
        case 'u':
            t.unsvalue = escapeSequence();
            tk = TOK.wcharLiteral;
            break;
        case 'U':
        case '&':
            t.unsvalue = escapeSequence();
            tk = TOK.dcharLiteral;
            break;
        default:
            t.unsvalue = escapeSequence();
            break;
        }
        break;
    case '\n':
    L1:
        endOfLine();
        goto case;
    case '\r':
        goto case '\''; // newline inside '' : unterminated
    case 0:
    case 0x1A:
        // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
        p--;
        goto case;
    case '\'':
        error("unterminated character constant");
        t.unsvalue = '?';
        return tk;
    default:
        if (c & 0x80)
        {
            p--;
            c = decodeUTF();
            p++;
            if (c == LS || c == PS)
                goto L1;
            // pick the narrowest character type that holds the code point
            if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                tk = TOK.wcharLiteral;
            else
                tk = TOK.dcharLiteral;
        }
        t.unsvalue = c;
        break;
    }
    if (*p != '\'')
    {
        // Error recovery: skip ahead to a plausible end of the literal.
        while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
               *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
        {
            if (*p & 0x80)
            {
                const s = p;
                c = decodeUTF();
                if (c == LS || c == PS)
                {
                    p = s;
                    break;
                }
            }
            p++;
        }
        if (*p == '\'')
        {
            error("character constant has multiple characters");
            p++;
        }
        else
            error("unterminated character constant");
        t.unsvalue = '?';
        return tk;
    }
    p++;
    return tk;
}
/***************************************
 * Lex C character constant.
 * Parser is on the opening quote.
 * Params:
 *  t = token to fill in
 *  prefix = one of `u`, `U` or 0.
 * Reference:
 *  C11 6.4.4.4
 */
private void clexerCharConstant(ref Token t, char prefix)
{
    escapeStringConstant(&t);
    const(char)[] str = t.ustring[0 .. t.len];
    const n = str.length;
    const loc = t.loc;
    if (n == 0)
    {
        error(loc, "empty character constant");
        t.value = TOK.semicolon;
        return;
    }
    uint u;
    switch (prefix)
    {
    case 0:
        if (n == 1) // fast case
        {
            u = str[0];
        }
        else if (n > 4)
            error(loc, "max number of chars in character literal is 4, had %d",
                cast(int)n);
        else
        {
            // Pack a multi-character constant, first char in the most
            // significant byte: 'ab' == ('a' << 8) | 'b'
            foreach (i, c; str)
                (cast(char*)&u)[n - 1 - i] = c;
        }
        break;
    case 'u':
        // 16-bit character constant: at most two code points, each of
        // which must fit in 16 bits (0 .. 0xFFFF).
        dchar d1;
        size_t idx;
        auto msg = utf_decodeChar(str, idx, d1);
        dchar d2 = 0;
        if (idx < n && !msg)
            msg = utf_decodeChar(str, idx, d2);
        if (msg)
            error(loc, "%s", msg);
        else if (idx < n)
            error(loc, "max number of chars in 16 bit character literal is 2, had %d",
                (n + 1) >> 1);
        // Fix: was `d1 > 0x1_0000`, which wrongly accepted the value
        // 0x10000 itself — the largest value that fits in 16 bits is 0xFFFF.
        else if (d1 >= 0x1_0000)
            error(loc, "%d does not fit in 16 bits", d1);
        else if (d2 >= 0x1_0000)
            error(loc, "%d does not fit in 16 bits", d2);
        u = d1;
        if (d2)
            u = (d1 << 16) | d2;
        break;
    case 'U':
        // 32-bit character constant: exactly one code point.
        dchar d;
        size_t idx;
        auto msg = utf_decodeChar(str, idx, d);
        if (msg)
            error(loc, "%s", msg);
        else if (idx < n)
            error(loc, "max number of chars in 32 bit character literal is 1, had %d",
                (n + 3) >> 2);
        u = d;
        break;
    default:
        assert(0);
    }
    // single source char stays a char literal; otherwise it's an int
    t.value = n == 1 ? TOK.charLiteral : TOK.int32Literal;
    t.unsvalue = u;
}
/***************************************
 * Get postfix of string literal: consume a trailing 'c', 'w' or 'd'
 * suffix if present, else record no postfix.
 */
private void stringPostfix(Token* t) pure @nogc
{
    const c = *p;
    if (c == 'c' || c == 'w' || c == 'd')
    {
        t.postfix = c;
        p++;
    }
    else
        t.postfix = 0;
}
/**************************************
* Read in a number.
* If it's an integer, store it in tok.TKutok.Vlong.
* integers can be decimal, octal or hex
* Handle the suffixes U, UL, LU, L, etc.
* If it's double, store it in tok.TKutok.Vdouble.
* Returns:
* TKnum
* TKdouble,...
*/
private TOK number(Token* t)
{
int base = 10;
const start = p;
uinteger_t n = 0; // unsigned >=64 bit integer type
int d;
bool err = false;
bool overflow = false;
bool anyBinaryDigitsNoSingleUS = false;
bool anyHexDigitsNoSingleUS = false;
char errorDigit = 0;
dchar c = *p;
if (c == '0')
{
++p;
c = *p;
switch (c)
{
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
base = 8;
break;
case '8':
case '9':
errorDigit = cast(char) c;
base = 8;
break;
case 'x':
case 'X':
++p;
base = 16;
break;
case 'b':
case 'B':
if (Ccompile)
error("binary constants not allowed");
++p;
base = 2;
break;
case '.':
if (p[1] == '.')
goto Ldone; // if ".."
if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
{
if (Ccompile && (p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
goto Lreal; // if `0.f` or `0.L`
goto Ldone; // if ".identifier" or ".unicode"
}
goto Lreal; // '.' is part of current token
case 'i':
case 'f':
case 'F':
goto Lreal;
case '_':
if (Ccompile)
error("embedded `_` not allowed");
++p;
base = 8;
break;
case 'L':
if (p[1] == 'i')
goto Lreal;
break;
default:
break;
}
}
while (1)
{
c = *p;
switch (c)
{
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
++p;
d = c - '0';
break;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
++p;
if (base != 16)
{
if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
goto Lreal;
}
if (c >= 'a')
d = c + 10 - 'a';
else
d = c + 10 - 'A';
break;
case 'L':
if (p[1] == 'i')
goto Lreal;
goto Ldone;
case '.':
if (p[1] == '.')
goto Ldone; // if ".."
if (base <= 10 && n > 0 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
{
if (Ccompile && base == 10 &&
(p[1] == 'e' || p[1] == 'E' || p[1] == 'f' || p[1] == 'F' || p[1] == 'l' || p[1] == 'L'))
goto Lreal; // if `1.e6` or `1.f` or `1.L`
goto Ldone; // if ".identifier" or ".unicode"
}
if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
goto Ldone; // if ".identifier" or ".unicode"
if (base == 2)
goto Ldone; // if ".identifier" or ".unicode"
goto Lreal; // otherwise as part of a floating point literal
case 'p':
case 'P':
case 'i':
Lreal:
p = start;
return inreal(t);
case '_':
if (Ccompile)
goto default;
++p;
continue;
default:
goto Ldone;
}
// got a digit here, set any necessary flags, check for errors
anyHexDigitsNoSingleUS = true;
anyBinaryDigitsNoSingleUS = true;
if (!errorDigit && d >= base)
{
errorDigit = cast(char) c;
}
// Avoid expensive overflow check if we aren't at risk of overflow
if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
n = n * base + d;
else
{
import core.checkedint : mulu, addu;
n = mulu(n, base, overflow);
n = addu(n, d, overflow);
}
}
Ldone:
if (errorDigit)
{
error("%s digit expected, not `%c`", base == 2 ? "binary".ptr :
base == 8 ? "octal".ptr :
"decimal".ptr, errorDigit);
err = true;
}
if (overflow && !err)
{
error("integer overflow");
err = true;
}
if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
(base == 16 && !anyHexDigitsNoSingleUS))
error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);
t.unsvalue = n;
if (Ccompile)
return cnumber(base, n);
enum FLAGS : int
{
none = 0,
decimal = 1, // decimal
unsigned = 2, // u or U suffix
long_ = 4, // L suffix
}
FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
// Parse trailing 'u', 'U', 'l' or 'L' in any combination
const psuffix = p;
while (1)
{
FLAGS f;
switch (*p)
{
case 'U':
case 'u':
f = FLAGS.unsigned;
goto L1;
case 'l':
f = FLAGS.long_;
error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
goto L1;
case 'L':
f = FLAGS.long_;
L1:
p++;
if ((flags & f) && !err)
{
error("unrecognized token");
err = true;
}
flags = cast(FLAGS)(flags | f);
continue;
default:
break;
}
break;
}
if (base == 8 && n >= 8)
{
if (err)
// can't translate invalid octal value, just show a generic message
error("octal literals larger than 7 are no longer supported");
else
error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!\"%llo%.*s\"` instead",
n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
}
TOK result;
switch (flags)
{
case FLAGS.none:
/* Octal or Hexadecimal constant.
* First that fits: int, uint, long, ulong
*/
if (n & 0x8000000000000000L)
result = TOK.uns64Literal;
else if (n & 0xFFFFFFFF00000000L)
result = TOK.int64Literal;
else if (n & 0x80000000)
result = TOK.uns32Literal;
else
result = TOK.int32Literal;
break;
case FLAGS.decimal:
/* First that fits: int, long, long long
*/
if (n & 0x8000000000000000L)
{
result = TOK.uns64Literal;
}
else if (n & 0xFFFFFFFF80000000L)
result = TOK.int64Literal;
else
result = TOK.int32Literal;
break;
case FLAGS.unsigned:
case FLAGS.decimal | FLAGS.unsigned:
/* First that fits: uint, ulong
*/
if (n & 0xFFFFFFFF00000000L)
result = TOK.uns64Literal;
else
result = TOK.uns32Literal;
break;
case FLAGS.decimal | FLAGS.long_:
if (n & 0x8000000000000000L)
{
if (!err)
{
error("signed integer overflow");
err = true;
}
result = TOK.uns64Literal;
}
else
result = TOK.int64Literal;
break;
case FLAGS.long_:
if (n & 0x8000000000000000L)
result = TOK.uns64Literal;
else
result = TOK.int64Literal;
break;
case FLAGS.unsigned | FLAGS.long_:
case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
result = TOK.uns64Literal;
break;
default:
debug
{
printf("%x\n", flags);
}
assert(0);
}
return result;
}
/**************************************
* Lex C integer-suffix
* Params:
* base = number base
* n = raw integer value
* Returns:
* token value
*/
private TOK cnumber(int base, uinteger_t n)
{
/* C11 6.4.4.1
* Parse trailing suffixes:
* u or U
* l or L
* ll or LL
*/
enum FLAGS : uint
{
octalhex = 1, // octal or hexadecimal
decimal = 2, // decimal
unsigned = 4, // u or U suffix
long_ = 8, // l or L suffix
llong = 0x10 // ll or LL
}
FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.octalhex;
bool err;
// Consume any run of u/U and l/L/ll/LL suffix characters, accumulating
// them into `flags`. A suffix kind appearing twice is diagnosed once.
Lsuffixes:
while (1)
{
FLAGS f;
const cs = *p;
switch (cs)
{
case 'U':
case 'u':
f = FLAGS.unsigned;
break;
case 'l':
case 'L':
f = FLAGS.long_;
if (cs == p[1])
{
// 'll' or 'LL': consume the first character of the pair here,
// the shared ++p below consumes the second
f = FLAGS.long_ | FLAGS.llong;
++p;
}
break;
default:
break Lsuffixes; // not a suffix character, stop scanning
}
++p;
if ((flags & f) && !err)
{
error("duplicate integer suffixes");
err = true;
}
flags = cast(FLAGS)(flags | f);
}
// Choose the token type per C11's "first type in the list that fits"
// rule for each radix/suffix combination, testing value magnitude bits.
TOK result = TOK.int32Literal; // default
switch (flags)
{
/* Since D doesn't have a variable sized `long` or `unsigned long` type,
* this code deviates from C by picking D int, uint, long, or ulong instead
*/
case FLAGS.octalhex:
/* Octal or Hexadecimal constant.
* First that fits: int, unsigned, long, unsigned long,
* long long, unsigned long long
*/
if (n & 0x8000000000000000L)
result = TOK.uns64Literal; // unsigned long
else if (n & 0xFFFFFFFF00000000L)
result = TOK.int64Literal; // long
else if (n & 0x80000000)
result = TOK.uns32Literal;
else
result = TOK.int32Literal;
break;
case FLAGS.decimal:
/* First that fits: int, long, long long
*/
if (n & 0x8000000000000000L)
result = TOK.uns64Literal; // unsigned long
else if (n & 0xFFFFFFFF80000000L)
result = TOK.int64Literal; // long
else
result = TOK.int32Literal;
break;
case FLAGS.octalhex | FLAGS.unsigned:
case FLAGS.decimal | FLAGS.unsigned:
/* First that fits: unsigned, unsigned long, unsigned long long
*/
if (n & 0xFFFFFFFF00000000L)
result = TOK.uns64Literal; // unsigned long
else
result = TOK.uns32Literal;
break;
case FLAGS.decimal | FLAGS.long_:
/* First that fits: long, long long
*/
// longsize/long_longsize are the target's C type sizes (see class fields)
if (longsize == 4 || long_longsize == 4)
{
if (n & 0xFFFFFFFF_80000000L)
result = TOK.int64Literal;
else
result = TOK.int32Literal; // long
}
else
{
result = TOK.int64Literal; // long
}
break;
case FLAGS.octalhex | FLAGS.long_:
/* First that fits: long, unsigned long, long long,
* unsigned long long
*/
if (longsize == 4 || long_longsize == 4)
{
if (n & 0x8000000000000000L)
result = TOK.uns64Literal;
else if (n & 0xFFFFFFFF00000000L)
result = TOK.int64Literal;
else if (n & 0x80000000)
result = TOK.uns32Literal; // unsigned long
else
result = TOK.int32Literal; // long
}
else
{
if (n & 0x80000000_00000000L)
result = TOK.uns64Literal; // unsigned long
else
result = TOK.int64Literal; // long
}
break;
case FLAGS.octalhex | FLAGS.unsigned | FLAGS.long_:
case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
/* First that fits: unsigned long, unsigned long long
*/
if (longsize == 4 || long_longsize == 4)
{
if (n & 0xFFFFFFFF00000000L)
result = TOK.uns64Literal;
else
result = TOK.uns32Literal; // unsigned long
}
else
{
result = TOK.uns64Literal; // unsigned long
}
break;
case FLAGS.octalhex | FLAGS.long_ | FLAGS.llong:
/* First that fits: long long, unsigned long long
*/
if (n & 0x8000000000000000L)
result = TOK.uns64Literal;
else
result = TOK.int64Literal;
break;
case FLAGS.decimal | FLAGS.long_ | FLAGS.llong:
/* long long
*/
result = TOK.int64Literal;
break;
case FLAGS.octalhex | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
case FLAGS.decimal | FLAGS.long_ | FLAGS.unsigned | FLAGS.llong:
result = TOK.uns64Literal;
break;
default:
// Unreachable: every valid suffix combination is enumerated above
debug printf("%x\n",flags);
assert(0);
}
return result;
}
/**************************************
* Read in characters, converting them to real.
* Bugs:
* Exponent overflow not detected.
* Too much requested precision is not detected.
*/
private TOK inreal(Token* t)
{
//printf("Lexer::inreal()\n");
debug
{
assert(*p == '.' || isdigit(*p));
}
bool isWellformedString = true;
stringbuffer.setsize(0);
auto pstart = p;
bool hex = false;
dchar c = *p++;
// Leading '0x'
if (c == '0')
{
c = *p++;
if (c == 'x' || c == 'X')
{
hex = true;
c = *p++;
}
}
// Digits to left of '.'
while (1)
{
if (c == '.')
{
c = *p++;
break;
}
if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
{
c = *p++;
continue;
}
break;
}
// Digits to right of '.'
while (1)
{
if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
{
c = *p++;
continue;
}
break;
}
// Exponent: e/E for decimal floats, p/P for hex floats; the sign and
// at least one digit are required after the exponent character.
if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
{
c = *p++;
if (c == '-' || c == '+')
{
c = *p++;
}
bool anyexp = false;
while (1)
{
if (isdigit(c))
{
anyexp = true;
c = *p++;
continue;
}
if (c == '_')
{
if (Ccompile)
error("embedded `_` in numeric literals not allowed");
c = *p++;
continue;
}
if (!anyexp)
{
error("missing exponent");
isWellformedString = false;
}
break;
}
}
else if (hex)
{
error("exponent required for hex float");
isWellformedString = false;
}
// `c` was read with *p++, so p is one past the terminating character;
// back up so p points at the character that ended the scan.
--p;
// Copy the literal text [pstart, p) into stringbuffer, stripping the
// embedded '_' digit separators, and NUL-terminate for CTFloat.parse.
while (pstart < p)
{
if (*pstart != '_')
stringbuffer.writeByte(*pstart);
++pstart;
}
stringbuffer.writeByte(0);
auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
TOK result;
bool isOutOfRange = false;
t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero);
// Classify by suffix: f/F => float, L => real, none => double.
// Note the `default` case sits between 'f' and 'l' so that 'L' can
// `goto default` when the C target's long double is only 64 bits.
switch (*p)
{
case 'F':
case 'f':
if (isWellformedString && !isOutOfRange)
isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
result = TOK.float32Literal;
p++;
break;
default:
if (isWellformedString && !isOutOfRange)
isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
result = TOK.float64Literal;
break;
case 'l':
if (!Ccompile)
error("use 'L' suffix instead of 'l'");
goto case 'L';
case 'L':
++p;
if (Ccompile && long_doublesize == 8)
goto default;
result = TOK.float80Literal;
break;
}
// Optional imaginary suffix (D only): promote to the imaginary variant.
if ((*p == 'i' || *p == 'I') && !Ccompile)
{
if (*p == 'I')
error("use 'i' suffix instead of 'I'");
p++;
switch (result)
{
case TOK.float32Literal:
result = TOK.imaginary32Literal;
break;
case TOK.float64Literal:
result = TOK.imaginary64Literal;
break;
case TOK.float80Literal:
result = TOK.imaginary80Literal;
break;
default:
break;
}
}
const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
if (isOutOfRange && !isLong && (!Ccompile || hex))
{
/* C11 6.4.4.2 doesn't actually care if it is not representable if it is not hex
*/
const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : "";
error(scanloc, "number `%s%s` is not representable", sbufptr, suffix);
}
debug
{
switch (result)
{
case TOK.float32Literal:
case TOK.float64Literal:
case TOK.float80Literal:
case TOK.imaginary32Literal:
case TOK.imaginary64Literal:
case TOK.imaginary80Literal:
break;
default:
assert(0);
}
}
return result;
}
/****************************
 * Compute the location of the current scan position.
 * Returns:
 *      `scanloc` updated with the current column (1-based) and,
 *      when compiled with version LocOffset, the byte offset into the buffer.
 */
final Loc loc() pure @nogc
{
    const column = cast(uint)(p - line + 1);
    scanloc.charnum = column;
    version (LocOffset)
    {
        scanloc.fileOffset = cast(uint)(p - base);
    }
    return scanloc;
}
/****************************
 * Report an error at the location of the current token.
 * Params:
 *      format = printf-style format string; subsequent varargs are its arguments
 */
final void error(const(char)* format, ...)
{
va_list args;
va_start(args, format);
.verror(token.loc, format, args);
va_end(args);
}
/****************************
 * Report an error at an explicit location.
 * Params:
 *      loc = source location to attribute the error to
 *      format = printf-style format string; subsequent varargs are its arguments
 */
final void error(const ref Loc loc, const(char)* format, ...)
{
va_list args;
va_start(args, format);
.verror(loc, format, args);
va_end(args);
}
/****************************
 * Report a deprecation message at the location of the current token.
 * Params:
 *      format = printf-style format string; subsequent varargs are its arguments
 */
final void deprecation(const(char)* format, ...)
{
va_list args;
va_start(args, format);
.vdeprecation(token.loc, format, args);
va_end(args);
}
/***************************************
* Parse special token sequence:
* Returns:
* true if the special token sequence was handled
* References:
* https://dlang.org/spec/lex.html#special-token-sequence
*/
/***************************************
 * Parse special token sequence:
 * Returns:
 *      true if the special token sequence was handled
 * References:
 *      https://dlang.org/spec/lex.html#special-token-sequence
 */
bool parseSpecialTokenSequence()
{
    Token tok;
    scan(&tok);
    // `#if` gets a dedicated diagnostic pointing at the D equivalents
    if (tok.value == TOK.if_)
    {
        error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
        return false;
    }
    if (tok.value != TOK.identifier)
        return false;
    // Only `#line` is actually supported; any other `#identifier` is a warning
    if (tok.ident == Id.line)
    {
        poundLine(tok, false);
        return true;
    }
    const locx = loc();
    warning(locx, "C preprocessor directive `#%s` is not supported", tok.ident.toChars());
    return false;
}
/*********************************************
* Parse line/file preprocessor directive:
* #line linnum [filespec]
* Allow __LINE__ for linnum, and __FILE__ for filespec.
* Accept linemarker format:
* # linnum [filespec] {flags}
* There can be zero or more flags, which are one of the digits 1..4, and
* must be in ascending order. The flags are ignored.
* Params:
* tok = token we're on, which is linnum of linemarker
* linemarker = true if line marker format and lexer is on linnum
* References:
* linemarker https://gcc.gnu.org/onlinedocs/gcc-11.1.0/cpp/Preprocessor-Output.html
*/
final void poundLine(ref Token tok, bool linemarker)
{
// Default to the current line so `#line __LINE__` is a no-op on linnum
auto linnum = this.scanloc.linnum;
const(char)* filespec = null;
bool flags; // true once a linemarker flag digit has been accepted
if (!linemarker)
scan(&tok);
if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
{
// Reject line numbers that do not round-trip through int
const lin = cast(int)(tok.unsvalue);
if (lin != tok.unsvalue)
{
error(tok.loc, "line number `%lld` out of range", cast(ulong)tok.unsvalue);
skipToNextLine();
return;
}
else
linnum = lin;
}
else if (tok.value == TOK.line) // #line __LINE__
{
}
else
{
error(tok.loc, "positive integer argument expected following `#line`");
if (tok.value != TOK.endOfLine)
skipToNextLine();
return;
}
// Consume the rest of the directive: optional filespec, then (for
// linemarkers) optional flag digits, terminated by end of line/file.
while (1)
{
scan(&tok);
switch (tok.value)
{
case TOK.endOfFile:
case TOK.endOfLine:
// Apply the new location only outside q{} token strings
if (!inTokenStringConstant)
{
this.scanloc.linnum = linnum;
if (filespec)
this.scanloc.filename = filespec;
}
return;
case TOK.file:
// `#line n __FILE__` keeps (a copy of) the current filename
if (filespec || flags)
goto Lerr;
filespec = mem.xstrdup(scanloc.filename);
continue;
case TOK.string_:
if (filespec || flags)
goto Lerr;
// Must be a plain double-quoted string with no postfix character
if (tok.ptr[0] != '"' || tok.postfix != 0)
goto Lerr;
filespec = tok.ustring;
continue;
case TOK.int32Literal:
if (!filespec)
goto Lerr;
// Linemarker flags are the digits 1..4 and are ignored
if (linemarker && tok.unsvalue >= 1 && tok.unsvalue <= 4)
{
flags = true; // linemarker flags seen
continue;
}
goto Lerr;
default:
goto Lerr;
}
}
Lerr:
if (filespec is null)
error(tok.loc, "invalid filename for `#line` directive");
else if (linemarker)
error(tok.loc, "invalid flag for line marker directive");
else if (!Ccompile)
error(tok.loc, "found `%s` when expecting new line following `#line` directive", tok.toChars());
if (tok.value != TOK.endOfLine)
skipToNextLine();
}
/***************************************
* Scan forward to start of next line.
*/
/***************************************
 * Scan forward to start of next line.
 * Stops on (and does not advance past) a NUL or 0x1A end-of-file marker;
 * otherwise consumes through the newline (\n, \r, \r\n, LS, or PS) and
 * updates the line bookkeeping.
 */
final void skipToNextLine()
{
    for (;;)
    {
        const char ch = *p;
        if (ch == 0 || ch == 0x1A)
            return; // end of buffer: do not advance p, no new line started
        if (ch == '\n')
        {
            ++p;
            break;
        }
        if (ch == '\r')
        {
            ++p;
            if (p[0] == '\n')
                ++p; // swallow the \n of a \r\n pair
            break;
        }
        if (ch & 0x80)
        {
            // Multibyte sequence: decode it, then step past its last byte.
            // Unicode LS/PS count as line terminators.
            const u = decodeUTF();
            ++p;
            if (u == PS || u == LS)
                break;
            continue;
        }
        ++p; // ordinary character, keep scanning
    }
    endOfLine();
    tokenizeNewlines = false;
}
/********************************************
* Decode UTF character.
* Issue error messages for invalid sequences.
* Return decoded character, advance p to last character in UTF sequence.
*/
/********************************************
 * Decode UTF character.
 * Issue error messages for invalid sequences.
 * Return decoded character, advance p to last character in UTF sequence.
 */
private uint decodeUTF()
{
    const start = p;
    assert(*start & 0x80); // caller guarantees a multibyte lead byte
    // Determine how many bytes (at most 4) are available before a NUL
    size_t avail = 1;
    while (avail < 4 && start[avail])
        ++avail;
    size_t consumed = 0;
    dchar result;
    const errmsg = utf_decodeChar(start[0 .. avail], consumed, result);
    // Leave p on the final byte of the sequence, not one past it
    p += consumed - 1;
    if (errmsg)
    {
        error("%.*s", cast(int)errmsg.length, errmsg.ptr);
    }
    return result;
}
/***************************************************
* Parse doc comment embedded between t.ptr and p.
* Remove trailing blanks and tabs from lines.
* Replace all newlines with \n.
* Remove leading comment character from each line.
* Decide if it's a lineComment or a blockComment.
* Append to previous one for this token.
*
* If newParagraph is true, an extra newline will be
* added between adjoining doc comments.
*/
private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
{
/* ct tells us which kind of comment it is: '/', '*', or '+'
*/
const ct = t.ptr[2];
/* Start of comment text skips over / * *, / + +, or / / /
*/
const(char)* q = t.ptr + 3; // start of comment text
const(char)* qend = p;
if (ct == '*' || ct == '+')
qend -= 2;
/* Scan over initial row of ****'s or ++++'s or ////'s
*/
for (; q < qend; q++)
{
if (*q != ct)
break;
}
/* Remove leading spaces until start of the comment
*/
// linestart is 1 while positioned at the start of a line inside the
// comment; used to strip the leading decoration character per line
int linestart = 0;
if (ct == '/')
{
while (q < qend && (*q == ' ' || *q == '\t'))
++q;
}
else if (q < qend)
{
if (*q == '\r')
{
++q;
if (q < qend && *q == '\n')
++q;
linestart = 1;
}
else if (*q == '\n')
{
++q;
linestart = 1;
}
}
/* Remove trailing row of ****'s or ++++'s
*/
if (ct != '/')
{
for (; q < qend; qend--)
{
if (qend[-1] != ct)
break;
}
}
/* Comment is now [q .. qend].
* Canonicalize it into buf[].
*/
OutBuffer buf;
// Drop trailing spaces/tabs accumulated in buf (used at each newline)
void trimTrailingWhitespace()
{
const s = buf[];
auto len = s.length;
while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
--len;
buf.setsize(len);
}
for (; q < qend; q++)
{
char c = *q;
switch (c)
{
case '*':
case '+':
// Strip one leading decoration character at the start of a line
if (linestart && c == ct)
{
linestart = 0;
/* Trim preceding whitespace up to preceding \n
*/
trimTrailingWhitespace();
continue;
}
break;
case ' ':
case '\t':
break;
case '\r':
if (q[1] == '\n')
continue; // skip the \r
goto Lnewline;
default:
// 226 is 0xE2, the first byte of the UTF-8 encodings of
// U+2028 (LS) and U+2029 (PS): E2 80 A8 and E2 80 A9
if (c == 226)
{
// If LS or PS
if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
{
q += 2;
goto Lnewline;
}
}
linestart = 0;
break;
Lnewline:
c = '\n'; // replace all newlines with \n
goto case;
case '\n':
linestart = 1;
/* Trim trailing whitespace
*/
trimTrailingWhitespace();
break;
}
buf.writeByte(c);
}
/* Trim trailing whitespace (if the last line does not have newline)
*/
trimTrailingWhitespace();
// Always end with a newline
const s = buf[];
if (s.length == 0 || s[$ - 1] != '\n')
buf.writeByte('\n');
// It's a line comment if the start of the doc comment comes
// after other non-whitespace on the same line.
auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
// Combine with previous doc comment, if any
if (*dc)
*dc = combineComments(*dc, buf[], newParagraph).toDString();
else
*dc = buf.extractSlice(true);
}
/********************************************
* Combine two document comments into one,
* separated by an extra newline if newParagraph is true.
*/
/********************************************
 * Combine two document comments into one,
 * separated by an extra newline if newParagraph is true.
 * Params:
 *      c1 = first comment (may be null)
 *      c2 = second comment (may be null)
 *      newParagraph = insert an extra '\n' between the two comments
 * Returns:
 *      the surviving comment if either input is null, otherwise a freshly
 *      allocated NUL-terminated concatenation
 */
static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
{
    if (!c1)
        return c2.ptr;
    if (!c2)
        return c1.ptr;
    // One '\n' is inserted when c1 does not already end with a newline,
    // plus one more when a paragraph break was requested.
    const size_t sepNewline = (c1.length && c1[$ - 1] != '\n') ? 1 : 0;
    const size_t paraNewline = newParagraph ? 1 : 0;
    const size_t total = c1.length + sepNewline + paraNewline + c2.length;
    auto buf = cast(char*)mem.xmalloc_noscan(total + 1);
    size_t pos = 0;
    buf[pos .. pos + c1.length] = c1[];
    pos += c1.length;
    if (sepNewline)
        buf[pos++] = '\n';
    if (paraNewline)
        buf[pos++] = '\n';
    buf[pos .. pos + c2.length] = c2[];
    buf[total] = 0;
    return buf;
}
/**************************
* `p` should be at start of next line
*/
/**************************
 * Record that a new line has started.
 * Call with `p` positioned at the first character of the next line.
 */
private void endOfLine() pure @nogc @safe
{
    line = p; // remember the line's starting position for column computation
    ++scanloc.linnum;
}
}
/******************************* Private *****************************************/
private:
/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
private __gshared bool initdone = false;
// Note: Those properties need to be guarded by a call to `init`
// The API isn't safe, and quite brittle, but it was left this way
// over performance concerns.
// This is currently only called once, from the lexer.
__gshared char[11 + 1] date; // "Mmm dd yyyy"
__gshared char[8 + 1] time; // "hh:mm:ss"
__gshared char[24 + 1] timestamp; // "Www Mmm dd hh:mm:ss yyyy"
// Fill in date/time/timestamp from SOURCE_DATE_EPOCH (for reproducible
// builds) or from the current wall-clock time. Idempotent: only the
// first call does any work.
public static void initialize(const ref Loc loc) nothrow
{
if (initdone)
return;
initdone = true;
time_t ct;
// https://issues.dlang.org/show_bug.cgi?id=20444
if (auto p = getenv("SOURCE_DATE_EPOCH"))
{
if (!ct.parseDigits(p.toDString()))
error(loc, "value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
}
else
.time(&ct);
// ctime() produces the fixed layout "Www Mmm dd hh:mm:ss yyyy\n";
// the offsets below slice month+day (p+4), time (p+11), year (p+20)
const p = ctime(&ct);
assert(p);
sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
sprintf(&time[0], "%.8s", p + 11);
sprintf(&timestamp[0], "%.24s", p);
}
}
private enum LS = 0x2028; // Unicode line separator U+2028, treated as end of line
private enum PS = 0x2029; // Unicode paragraph separator U+2029, treated as end of line
/********************************************
* Do our own char maps
*/
/********************************************
 * Per-character classification table, computed at compile time.
 * Each entry is a bitmask of the CM* flags describing how the character
 * may participate in identifiers and numeric/character literals.
 */
private static immutable cmtable = ()
{
    ubyte[256] tab;
    foreach (const i; 0 .. tab.length)
    {
        if ('0' <= i && i <= '7')
            tab[i] |= CMoctal;
        if (c_isxdigit(i))
            tab[i] |= CMhex;
        if (c_isalnum(i) || i == '_')
            tab[i] |= CMidchar;
        // Characters that may follow a digit inside a numeric literal
        // (digits, exponent/suffix letters, '.', and the '_' separator);
        // all of these may also follow a leading '0'.
        const bool digitSecond =
            ('0' <= i && i <= '9') ||
            i == 'e' || i == 'E' ||
            i == 'f' || i == 'F' ||
            i == 'l' || i == 'L' ||
            i == 'p' || i == 'P' ||
            i == 'u' || i == 'U' ||
            i == 'i' || i == '.' || i == '_';
        if (digitSecond)
            tab[i] |= CMzerosecond | CMdigitsecond;
        else if (i == 'x' || i == 'X' || i == 'b' || i == 'B')
            tab[i] |= CMzerosecond; // radix prefixes only follow a leading '0'
        // Characters that may appear bare inside 'c' character literals:
        // everything except escapes, newlines, terminators, quote, and
        // bytes with the high bit set (multibyte UTF-8).
        const bool excluded =
            i == '\\' || i == '\n' || i == '\r' ||
            i == 0 || i == 0x1A || i == '\'' ||
            (i & 0x80) != 0;
        if (!excluded)
            tab[i] |= CMsinglechar;
    }
    return tab;
}();
private
{
enum CMoctal = 0x1; // character is an octal digit '0'..'7'
enum CMhex = 0x2; // character is a hexadecimal digit
enum CMidchar = 0x4; // character may appear in an identifier
enum CMzerosecond = 0x8; // character may follow an initial '0' in a literal
enum CMdigitsecond = 0x10; // character may follow a digit in a numeric literal
enum CMsinglechar = 0x20; // character may appear bare in a 'c' literal
}
/// True iff `c` is an octal digit ('0'..'7').
private bool isoctal(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMoctal) == CMoctal;
}
/// True iff `c` is a hexadecimal digit.
private bool ishex(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMhex) == CMhex;
}
/// True iff `c` may appear in an identifier (alphanumeric or '_').
private bool isidchar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMidchar) == CMidchar;
}
/// True iff `c` may follow an initial '0' in a numeric literal.
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) == CMzerosecond;
}
/// True iff `c` may follow a digit inside a numeric literal.
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) == CMdigitsecond;
}
/// True iff `c` may appear bare inside a single-quoted character literal.
private bool issinglechar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) == CMsinglechar;
}
/// Locale-independent replacement for isxdigit(): '0'..'9', 'a'..'f', 'A'..'F'.
private bool c_isxdigit(const int c) pure @nogc @safe
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'f')
        return true;
    return c >= 'A' && c <= 'F';
}
/// Locale-independent replacement for isalnum(): ASCII letters and digits only.
private bool c_isalnum(const int c) pure @nogc @safe
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'z')
        return true;
    return c >= 'A' && c <= 'Z';
}
/******************************* Unittest *****************************************/
// Verify escapeSequence() decodes every well-formed escape sequence and
// consumes exactly the whole input; the installed diagnostic handler
// asserts, so any reported error fails the test.
unittest
{
import dmd.console;
nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
{
assert(0);
}
diagnosticHandler = &assertDiagnosticHandler;
// sequence is the escape text after the backslash; expected is the decoded value
static void test(T)(string sequence, T expected, bool Ccompile = false)
{
auto p = cast(const(char)*)sequence.ptr;
assert(expected == Lexer.escapeSequence(Loc.initial, p, Ccompile));
assert(p == sequence.ptr + sequence.length); // whole sequence consumed
}
test(`'`, '\'');
test(`"`, '"');
test(`?`, '?');
test(`\`, '\\');
test(`0`, '\0');
test(`a`, '\a');
test(`b`, '\b');
test(`f`, '\f');
test(`n`, '\n');
test(`r`, '\r');
test(`t`, '\t');
test(`v`, '\v');
test(`x00`, 0x00);
test(`xff`, 0xff);
test(`xFF`, 0xff);
test(`xa7`, 0xa7);
test(`x3c`, 0x3c);
test(`xe2`, 0xe2);
test(`1`, '\1');
test(`42`, '\42');
test(`357`, '\357');
test(`u1234`, '\u1234');
test(`uf0e4`, '\uf0e4');
test(`U0001f603`, '\U0001f603');
test(`&quot;`, '"');
test(`&lt;`, '<');
test(`&gt;`, '>');
diagnosticHandler = null;
}
// Verify escapeSequence() reports the expected diagnostic text, returns the
// documented fallback value, and consumes the expected number of characters
// for malformed escape sequences.
unittest
{
import dmd.console;
string expected;
bool gotError;
nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
{
assert(cast(Classification)headerColor == Classification.error);
gotError = true;
char[100] buffer = void;
auto actual = buffer[0 .. vsprintf(buffer.ptr, format, ap)];
assert(expected == actual);
return true;
}
diagnosticHandler = &expectDiagnosticHandler;
void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength, bool Ccompile = false)
{
uint errors = global.errors;
gotError = false;
expected = expectedError;
auto p = cast(const(char)*)sequence.ptr;
auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p, Ccompile);
assert(gotError);
assert(expectedReturnValue == actualReturnValue);
auto actualScanLength = p - sequence.ptr;
assert(expectedScanLength == actualScanLength);
global.errors = errors; // restore so gagged errors don't leak out of the test
}
test("c", `undefined escape sequence \c`, 'c', 1);
test("!", `undefined escape sequence \!`, '!', 1);
test("&quot;", `undefined escape sequence \&`, '&', 1, true);
test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);
test("u1" , `escape hex sequence has 1 hex digits instead of 4`, 0x1, 2);
test("u12" , `escape hex sequence has 2 hex digits instead of 4`, 0x12, 3);
test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);
test("U0" , `escape hex sequence has 1 hex digits instead of 8`, 0x0, 2);
test("U00" , `escape hex sequence has 2 hex digits instead of 8`, 0x00, 3);
test("U000" , `escape hex sequence has 3 hex digits instead of 8`, 0x000, 4);
test("U0000" , `escape hex sequence has 4 hex digits instead of 8`, 0x0000, 5);
test("U0001f" , `escape hex sequence has 5 hex digits instead of 8`, 0x0001f, 6);
test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`, 0x0001f6, 7);
test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);
test("ud800" , `invalid UTF character \U0000d800`, '?', 5);
test("udfff" , `invalid UTF character \U0000dfff`, '?', 5);
test("U00110000", `invalid UTF character \U00110000`, '?', 9);
test("xg0" , `undefined escape hex sequence \xg`, 'g', 2);
test("ug000" , `undefined escape hex sequence \ug`, 'g', 2);
test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);
test("&BAD;", `unnamed character entity &BAD;` , '?', 5);
test("&quot", `unterminated named entity &quot;`, '?', 5);
test("&quot", `unterminated named entity &quot;`, '?', 5); // NOTE(review): duplicate of the previous line — likely unintentional
test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);
diagnosticHandler = null;
}
// Smoke test: lex a single keyword and check that nextToken() keeps
// returning TOK.endOfFile once the input is exhausted.
unittest
{
//printf("lexer.unittest\n");
/* Not much here, just trying things out.
*/
string text = "int"; // We rely on the implicit null-terminator
scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0);
TOK tok;
tok = lex1.nextToken();
//printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
assert(tok == TOK.int32);
tok = lex1.nextToken();
assert(tok == TOK.endOfFile);
tok = lex1.nextToken();
assert(tok == TOK.endOfFile);
tok = lex1.nextToken();
assert(tok == TOK.endOfFile);
}
// Robustness test: malformed input must still terminate in TOK.endOfFile
// within a bounded number of tokens, and endOfFile must be sticky.
unittest
{
// We don't want to see Lexer error output during these tests.
uint errors = global.startGagging();
scope(exit) global.endGagging(errors);
// Test malformed input: even malformed input should end in a TOK.endOfFile.
static immutable char[][] testcases =
[ // Testcase must end with 0 or 0x1A.
[0], // not malformed, but pathological
['\'', 0],
['\'', 0x1A],
['{', '{', 'q', '{', 0],
[0xFF, 0],
[0xFF, 0x80, 0],
[0xFF, 0xFF, 0],
[0xFF, 0xFF, 0],
['x', '"', 0x1A],
];
foreach (testcase; testcases)
{
// length-1 excludes the explicit 0/0x1A terminator from the buffer length
scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, 0, 0);
TOK tok = lex2.nextToken();
// Bound the iteration count so a non-terminating lexer fails instead of hanging
size_t iterations = 1;
while ((tok != TOK.endOfFile) && (iterations++ < testcase.length))
{
tok = lex2.nextToken();
}
assert(tok == TOK.endOfFile);
tok = lex2.nextToken();
assert(tok == TOK.endOfFile);
}
}