/* Compiler implementation of the D programming language
* Copyright (C) 1999-2019 by The D Language Foundation, All Rights Reserved
* written by Walter Bright
* http://www.digitalmars.com
* Distributed under the Boost Software License, Version 1.0.
* http://www.boost.org/LICENSE_1_0.txt
* https://github.com/D-Programming-Language/dmd/blob/master/src/lexer.c
*/
/* Lexical Analyzer */
#include "root/dsystem.h" // for time() and ctime()
#include "root/rmem.h"
#include "mars.h"
#include "lexer.h"
#include "utf.h"
#include "identifier.h"
#include "id.h"
extern int HtmlNamedEntity(const utf8_t *p, size_t length);
#define LS 0x2028 // UTF line separator
#define PS 0x2029 // UTF paragraph separator
/********************************************
* Do our own char maps
*/
static unsigned char cmtable[256];
const int CMoctal = 0x1;
const int CMhex = 0x2;
const int CMidchar = 0x4;
inline bool isoctal (utf8_t c) { return (cmtable[c] & CMoctal) != 0; }
inline bool ishex (utf8_t c) { return (cmtable[c] & CMhex) != 0; }
inline bool isidchar(utf8_t c) { return (cmtable[c] & CMidchar) != 0; }
struct CMTableInitializer
{
CMTableInitializer();
};
static CMTableInitializer cmtableinitializer;
CMTableInitializer::CMTableInitializer()
{
for (unsigned c = 0; c < 256; c++)
{
if ('0' <= c && c <= '7')
cmtable[c] |= CMoctal;
if (isxdigit(c))
cmtable[c] |= CMhex;
if (isalnum(c) || c == '_')
cmtable[c] |= CMidchar;
}
}
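/* Illustrative expectations for the character map above (informal sketch,
* not compiled):
*      isoctal('7')  -> true       isoctal('8')  -> false
*      ishex('F')    -> true       ishex('G')    -> false
*      isidchar('_') -> true       isidchar('$') -> false
*/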
/*************************** Lexer ********************************************/
OutBuffer Lexer::stringbuffer;
Lexer::Lexer(const char *filename,
const utf8_t *base, size_t begoffset, size_t endoffset,
bool doDocComment, bool commentToken)
{
scanloc = Loc(filename, 1, 1);
//printf("Lexer::Lexer(%p,%d)\n",base,length);
//printf("lexer.filename = %s\n", filename);
this->token = Token();
this->token.ptr = NULL;
this->token.value = TOKreserved;
this->token.blockComment = NULL;
this->token.lineComment = NULL;
this->base = base;
this->end = base + endoffset;
p = base + begoffset;
line = p;
this->doDocComment = doDocComment;
this->anyToken = 0;
this->commentToken = commentToken;
this->errors = false;
//initKeywords();
/* If first line starts with '#!', ignore the line
*/
if (p[0] == '#' && p[1] == '!')
{
p += 2;
while (1)
{
utf8_t c = *p++;
switch (c)
{
case 0:
case 0x1A:
p--;
/* fall through */
case '\n':
break;
default:
continue;
}
break;
}
endOfLine();
}
}
void Lexer::endOfLine()
{
scanloc.linnum++;
line = p;
}
void Lexer::error(const char *format, ...)
{
va_list ap;
va_start(ap, format);
::verror(token.loc, format, ap);
va_end(ap);
errors = true;
}
void Lexer::error(Loc loc, const char *format, ...)
{
va_list ap;
va_start(ap, format);
::verror(loc, format, ap);
va_end(ap);
errors = true;
}
void Lexer::deprecation(const char *format, ...)
{
va_list ap;
va_start(ap, format);
::vdeprecation(token.loc, format, ap);
va_end(ap);
if (global.params.useDeprecated == DIAGNOSTICerror)
errors = true;
}
TOK Lexer::nextToken()
{
if (token.next)
{
Token *t = token.next;
memcpy(&token,t,sizeof(Token));
t->free();
}
else
{
scan(&token);
}
//token.print();
return token.value;
}
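#if 0
/* Usage sketch, illustrative only and kept out of the build with #if 0.
* It assumes the front end's global state (Id table, global settings) has
* already been initialized elsewhere; lexerDemo and "example.d" are made-up
* names for this sketch.
*/
static void lexerDemo()
{
    const char *src = "int x = 42; // answer";
    Lexer lex("example.d", (const utf8_t *)src, 0, strlen(src),
              false /*doDocComment*/, false /*commentToken*/);
    while (lex.nextToken() != TOKeof)
    {
        // lex.token.value walks through TOKint32, TOKidentifier, TOKassign,
        // TOKint32v and TOKsemicolon; the // comment is skipped because
        // commentToken is false.
    }
}
#endif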
Token *Lexer::peek(Token *ct)
{
Token *t;
if (ct->next)
t = ct->next;
else
{
t = Token::alloc();
scan(t);
ct->next = t;
}
return t;
}
/***********************
* Look ahead at next token's value.
*/
TOK Lexer::peekNext()
{
return peek(&token)->value;
}
/***********************
* Look 2 tokens ahead at value.
*/
TOK Lexer::peekNext2()
{
Token *t = peek(&token);
return peek(t)->value;
}
/*********************************
* tk is on the opening (.
* Look ahead and return token that is past the closing ).
*/
Token *Lexer::peekPastParen(Token *tk)
{
//printf("peekPastParen()\n");
int parens = 1;
int curlynest = 0;
while (1)
{
tk = peek(tk);
//tk->print();
switch (tk->value)
{
case TOKlparen:
parens++;
continue;
case TOKrparen:
--parens;
if (parens)
continue;
tk = peek(tk);
break;
case TOKlcurly:
curlynest++;
continue;
case TOKrcurly:
if (--curlynest >= 0)
continue;
break;
case TOKsemicolon:
if (curlynest)
continue;
break;
case TOKeof:
break;
default:
continue;
}
return tk;
}
}
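/* Example: with tk on the opening '(' of
*      (a, (b + c)) * d
* peekPastParen(tk) returns the token for '*', i.e. the first token past
* the matching ')'.
*/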
/****************************
 * Scan the characters starting at p and turn them into the next token,
 * filling in *t.
 */
void Lexer::scan(Token *t)
{
unsigned lastLine = scanloc.linnum;
Loc startLoc;
t->blockComment = NULL;
t->lineComment = NULL;
while (1)
{
t->ptr = p;
//printf("p = %p, *p = '%c'\n",p,*p);
t->loc = loc();
switch (*p)
{
case 0:
case 0x1A:
t->value = TOKeof; // end of file
return;
case ' ':
case '\t':
case '\v':
case '\f':
p++;
continue; // skip white space
case '\r':
p++;
if (*p != '\n') // if CR stands by itself
endOfLine();
continue; // skip white space
case '\n':
p++;
endOfLine();
continue; // skip white space
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
t->value = number(t);
return;
case '\'':
t->value = charConstant(t);
return;
case 'r':
if (p[1] != '"')
goto case_ident;
p++;
/* fall through */
case '`':
t->value = wysiwygStringConstant(t, *p);
return;
case 'x':
if (p[1] != '"')
goto case_ident;
p++;
t->value = hexStringConstant(t);
return;
case 'q':
if (p[1] == '"')
{
p++;
t->value = delimitedStringConstant(t);
return;
}
else if (p[1] == '{')
{
p++;
t->value = tokenStringConstant(t);
return;
}
else
goto case_ident;
case '"':
t->value = escapeStringConstant(t);
return;
case 'a': case 'b': case 'c': case 'd': case 'e':
case 'f': case 'g': case 'h': case 'i': case 'j':
case 'k': case 'l': case 'm': case 'n': case 'o':
case 'p': /*case 'q': case 'r':*/ case 's': case 't':
case 'u': case 'v': case 'w': /*case 'x':*/ case 'y':
case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E':
case 'F': case 'G': case 'H': case 'I': case 'J':
case 'K': case 'L': case 'M': case 'N': case 'O':
case 'P': case 'Q': case 'R': case 'S': case 'T':
case 'U': case 'V': case 'W': case 'X': case 'Y':
case 'Z':
case '_':
case_ident:
{ utf8_t c;
while (1)
{
c = *++p;
if (isidchar(c))
continue;
else if (c & 0x80)
{ const utf8_t *s = p;
unsigned u = decodeUTF();
if (isUniAlpha(u))
continue;
error("char 0x%04x not allowed in identifier", u);
p = s;
}
break;
}
Identifier *id = Identifier::idPool((const char *)t->ptr, p - t->ptr);
t->ident = id;
t->value = (TOK) id->getValue();
anyToken = 1;
if (*t->ptr == '_') // if special identifier token
{
static bool initdone = false;
static char date[11+1];
static char time[8+1];
static char timestamp[24+1];
if (!initdone) // lazy evaluation
{
initdone = true;
time_t ct;
::time(&ct);
char *p = ctime(&ct);
assert(p);
sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
sprintf(&time[0], "%.8s", p + 11);
sprintf(&timestamp[0], "%.24s", p);
}
if (id == Id::DATE)
{
t->ustring = (utf8_t *)date;
goto Lstr;
}
else if (id == Id::TIME)
{
t->ustring = (utf8_t *)time;
goto Lstr;
}
else if (id == Id::VENDOR)
{
t->ustring = (utf8_t *)const_cast<char *>(global.vendor);
goto Lstr;
}
else if (id == Id::TIMESTAMP)
{
t->ustring = (utf8_t *)timestamp;
Lstr:
t->value = TOKstring;
t->postfix = 0;
t->len = (unsigned)strlen((char *)t->ustring);
}
else if (id == Id::VERSIONX)
{ unsigned major = 0;
unsigned minor = 0;
bool point = false;
for (const char *p = global.version + 1; 1; p++)
{
c = *p;
if (isdigit((utf8_t)c))
minor = minor * 10 + c - '0';
else if (c == '.')
{
if (point)
break; // ignore everything after second '.'
point = true;
major = minor;
minor = 0;
}
else
break;
}
t->value = TOKint64v;
t->uns64value = major * 1000 + minor;
}
else if (id == Id::EOFX)
{
t->value = TOKeof;
// Advance scanner to end of file
while (!(*p == 0 || *p == 0x1A))
p++;
}
}
//printf("t->value = %d\n",t->value);
return;
}
case '/':
p++;
switch (*p)
{
case '=':
p++;
t->value = TOKdivass;
return;
case '*':
p++;
startLoc = loc();
while (1)
{
while (1)
{ utf8_t c = *p;
switch (c)
{
case '/':
break;
case '\n':
endOfLine();
p++;
continue;
case '\r':
p++;
if (*p != '\n')
endOfLine();
continue;
case 0:
case 0x1A:
error("unterminated /* */ comment");
p = end;
t->loc = loc();
t->value = TOKeof;
return;
default:
if (c & 0x80)
{ unsigned u = decodeUTF();
if (u == PS || u == LS)
endOfLine();
}
p++;
continue;
}
break;
}
p++;
if (p[-2] == '*' && p - 3 != t->ptr)
break;
}
if (commentToken)
{
t->loc = startLoc;
t->value = TOKcomment;
return;
}
else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
{ // if /** but not /**/
getDocComment(t, lastLine == startLoc.linnum);
}
continue;
case '/': // do // style comments
startLoc = loc();
while (1)
{ utf8_t c = *++p;
switch (c)
{
case '\n':
break;
case '\r':
if (p[1] == '\n')
p++;
break;
case 0:
case 0x1A:
if (commentToken)
{
p = end;
t->loc = startLoc;
t->value = TOKcomment;
return;
}
if (doDocComment && t->ptr[2] == '/')
getDocComment(t, lastLine == startLoc.linnum);
p = end;
t->loc = loc();
t->value = TOKeof;
return;
default:
if (c & 0x80)
{ unsigned u = decodeUTF();
if (u == PS || u == LS)
break;
}
continue;
}
break;
}
if (commentToken)
{
p++;
endOfLine();
t->loc = startLoc;
t->value = TOKcomment;
return;
}
if (doDocComment && t->ptr[2] == '/')
getDocComment(t, lastLine == startLoc.linnum);
p++;
endOfLine();
continue;
case '+':
{ int nest;
startLoc = loc();
p++;
nest = 1;
while (1)
{ utf8_t c = *p;
switch (c)
{
case '/':
p++;
if (*p == '+')
{
p++;
nest++;
}
continue;
case '+':
p++;
if (*p == '/')
{
p++;
if (--nest == 0)
break;
}
continue;
case '\r':
p++;
if (*p != '\n')
endOfLine();
continue;
case '\n':
endOfLine();
p++;
continue;
case 0:
case 0x1A:
error("unterminated /+ +/ comment");
p = end;
t->loc = loc();
t->value = TOKeof;
return;
default:
if (c & 0x80)
{ unsigned u = decodeUTF();
if (u == PS || u == LS)
endOfLine();
}
p++;
continue;
}
break;
}
if (commentToken)
{
t->loc = startLoc;
t->value = TOKcomment;
return;
}
if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
{ // if /++ but not /++/
getDocComment(t, lastLine == startLoc.linnum);
}
continue;
}
default:
break;
}
t->value = TOKdiv;
return;
case '.':
p++;
if (isdigit(*p))
{ /* Note that we don't treat ._1 or ._ as
* valid floating point numbers.
*/
p--;
t->value = inreal(t);
}
else if (p[0] == '.')
{
if (p[1] == '.')
{ p += 2;
t->value = TOKdotdotdot;
}
else
{ p++;
t->value = TOKslice;
}
}
else
t->value = TOKdot;
return;
case '&':
p++;
if (*p == '=')
{ p++;
t->value = TOKandass;
}
else if (*p == '&')
{ p++;
t->value = TOKandand;
}
else
t->value = TOKand;
return;
case '|':
p++;
if (*p == '=')
{ p++;
t->value = TOKorass;
}
else if (*p == '|')
{ p++;
t->value = TOKoror;
}
else
t->value = TOKor;
return;
case '-':
p++;
if (*p == '=')
{ p++;
t->value = TOKminass;
}
else if (*p == '-')
{ p++;
t->value = TOKminusminus;
}
else
t->value = TOKmin;
return;
case '+':
p++;
if (*p == '=')
{ p++;
t->value = TOKaddass;
}
else if (*p == '+')
{ p++;
t->value = TOKplusplus;
}
else
t->value = TOKadd;
return;
case '<':
p++;
if (*p == '=')
{ p++;
t->value = TOKle; // <=
}
else if (*p == '<')
{ p++;
if (*p == '=')
{ p++;
t->value = TOKshlass; // <<=
}
else
t->value = TOKshl; // <<
}
else if (*p == '>')
{ p++;
if (*p == '=')
{ p++;
t->value = TOKleg; // <>=
}
else
t->value = TOKlg; // <>
}
else
t->value = TOKlt; // <
return;
case '>':
p++;
if (*p == '=')
{ p++;
t->value = TOKge; // >=
}
else if (*p == '>')
{ p++;
if (*p == '=')
{ p++;
t->value = TOKshrass; // >>=
}
else if (*p == '>')
{ p++;
if (*p == '=')
{ p++;
t->value = TOKushrass; // >>>=
}
else
t->value = TOKushr; // >>>
}
else
t->value = TOKshr; // >>
}
else
t->value = TOKgt; // >
return;
case '!':
p++;
if (*p == '=')
{ p++;
t->value = TOKnotequal; // !=
}
else if (*p == '<')
{ p++;
if (*p == '>')
{ p++;
if (*p == '=')
{ p++;
t->value = TOKunord; // !<>=
}
else
t->value = TOKue; // !<>
}
else if (*p == '=')
{ p++;
t->value = TOKug; // !<=
}
else
t->value = TOKuge; // !<
}
else if (*p == '>')
{ p++;
if (*p == '=')
{ p++;
t->value = TOKul; // !>=
}
else
t->value = TOKule; // !>
}
else
t->value = TOKnot; // !
return;
case '=':
p++;
if (*p == '=')
{ p++;
t->value = TOKequal; // ==
}
else if (*p == '>')
{ p++;
t->value = TOKgoesto; // =>
}
else
t->value = TOKassign; // =
return;
case '~':
p++;
if (*p == '=')
{ p++;
t->value = TOKcatass; // ~=
}
else
t->value = TOKtilde; // ~
return;
case '^':
p++;
if (*p == '^')
{ p++;
if (*p == '=')
{ p++;
t->value = TOKpowass; // ^^=
}
else
t->value = TOKpow; // ^^
}
else if (*p == '=')
{ p++;
t->value = TOKxorass; // ^=
}
else
t->value = TOKxor; // ^
return;
case '(': p++; t->value = TOKlparen; return;
case ')': p++; t->value = TOKrparen; return;
case '[': p++; t->value = TOKlbracket; return;
case ']': p++; t->value = TOKrbracket; return;
case '{': p++; t->value = TOKlcurly; return;
case '}': p++; t->value = TOKrcurly; return;
case '?': p++; t->value = TOKquestion; return;
case ',': p++; t->value = TOKcomma; return;
case ';': p++; t->value = TOKsemicolon; return;
case ':': p++; t->value = TOKcolon; return;
case '$': p++; t->value = TOKdollar; return;
case '@': p++; t->value = TOKat; return;
case '*':
p++;
if (*p == '=')
{ p++;
t->value = TOKmulass;
}
else
t->value = TOKmul;
return;
case '%':
p++;
if (*p == '=')
{ p++;
t->value = TOKmodass;
}
else
t->value = TOKmod;
return;
case '#':
{
p++;
Token n;
scan(&n);
if (n.value == TOKidentifier)
{
if (n.ident == Id::line)
{
poundLine();
continue;
}
else
{
const Loc locx = loc();
warning(locx, "C preprocessor directive `#%s` is not supported", n.ident->toChars());
}
}
else if (n.value == TOKif)
{
error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
}
t->value = TOKpound;
return;
}
default:
{ unsigned c = *p;
if (c & 0x80)
{ c = decodeUTF();
// Check for start of unicode identifier
if (isUniAlpha(c))
goto case_ident;
if (c == PS || c == LS)
{
endOfLine();
p++;
continue;
}
}
if (c < 0x80 && isprint(c))
error("character '%c' is not a valid token", c);
else
error("character 0x%02x is not a valid token", c);
p++;
continue;
}
}
}
}
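/* Example: scanning the input
*      x <<= 2;
* produces the token sequence TOKidentifier ("x"), TOKshlass, TOKint32v (2),
* TOKsemicolon, TOKeof.
*/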
/*******************************************
* Parse escape sequence.
*/
unsigned Lexer::escapeSequence()
{ unsigned c = *p;
int n;
int ndigits;
switch (c)
{
case '\'':
case '"':
case '?':
case '\\':
Lconsume:
p++;
break;
case 'a': c = 7; goto Lconsume;
case 'b': c = 8; goto Lconsume;
case 'f': c = 12; goto Lconsume;
case 'n': c = 10; goto Lconsume;
case 'r': c = 13; goto Lconsume;
case 't': c = 9; goto Lconsume;
case 'v': c = 11; goto Lconsume;
case 'u':
ndigits = 4;
goto Lhex;
case 'U':
ndigits = 8;
goto Lhex;
case 'x':
ndigits = 2;
Lhex:
p++;
c = *p;
if (ishex((utf8_t)c))
{ unsigned v;
n = 0;
v = 0;
while (1)
{
if (isdigit((utf8_t)c))
c -= '0';
else if (islower(c))
c -= 'a' - 10;
else
c -= 'A' - 10;
v = v * 16 + c;
c = *++p;
if (++n == ndigits)
break;
if (!ishex((utf8_t)c))
{ error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
break;
}
}
if (ndigits != 2 && !utf_isValidDchar(v))
{ error("invalid UTF character \\U%08x", v);
v = '?'; // recover with valid UTF character
}
c = v;
}
else
error("undefined escape hex sequence \\%c",c);
break;
case '&': // named character entity
for (const utf8_t *idstart = ++p; 1; p++)
{
switch (*p)
{
case ';':
c = HtmlNamedEntity(idstart, p - idstart);
if (c == ~0U)
{ error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
c = ' ';
}
p++;
break;
default:
if (isalpha(*p) ||
(p != idstart && isdigit(*p)))
continue;
error("unterminated named entity &%.*s;", (int)(p - idstart + 1), idstart);
break;
}
break;
}
break;
case 0:
case 0x1A: // end of file
c = '\\';
break;
default:
if (isoctal((utf8_t)c))
{ unsigned v;
n = 0;
v = 0;
do
{
v = v * 8 + (c - '0');
c = *++p;
} while (++n < 3 && isoctal((utf8_t)c));
c = v;
if (c > 0xFF)
error("escape octal sequence \\%03o is larger than \\377", c);
}
else
error("undefined escape sequence \\%c",c);
break;
}
return c;
}
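/* Examples (on entry p is positioned on the character after the backslash):
*      \n     -> 0x0A              \t     -> 0x09
*      \x41   -> 0x41 ('A')        \101   -> 65 ('A', octal)
*      \u20AC -> 0x20AC            \&amp; -> '&' via HtmlNamedEntity
*/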
/**************************************
* Lex wysiwyg strings, r"..." or `...`, where backslash is not an escape.
*/
TOK Lexer::wysiwygStringConstant(Token *t, int tc)
{
int c;
Loc start = loc();
p++;
stringbuffer.reset();
while (1)
{
c = *p++;
switch (c)
{
case '\n':
endOfLine();
break;
case '\r':
if (*p == '\n')
continue; // ignore
c = '\n'; // treat EndOfLine as \n character
endOfLine();
break;
case 0:
case 0x1A:
error("unterminated string constant starting at %s", start.toChars());
t->ustring = (utf8_t *)const_cast<char *>("");
t->len = 0;
t->postfix = 0;
return TOKstring;
case '"':
case '`':
if (c == tc)
{
t->len = (unsigned)stringbuffer.offset;
stringbuffer.writeByte(0);
t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
stringPostfix(t);
return TOKstring;
}
break;
default:
if (c & 0x80)
{ p--;
unsigned u = decodeUTF();
p++;
if (u == PS || u == LS)
endOfLine();
stringbuffer.writeUTF8(u);
continue;
}
break;
}
stringbuffer.writeByte(c);
}
}
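/* Examples:
*      r"c:\dir\file"   -> c:\dir\file    (backslash is not an escape)
*      `say "hi"`       -> say "hi"
* An optional postfix of c, w or d may follow the closing quote
* (see stringPostfix).
*/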
/**************************************
* Lex hex strings:
* x"0A ae 34FE BD"
*/
TOK Lexer::hexStringConstant(Token *t)
{
unsigned c;
Loc start = loc();
unsigned n = 0;
unsigned v = ~0; // dead assignment, needed to suppress warning
p++;
stringbuffer.reset();
while (1)
{
c = *p++;
switch (c)
{
case ' ':
case '\t':
case '\v':
case '\f':
continue; // skip white space
case '\r':
if (*p == '\n')
continue; // ignore
// Treat isolated '\r' as if it were a '\n'
/* fall through */
case '\n':
endOfLine();
continue;
case 0:
case 0x1A:
error("unterminated string constant starting at %s", start.toChars());
t->ustring = (utf8_t *)const_cast<char *>("");
t->len = 0;
t->postfix = 0;
return TOKxstring;
case '"':
if (n & 1)
{ error("odd number (%d) of hex characters in hex string", n);
stringbuffer.writeByte(v);
}
t->len = (unsigned)stringbuffer.offset;
stringbuffer.writeByte(0);
t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
stringPostfix(t);
return TOKxstring;
default:
if (c >= '0' && c <= '9')
c -= '0';
else if (c >= 'a' && c <= 'f')
c -= 'a' - 10;
else if (c >= 'A' && c <= 'F')
c -= 'A' - 10;
else if (c & 0x80)
{ p--;
unsigned u = decodeUTF();
p++;
if (u == PS || u == LS)
endOfLine();
else
error("non-hex character \\u%04x in hex string", u);
}
else
error("non-hex character '%c' in hex string", c);
if (n & 1)
{ v = (v << 4) | c;
stringbuffer.writeByte(v);
}
else
v = c;
n++;
break;
}
}
}
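/* Example:
*      x"0A ae 34"  -> the three bytes 0x0A, 0xAE, 0x34 (t->len == 3),
*                      returned as TOKxstring; whitespace is skipped and
*                      an odd number of digits is diagnosed.
*/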
/**************************************
* Lex delimited strings:
* q"(foo(xxx))" // "foo(xxx)"
* q"[foo(]" // "foo("
* q"/foo]/" // "foo]"
* q"HERE
* foo
* HERE" // "foo\n"
* Input:
* p is on the "
*/
TOK Lexer::delimitedStringConstant(Token *t)
{
unsigned c;
Loc start = loc();
unsigned delimleft = 0;
unsigned delimright = 0;
unsigned nest = 1;
unsigned nestcount = ~0; // dead assignment, needed to suppress warning
Identifier *hereid = NULL;
unsigned blankrol = 0;
unsigned startline = 0;
p++;
stringbuffer.reset();
while (1)
{
c = *p++;
//printf("c = '%c'\n", c);
switch (c)
{
case '\n':
Lnextline:
endOfLine();
startline = 1;
if (blankrol)
{ blankrol = 0;
continue;
}
if (hereid)
{
stringbuffer.writeUTF8(c);
continue;
}
break;
case '\r':
if (*p == '\n')
continue; // ignore
c = '\n'; // treat EndOfLine as \n character
goto Lnextline;
case 0:
case 0x1A:
error("unterminated delimited string constant starting at %s", start.toChars());
t->ustring = (utf8_t *)const_cast<char *>("");
t->len = 0;
t->postfix = 0;
return TOKstring;
default:
if (c & 0x80)
{ p--;
c = decodeUTF();
p++;
if (c == PS || c == LS)
goto Lnextline;
}
break;
}
if (delimleft == 0)
{ delimleft = c;
nest = 1;
nestcount = 1;
if (c == '(')
delimright = ')';
else if (c == '{')
delimright = '}';
else if (c == '[')
delimright = ']';
else if (c == '<')
delimright = '>';
else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
{ // Start of identifier; must be a heredoc
Token tok;
p--;
scan(&tok); // read in heredoc identifier
if (tok.value != TOKidentifier)
{ error("identifier expected for heredoc, not %s", tok.toChars());
delimright = c;
}
else
{ hereid = tok.ident;
//printf("hereid = '%s'\n", hereid->toChars());
blankrol = 1;
}
nest = 0;
}
else
{ delimright = c;
nest = 0;
if (isspace(c))
error("delimiter cannot be whitespace");
}
}
else
{
if (blankrol)
{ error("heredoc rest of line should be blank");
blankrol = 0;
continue;
}
if (nest == 1)
{
if (c == delimleft)
nestcount++;
else if (c == delimright)
{ nestcount--;
if (nestcount == 0)
goto Ldone;
}
}
else if (c == delimright)
goto Ldone;
if (startline && isalpha(c) && hereid)
{ Token tok;
const utf8_t *psave = p;
p--;
scan(&tok); // read in possible heredoc identifier
//printf("endid = '%s'\n", tok.ident->toChars());
if (tok.value == TOKidentifier && tok.ident->equals(hereid))
{ /* should check that rest of line is blank
*/
goto Ldone;
}
p = psave;
}
stringbuffer.writeUTF8(c);
startline = 0;
}
}
Ldone:
if (*p == '"')
p++;
else if (hereid)
error("delimited string must end in %s\"", hereid->toChars());
else
error("delimited string must end in %c\"", delimright);
t->len = (unsigned)stringbuffer.offset;
stringbuffer.writeByte(0);
t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
stringPostfix(t);
return TOKstring;
}
/**************************************
* Lex token strings:
* q{ foo(xxx) } // " foo(xxx) "
* q{foo(} // "foo("
* q{{foo}"}"} // "{foo}"}""
* Input:
* p is on the q
*/
TOK Lexer::tokenStringConstant(Token *t)
{
unsigned nest = 1;
Loc start = loc();
const utf8_t *pstart = ++p;
while (1)
{ Token tok;
scan(&tok);
switch (tok.value)
{
case TOKlcurly:
nest++;
continue;
case TOKrcurly:
if (--nest == 0)
{
t->len = (unsigned)(p - 1 - pstart);
t->ustring = (utf8_t *)mem.xmalloc(t->len + 1);
memcpy(t->ustring, pstart, t->len);
t->ustring[t->len] = 0;
stringPostfix(t);
return TOKstring;
}
continue;
case TOKeof:
error("unterminated token string constant starting at %s", start.toChars());
t->ustring = (utf8_t *)const_cast<char *>("");
t->len = 0;
t->postfix = 0;
return TOKstring;
default:
continue;
}
}
}
/**************************************
* Lex double-quoted strings, "...", processing \ escape sequences.
*/
TOK Lexer::escapeStringConstant(Token *t)
{
unsigned c;
Loc start = loc();
p++;
stringbuffer.reset();
while (1)
{
c = *p++;
switch (c)
{
case '\\':
switch (*p)
{
case 'u':
case 'U':
case '&':
c = escapeSequence();
stringbuffer.writeUTF8(c);
continue;
default:
c = escapeSequence();
break;
}
break;
case '\n':
endOfLine();
break;
case '\r':
if (*p == '\n')
continue; // ignore
c = '\n'; // treat EndOfLine as \n character
endOfLine();
break;
case '"':
t->len = (unsigned)stringbuffer.offset;
stringbuffer.writeByte(0);
t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
stringPostfix(t);
return TOKstring;
case 0:
case 0x1A:
p--;
error("unterminated string constant starting at %s", start.toChars());
t->ustring = (utf8_t *)const_cast<char *>("");
t->len = 0;
t->postfix = 0;
return TOKstring;
default:
if (c & 0x80)
{
p--;
c = decodeUTF();
if (c == LS || c == PS)
{ c = '\n';
endOfLine();
}
p++;
stringbuffer.writeUTF8(c);
continue;
}
break;
}
stringbuffer.writeByte(c);
}
}
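/* Examples:
*      "a\nb"    -> the three bytes 'a', 0x0A, 'b'
*      "\u20AC"  -> the euro sign, stored as its three-byte UTF-8 encoding
*      "hi"c     -> postfix 'c' recorded by stringPostfix()
*/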
/**************************************
* Lex a character literal such as 'a' or '\n'.
* Returns TOKcharv, TOKwcharv or TOKdcharv depending on the character's width.
*/
TOK Lexer::charConstant(Token *t)
{
unsigned c;
TOK tk = TOKcharv;
//printf("Lexer::charConstant\n");
p++;
c = *p++;
switch (c)
{
case '\\':
switch (*p)
{
case 'u':
t->uns64value = escapeSequence();
tk = TOKwcharv;
break;
case 'U':
case '&':
t->uns64value = escapeSequence();
tk = TOKdcharv;
break;
default:
t->uns64value = escapeSequence();
break;
}
break;
case '\n':
L1:
endOfLine();
/* fall through */
case '\r':
case 0:
case 0x1A:
case '\'':
error("unterminated character constant");
t->uns64value = '?';
return tk;
default:
if (c & 0x80)
{
p--;
c = decodeUTF();
p++;
if (c == LS || c == PS)
goto L1;
if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
tk = TOKwcharv;
else
tk = TOKdcharv;
}
t->uns64value = c;
break;
}
if (*p != '\'')
{
error("unterminated character constant");
t->uns64value = '?';
return tk;
}
p++;
return tk;
}
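/* Examples:
*      'a'          -> TOKcharv,  uns64value == 97
*      '\u03B1'     -> TOKwcharv, uns64value == 0x03B1
*      '\U0001F600' -> TOKdcharv, uns64value == 0x1F600
* A non-ASCII character written directly in the source is widened the same
* way, based on its code point.
*/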
/***************************************
* Get postfix of string literal.
*/
void Lexer::stringPostfix(Token *t)
{
switch (*p)
{
case 'c':
case 'w':
case 'd':
t->postfix = *p;
p++;
break;
default:
t->postfix = 0;
break;
}
}
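/* Example: for the literal "abc"w the lexer records t->postfix == 'w';
* with no suffix, t->postfix is 0.
*/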
/**************************************
 * Read in a number.
 * If it's an integer, store it in t->uns64value.
 * Integers can be binary, decimal or hexadecimal; old-style octal
 * literals are diagnosed. Handle the suffixes U, UL, LU, L, etc.
 * Floating point literals are rescanned by inreal().
 * Returns:
 *      TOKint32v, TOKuns32v, TOKint64v or TOKuns64v for integers,
 *      or inreal()'s result for floating point literals.
 */
TOK Lexer::number(Token *t)
{
int base = 10;
const utf8_t *start = p;
unsigned c;
uinteger_t n = 0; // unsigned >=64 bit integer type
int d;
bool err = false;
bool overflow = false;
c = *p;
if (c == '0')
{
++p;
c = *p;
switch (c)
{
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
n = c - '0';
++p;
base = 8;
break;
case 'x':
case 'X':
++p;
base = 16;
break;
case 'b':
case 'B':
++p;
base = 2;
break;
case '.':
if (p[1] == '.')
goto Ldone; // if ".."
if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
goto Ldone; // if ".identifier" or ".unicode"
goto Lreal; // '.' is part of current token
case 'i':
case 'f':
case 'F':
goto Lreal;
case '_':
++p;
base = 8;
break;
case 'L':
if (p[1] == 'i')
goto Lreal;
break;
default:
break;
}
}
while (1)
{
c = *p;
switch (c)
{
case '0': case '1':
++p;
d = c - '0';
break;
case '2': case '3':
case '4': case '5': case '6': case '7':
if (base == 2 && !err)
{
error("binary digit expected");
err = true;
}
++p;
d = c - '0';
break;
case '8': case '9':
++p;
if (base < 10 && !err)
{
error("radix %d digit expected, not '%c'", base, c);
err = true;
}
d = c - '0';
break;
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
++p;
if (base != 16)
{
if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
goto Lreal;
if (!err)
{
error("radix %d digit expected, not '%c'", base, c);
err = true;
}
}
if (c >= 'a')
d = c + 10 - 'a';
else
d = c + 10 - 'A';
break;
case 'L':
if (p[1] == 'i')
goto Lreal;
goto Ldone;
case '.':
if (p[1] == '.')
goto Ldone; // if ".."
if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
goto Ldone; // if ".identifier" or ".unicode"
goto Lreal; // otherwise as part of a floating point literal
case 'p':
case 'P':
case 'i':
Lreal:
p = start;
return inreal(t);
case '_':
++p;
continue;
default:
goto Ldone;
}
uinteger_t n2 = n * base;
if ((n2 / base != n || n2 + d < n))
{
overflow = true;
}
n = n2 + d;
// if n needs more than 64 bits
if (sizeof(n) > 8 &&
n > 0xFFFFFFFFFFFFFFFFULL)
{
overflow = true;
}
}
Ldone:
if (overflow && !err)
{
error("integer overflow");
err = true;
}
enum FLAGS
{
FLAGS_none = 0,
FLAGS_decimal = 1, // decimal
FLAGS_unsigned = 2, // u or U suffix
FLAGS_long = 4, // L suffix
};
unsigned flags = (base == 10) ? FLAGS_decimal : FLAGS_none;
// Parse trailing 'u', 'U', 'l' or 'L' in any combination
const utf8_t *psuffix = p;
while (1)
{
utf8_t f;
switch (*p)
{
case 'U':
case 'u':
f = FLAGS_unsigned;
goto L1;
case 'l':
f = FLAGS_long;
error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
goto L1;
case 'L':
f = FLAGS_long;
L1:
p++;
if ((flags & f) && !err)
{
error("unrecognized token");
err = true;
}
flags = (FLAGS) (flags | f);
continue;
default:
break;
}
break;
}
if (base == 8 && n >= 8)
error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead",
n, (int)(p - psuffix), psuffix, n, (int)(p - psuffix), psuffix);
TOK result;
switch (flags)
{
case FLAGS_none:
/* Octal or Hexadecimal constant.
* First that fits: int, uint, long, ulong
*/
if (n & 0x8000000000000000LL)
result = TOKuns64v;
else if (n & 0xFFFFFFFF00000000LL)
result = TOKint64v;
else if (n & 0x80000000)
result = TOKuns32v;
else
result = TOKint32v;
break;
case FLAGS_decimal:
/* First that fits: int, long, long long
*/
if (n & 0x8000000000000000LL)
{
if (!err)
{
error("signed integer overflow");
err = true;
}
result = TOKuns64v;
}
else if (n & 0xFFFFFFFF80000000LL)
result = TOKint64v;
else
result = TOKint32v;
break;
case FLAGS_unsigned:
case FLAGS_decimal | FLAGS_unsigned:
/* First that fits: uint, ulong
*/
if (n & 0xFFFFFFFF00000000LL)
result = TOKuns64v;
else
result = TOKuns32v;
break;
case FLAGS_decimal | FLAGS_long:
if (n & 0x8000000000000000LL)
{
if (!err)
{
error("signed integer overflow");
err = true;
}
result = TOKuns64v;
}
else
result = TOKint64v;
break;
case FLAGS_long:
if (n & 0x8000000000000000LL)
result = TOKuns64v;
else
result = TOKint64v;
break;
case FLAGS_unsigned | FLAGS_long:
case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
result = TOKuns64v;
break;
default:
assert(0);
}
t->uns64value = n;
return result;
}
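/* Examples (informal):
*      123         -> TOKint32v, uns64value == 123
*      0x1A_2B     -> TOKint32v, uns64value == 0x1A2B  ('_' separators skipped)
*      0b1010      -> TOKint32v, uns64value == 10
*      123UL       -> TOKuns64v
*      3000000000  -> TOKint64v  (decimal value too big for int)
*      1.5, 0x1p3  -> rescanned by inreal()
*/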
/**************************************
* Read in characters, converting them to real.
* Bugs:
* Exponent overflow not detected.
* Too much requested precision is not detected.
*/
TOK Lexer::inreal(Token *t)
{
//printf("Lexer::inreal()\n");
bool isWellformedString = true;
stringbuffer.reset();
const utf8_t *pstart = p;
char hex = 0;
unsigned c = *p++;
// Leading '0x'
if (c == '0')
{
c = *p++;
if (c == 'x' || c == 'X')
{
hex = true;
c = *p++;
}
}
// Digits to left of '.'
while (1)
{
if (c == '.')
{
c = *p++;
break;
}
if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
{
c = *p++;
continue;
}
break;
}
// Digits to right of '.'
while (1)
{
if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
{
c = *p++;
continue;
}
break;
}
if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
{
c = *p++;
if (c == '-' || c == '+')
{
c = *p++;
}
bool anyexp = false;
while (1)
{
if (isdigit(c))
{
anyexp = true;
c = *p++;
continue;
}
if (c == '_')
{
c = *p++;
continue;
}
if (!anyexp)
{
error("missing exponent");
isWellformedString = false;
}
break;
}
}
else if (hex)
{
error("exponent required for hex float");
isWellformedString = false;
}
--p;
while (pstart < p)
{
if (*pstart != '_')
stringbuffer.writeByte(*pstart);
++pstart;
}
stringbuffer.writeByte(0);
const char *sbufptr = (char *)stringbuffer.data;
TOK result;
bool isOutOfRange = false;
t->floatvalue = (isWellformedString ? CTFloat::parse(sbufptr, &isOutOfRange) : CTFloat::zero);
errno = 0;
switch (*p)
{
case 'F':
case 'f':
if (isWellformedString && !isOutOfRange)
isOutOfRange = Port::isFloat32LiteralOutOfRange(sbufptr);
result = TOKfloat32v;
p++;
break;
default:
if (isWellformedString && !isOutOfRange)
isOutOfRange = Port::isFloat64LiteralOutOfRange(sbufptr);
result = TOKfloat64v;
break;
case 'l':
error("use 'L' suffix instead of 'l'");
/* fall through */
case 'L':
result = TOKfloat80v;
p++;
break;
}
if (*p == 'i' || *p == 'I')
{
if (*p == 'I')
error("use 'i' suffix instead of 'I'");
p++;
switch (result)
{
case TOKfloat32v:
result = TOKimaginary32v;
break;
case TOKfloat64v:
result = TOKimaginary64v;
break;
case TOKfloat80v:
result = TOKimaginary80v;
break;
default: break;
}
}
const bool isLong = (result == TOKfloat80v || result == TOKimaginary80v);
if (isOutOfRange && !isLong)
{
const char *suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : "";
error(scanloc, "number '%s%s' is not representable", (char *)stringbuffer.data, suffix);
}
return result;
}
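/* Examples (informal):
*      1.5e3f   -> TOKfloat32v      (1500.0f)
*      2.0      -> TOKfloat64v
*      0x1.8p1  -> TOKfloat64v      (hex float, value 3.0)
*      1.0Li    -> TOKimaginary80v
*/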
/*********************************************
* parse:
* #line linnum [filespec]
* also allow __LINE__ for linnum, and __FILE__ for filespec
*/
void Lexer::poundLine()
{
Token tok;
int linnum = this->scanloc.linnum;
char *filespec = NULL;
Loc loc = this->loc();
scan(&tok);
if (tok.value == TOKint32v || tok.value == TOKint64v)
{
int lin = (int)(tok.uns64value - 1);
if ((unsigned)lin != tok.uns64value - 1)
error("line number %lld out of range", (unsigned long long)tok.uns64value);
else
linnum = lin;
}
else if (tok.value == TOKline)
{
}
else
goto Lerr;
while (1)
{
switch (*p)
{
case 0:
case 0x1A:
case '\n':
Lnewline:
this->scanloc.linnum = linnum;
if (filespec)
this->scanloc.filename = filespec;
return;
case '\r':
p++;
if (*p != '\n')
{ p--;
goto Lnewline;
}
continue;
case ' ':
case '\t':
case '\v':
case '\f':
p++;
continue; // skip white space
case '_':
if (memcmp(p, "__FILE__", 8) == 0)
{
p += 8;
filespec = mem.xstrdup(scanloc.filename);
continue;
}
goto Lerr;
case '"':
if (filespec)
goto Lerr;
stringbuffer.reset();
p++;
while (1)
{ unsigned c;
c = *p;
switch (c)
{
case '\n':
case '\r':
case 0:
case 0x1A:
goto Lerr;
case '"':
stringbuffer.writeByte(0);
filespec = mem.xstrdup((char *)stringbuffer.data);
p++;
break;
default:
if (c & 0x80)
{ unsigned u = decodeUTF();
if (u == PS || u == LS)
goto Lerr;
}
stringbuffer.writeByte(c);
p++;
continue;
}
break;
}
continue;
default:
if (*p & 0x80)
{ unsigned u = decodeUTF();
if (u == PS || u == LS)
goto Lnewline;
}
goto Lerr;
}
}
Lerr:
error(loc, "#line integer [\"filespec\"]\\n expected");
}
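/* Example: the directive
*      #line 42 "inc/foo.d"
* makes subsequent tokens report their location as file "inc/foo.d",
* line 42; __LINE__ and __FILE__ may stand in for the number and the
* file string.
*/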
/********************************************
* Decode UTF character.
* Issue error messages for invalid sequences.
* Return decoded character, advance p to last character in UTF sequence.
*/
unsigned Lexer::decodeUTF()
{
dchar_t u;
utf8_t c;
const utf8_t *s = p;
size_t len;
size_t idx;
const char *msg;
c = *s;
assert(c & 0x80);
// Check length of remaining string, up to 6 UTF-8 bytes
for (len = 1; len < 6 && s[len]; len++)
;
idx = 0;
msg = utf_decodeChar(s, len, &idx, &u);
p += idx - 1;
if (msg)
{
error("%s", msg);
}
return u;
}
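/* Example: for the euro sign encoded as the bytes E2 82 AC, decodeUTF()
* returns 0x20AC and leaves p on the last byte (AC), so the caller's
* subsequent p++ steps past the whole sequence.
*/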
/***************************************************
* Parse doc comment embedded between t->ptr and p.
* Remove trailing blanks and tabs from lines.
* Replace all newlines with \n.
* Remove leading comment character from each line.
* Decide if it's a lineComment or a blockComment.
* Append to previous one for this token.
*/
void Lexer::getDocComment(Token *t, unsigned lineComment)
{
/* ct tells us which kind of comment it is: '/', '*', or '+'
*/
utf8_t ct = t->ptr[2];
/* Start of comment text skips over / * *, / + +, or / / /
*/
const utf8_t *q = t->ptr + 3; // start of comment text
const utf8_t *qend = p;
if (ct == '*' || ct == '+')
qend -= 2;
/* Scan over initial row of ****'s or ++++'s or ////'s
*/
for (; q < qend; q++)
{
if (*q != ct)
break;
}
/* Remove leading spaces until start of the comment
*/
int linestart = 0;
if (ct == '/')
{
while (q < qend && (*q == ' ' || *q == '\t'))
++q;
}
else if (q < qend)
{
if (*q == '\r')
{
++q;
if (q < qend && *q == '\n')
++q;
linestart = 1;
}
else if (*q == '\n')
{
++q;
linestart = 1;
}
}
/* Remove trailing row of ****'s or ++++'s
*/
if (ct != '/')
{
for (; q < qend; qend--)
{
if (qend[-1] != ct)
break;
}
}
/* Comment is now [q .. qend].
* Canonicalize it into buf[].
*/
OutBuffer buf;
for (; q < qend; q++)
{
utf8_t c = *q;
switch (c)
{
case '*':
case '+':
if (linestart && c == ct)
{ linestart = 0;
/* Trim preceding whitespace up to preceding \n
*/
while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
buf.offset--;
continue;
}
break;
case ' ':
case '\t':
break;
case '\r':
if (q[1] == '\n')
continue; // skip the \r
goto Lnewline;
default:
if (c == 226)
{
// If LS or PS
if (q[1] == 128 &&
(q[2] == 168 || q[2] == 169))
{
q += 2;
goto Lnewline;
}
}
linestart = 0;
break;
Lnewline:
c = '\n'; // replace all newlines with \n
/* fall through */
case '\n':
linestart = 1;
/* Trim trailing whitespace
*/
while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
buf.offset--;
break;
}
buf.writeByte(c);
}
/* Trim trailing whitespace (if the last line does not have newline)
*/
if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
{
while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
buf.offset--;
}
// Always end with a newline
if (!buf.offset || buf.data[buf.offset - 1] != '\n')
buf.writeByte('\n');
buf.writeByte(0);
// It's a line comment if the start of the doc comment comes
// after other non-whitespace on the same line.
const utf8_t** dc = (lineComment && anyToken)
? &t->lineComment
: &t->blockComment;
// Combine with previous doc comment, if any
if (*dc)
*dc = combineComments(*dc, (utf8_t *)buf.data);
else
*dc = (utf8_t *)buf.extractData();
}
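// Example: a doc comment on its own line, such as
//      /** Sums a and b. */
// is canonicalized to " Sums a and b.\n" and stored in t->blockComment,
// while a trailing comment like
//      int x;  /// the count
// lands in t->lineComment because other tokens precede it on the line.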
/********************************************
* Combine two document comments into one,
* separated by a newline.
*/
const utf8_t *Lexer::combineComments(const utf8_t *c1, const utf8_t *c2)
{
//printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
const utf8_t *c = c2;
if (c1)
{
c = c1;
if (c2)
{
size_t len1 = strlen((const char *)c1);
size_t len2 = strlen((const char *)c2);
int insertNewLine = 0;
if (len1 && c1[len1 - 1] != '\n')
{
++len1;
insertNewLine = 1;
}
utf8_t *p = (utf8_t *)mem.xmalloc(len1 + 1 + len2 + 1);
memcpy(p, c1, len1 - insertNewLine);
if (insertNewLine)
p[len1 - 1] = '\n';
p[len1] = '\n';
memcpy(p + len1 + 1, c2, len2);
p[len1 + 1 + len2] = 0;
c = p;
}
}
return c;
}
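/* Example:
*      combineComments("First line.\n", "Second line.\n")
* yields "First line.\n\nSecond line.\n", i.e. the two comments separated
* by a blank line; if either argument is NULL the other is returned as is.
*/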