blob: fc78500b247237ca1f08fb11e0628d04fbecc00b [file] [log] [blame]
/* JSON parsing
Copyright (C) 2017-2025 Free Software Foundation, Inc.
Contributed by David Malcolm <dmalcolm@redhat.com>.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "json-parsing.h"
#include "pretty-print.h"
#include "math.h"
#include "selftest.h"
using namespace json;
/* Declarations relating to parsing JSON, all within an
anonymous namespace. */
namespace {
/* A typedef representing a single unicode character. */
typedef unsigned unichar;
/* An enum for discriminating different kinds of JSON token.
   NOTE: the order of the enumerators must be kept in sync with
   the token_id_name array below, which is indexed by this enum.  */
enum token_id
{
  TOK_ERROR,
  TOK_EOF,

  /* Punctuation.  */
  TOK_OPEN_SQUARE,
  TOK_OPEN_CURLY,
  TOK_CLOSE_SQUARE,
  TOK_CLOSE_CURLY,
  TOK_COLON,
  TOK_COMMA,

  /* Literal names.  */
  TOK_TRUE,
  TOK_FALSE,
  TOK_NULL,

  TOK_STRING,
  TOK_FLOAT_NUMBER,
  TOK_INTEGER_NUMBER
};
/* Human-readable descriptions of enum token_id, for use in diagnostics.
   Indexed by enum token_id, so the entries must be kept in the same
   order as the enumerators.  Note that both TOK_FLOAT_NUMBER and
   TOK_INTEGER_NUMBER are described to the user as "number".  */
static const char *token_id_name[] = {
  "error",
  "EOF",
  "'['",
  "'{'",
  "']'",
  "'}'",
  "':'",
  "','",
  "'true'",
  "'false'",
  "'null'",
  "string",
  "number",
  "number"
};
/* Tokens within the JSON lexer.  */

struct token
{
  /* The kind of token.  */
  enum token_id id;

  /* The location of this token within the unicode
     character stream.  */
  location_map::range range;

  union
  {
    /* Value for TOK_ERROR and TOK_STRING.
       Heap-allocated (via xstrdup/xasprintf/XNEWVEC); ownership belongs
       to the token, and the buffer is freed by lexer::consume.  */
    char *string;

    /* Value for TOK_FLOAT_NUMBER.  */
    double float_number;

    /* Value for TOK_INTEGER_NUMBER.  */
    long integer_number;
  } u;
};
/* A class for lexing JSON: converts a buffer of unicode code points
   (built up incrementally via add_utf8) into a stream of tokens,
   with one token of lookahead (peek/consume).  */

class lexer
{
 public:
  lexer (bool support_comments);
  ~lexer ();

  /* Decode LENGTH bytes of UTF-8 from UTF8_BUF into the code-point
     buffer; returns non-null on malformed input.  */
  std::unique_ptr<error> add_utf8 (size_t length, const char *utf8_buf);

  /* One-token lookahead interface.  */
  const token *peek ();
  void consume ();

 private:
  bool get_char (unichar &out_char, location_map::point *out_point);
  void unget_char ();
  location_map::point get_next_point () const;
  static void dump_token (FILE *outf, const token *tok);
  void lex_token (token *out);
  void lex_string (token *out);
  void lex_number (token *out, unichar first_char);
  bool rest_of_literal (token *out, const char *suffix);
  std::unique_ptr<error> make_error (const char *msg);
  bool consume_single_line_comment (token *out);
  bool consume_multiline_comment (token *out);

 private:
  /* The decoded input, as unicode code points.  */
  auto_vec<unichar> m_buffer;

  /* Read position within m_buffer, plus the 1-based line and 0-based
     column of that position, tracked for diagnostics.  */
  int m_next_char_idx;
  int m_next_char_line;
  int m_next_char_column;
  int m_prev_line_final_column; /* for handling unget_char after a '\n'.  */

  /* Lookahead buffer: currently a single token.  */
  static const int MAX_TOKENS = 1;
  token m_next_tokens[MAX_TOKENS];
  int m_num_next_tokens;

  /* If true, accept // and /* ... *​/ comments as an extension
     to the JSON spec.  */
  bool m_support_comments;
};
/* A class for parsing JSON: a recursive-descent parser over the
   token stream produced by the embedded lexer.  */

class parser
{
 public:
  parser (location_map *out_loc_map,
	  bool support_comments);
  ~parser ();

  /* Feed UTF-8 input through to the lexer.  */
  std::unique_ptr<error>
  add_utf8 (size_t length, const char *utf8_buf);

  /* Recursive-descent productions; DEPTH guards against
     stack overflow on deeply-nested input.  */
  parser_result_t parse_value (int depth);
  parser_result_t parse_object (int depth);
  parser_result_t parse_array (int depth);

  /* Fail unless all input has been consumed.  */
  std::unique_ptr<error>
  require_eof ();

 private:
  location_map::point get_next_token_start ();
  location_map::point get_next_token_end ();

  std::unique_ptr<error>
  require (enum token_id tok_id);
  result<enum token_id, std::unique_ptr<error>>
  require_one_of (enum token_id tok_id_a, enum token_id tok_id_b);

  std::unique_ptr<error>
  error_at (const location_map::range &r,
	    const char *fmt, ...) ATTRIBUTE_PRINTF_3;

  void maybe_record_range (json::value *jv, const location_map::range &r);
  void maybe_record_range (json::value *jv,
			   const location_map::point &start,
			   const location_map::point &end);

 private:
  lexer m_lexer;

  /* Optional (possibly null) sink for recording the source ranges
     of values seen during parsing; borrowed, not owned.  */
  location_map *m_loc_map;
};
} // anonymous namespace for parsing implementation
/* Parser implementation. */
/* lexer's ctor.
   SUPPORT_COMMENTS enables the //-style and /*-style comment
   extension to the JSON spec.  Line numbering starts at 1,
   column numbering at 0.  */
lexer::lexer (bool support_comments)
: m_buffer (), m_next_char_idx (0),
  m_next_char_line (1), m_next_char_column (0),
  m_prev_line_final_column (-1),
  m_num_next_tokens (0),
  m_support_comments (support_comments)
{
}
/* lexer's dtor.
   Drain any remaining lookahead tokens via consume, which frees
   the heap-allocated strings owned by TOK_ERROR/TOK_STRING tokens.  */
lexer::~lexer ()
{
  while (m_num_next_tokens > 0)
    consume ();
}
/* Peek the next token, without consuming it.
   Lazily lexes one token into the lookahead buffer on demand.  */

const token *
lexer::peek ()
{
  if (m_num_next_tokens == 0)
    {
      /* Lookahead buffer is empty: lex a fresh token into slot 0.  */
      lex_token (&m_next_tokens[0]);
      m_num_next_tokens = 1;
    }
  return &m_next_tokens[0];
}
/* Consume the next token, releasing any resources it owns.  */

void
lexer::consume ()
{
  /* Make sure there is a token to consume.  */
  if (m_num_next_tokens == 0)
    peek ();

  gcc_assert (m_num_next_tokens > 0);
  gcc_assert (m_num_next_tokens <= MAX_TOKENS);

  /* Debug tracing of consumed tokens; disabled by default.  */
  if (0)
    {
      fprintf (stderr, "consuming token: ");
      dump_token (stderr, &m_next_tokens[0]);
      fprintf (stderr, "\n");
    }

  /* TOK_ERROR and TOK_STRING own their u.string buffer.  */
  if (m_next_tokens[0].id == TOK_ERROR
      || m_next_tokens[0].id == TOK_STRING)
    free (m_next_tokens[0].u.string);

  /* Shift any remaining lookahead tokens down one slot.  */
  m_num_next_tokens--;
  memmove (&m_next_tokens[0], &m_next_tokens[1],
	   sizeof (token) * m_num_next_tokens);
}
/* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this lexer's
   buffer.
   Return null if successful, or the error if there was a problem.  */

std::unique_ptr<error>
lexer::add_utf8 (size_t length, const char *utf8_buf)
{
  /* Adapted from charset.c:one_utf8_to_cppchar.  */
  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

  const uchar *inbuf = (const unsigned char *) (utf8_buf);
  const uchar **inbufp = &inbuf;
  size_t *inbytesleftp = &length;

  while (length > 0)
    {
      unichar c;
      /* NOTE: this inner "inbuf" deliberately shadows the outer one;
	 *inbufp is only advanced once a complete sequence has been
	 validated.  */
      const uchar *inbuf = *inbufp;
      size_t nbytes, i;

      c = *inbuf;
      /* Fast path: a single-byte (ASCII) character.  */
      if (c < 0x80)
	{
	  m_buffer.safe_push (c);
	  *inbytesleftp -= 1;
	  *inbufp += 1;
	  continue;
	}

      /* The number of leading 1-bits in the first byte indicates how many
	 bytes follow.  */
      for (nbytes = 2; nbytes < 7; nbytes++)
	if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
	  goto found;
      return make_error ("ill-formed UTF-8 sequence");
    found:

      /* The sequence must not be truncated by the end of the input.  */
      if (*inbytesleftp < nbytes)
	return make_error ("ill-formed UTF-8 sequence");

      /* Accumulate the payload bits of the lead byte, then of each
	 continuation byte (which must have the form 10xxxxxx).  */
      c = (c & masks[nbytes-1]);
      inbuf++;
      for (i = 1; i < nbytes; i++)
	{
	  unichar n = *inbuf++;
	  if ((n & 0xC0) != 0x80)
	    return make_error ("ill-formed UTF-8 sequence");
	  c = ((c << 6) + (n & 0x3F));
	}

      /* Make sure the shortest possible encoding was used.  */
      if ((   c <= 0x7F && nbytes > 1)
	  || (c <= 0x7FF && nbytes > 2)
	  || (c <= 0xFFFF && nbytes > 3)
	  || (c <= 0x1FFFFF && nbytes > 4)
	  || (c <= 0x3FFFFFF && nbytes > 5))
	return make_error ("ill-formed UTF-8:"
			   " shortest possible encoding not used");

      /* Make sure the character is valid: reject values above 31 bits
	 and the UTF-16 surrogate range U+D800..U+DFFF.  */
      if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF))
	return make_error ("ill-formed UTF-8: invalid character");

      m_buffer.safe_push (c);
      *inbufp = inbuf;
      *inbytesleftp -= nbytes;
    }
  return nullptr;
}
/* Attempt to get the next unicode character from this lexer's buffer.
   If successful, write it to OUT_CHAR, and its location to *OUT_POINT
   (if OUT_POINT is non-null), and return true.
   Otherwise, return false.  */

bool
lexer::get_char (unichar &out_char, location_map::point *out_point)
{
  if (m_next_char_idx >= (int)m_buffer.length ())
    return false;

  if (out_point)
    *out_point = get_next_point ();
  out_char = m_buffer[m_next_char_idx++];

  /* Maintain line/column tracking; remember the final column of the
     line we just left so that a single unget_char across the newline
     can restore it.  */
  if (out_char == '\n')
    {
      m_next_char_line++;
      m_prev_line_final_column = m_next_char_column;
      m_next_char_column = 0;
    }
  else
    m_next_char_column++;

  return true;
}
/* Undo the last successful get_char, restoring the read index and the
   line/column tracking.  */

void
lexer::unget_char ()
{
  --m_next_char_idx;
  if (m_next_char_column > 0)
    --m_next_char_column;
  else
    {
      /* The char being ungotten was a '\n': step back to the end of
	 the previous line, whose final column was stashed by
	 get_char.  */
      m_next_char_line--;
      m_next_char_column = m_prev_line_final_column;
      /* We don't support more than one unget_char in a row.  */
      gcc_assert (m_prev_line_final_column != -1);
      m_prev_line_final_column = -1;
    }
}
/* Get the location (index, line, column) of the next char to be read.  */

location_map::point
lexer::get_next_point () const
{
  location_map::point result;
  result.m_unichar_idx = m_next_char_idx;
  result.m_line = m_next_char_line;
  result.m_column = m_next_char_column;
  return result;
}
/* Print a textual representation of TOK to OUTF.
   This is intended for debugging the lexer and parser,
   rather than for user-facing output.  */

void
lexer::dump_token (FILE *outf, const token *tok)
{
  switch (tok->id)
    {
    /* Tokens carrying a payload print it; the rest just print
       their enumerator name.  */
    case TOK_ERROR:
      fprintf (outf, "TOK_ERROR (\"%s\")", tok->u.string);
      break;

    case TOK_EOF:
      fprintf (outf, "TOK_EOF");
      break;

    case TOK_OPEN_SQUARE:
      fprintf (outf, "TOK_OPEN_SQUARE");
      break;

    case TOK_OPEN_CURLY:
      fprintf (outf, "TOK_OPEN_CURLY");
      break;

    case TOK_CLOSE_SQUARE:
      fprintf (outf, "TOK_CLOSE_SQUARE");
      break;

    case TOK_CLOSE_CURLY:
      fprintf (outf, "TOK_CLOSE_CURLY");
      break;

    case TOK_COLON:
      fprintf (outf, "TOK_COLON");
      break;

    case TOK_COMMA:
      fprintf (outf, "TOK_COMMA");
      break;

    case TOK_TRUE:
      fprintf (outf, "TOK_TRUE");
      break;

    case TOK_FALSE:
      fprintf (outf, "TOK_FALSE");
      break;

    case TOK_NULL:
      fprintf (outf, "TOK_NULL");
      break;

    case TOK_STRING:
      fprintf (outf, "TOK_STRING (\"%s\")", tok->u.string);
      break;

    case TOK_FLOAT_NUMBER:
      fprintf (outf, "TOK_FLOAT_NUMBER (%f)", tok->u.float_number);
      break;

    case TOK_INTEGER_NUMBER:
      fprintf (outf, "TOK_INTEGER_NUMBER (%ld)", tok->u.integer_number);
      break;

    default:
      gcc_unreachable ();
      break;
    }
}
/* Treat "//" as a comment to the end of the line.
   This isn't compliant with the JSON spec,
   but is very handy for writing DejaGnu tests.
   Return true if EOF and populate *OUT, false otherwise.  */

bool
lexer::consume_single_line_comment (token *out)
{
  for (;;)
    {
      unichar ch;
      if (!get_char (ch, nullptr))
	{
	  /* Input ended inside the comment: synthesize an EOF token
	     at the current position.  */
	  out->id = TOK_EOF;
	  const location_map::point p = get_next_point ();
	  out->range.m_start = p;
	  out->range.m_end = p;
	  return true;
	}
      /* The newline terminates the comment.  */
      if (ch == '\n')
	return false;
    }
}
/* Treat '/' '*' as a multiline comment until the next closing '*' '/'.
   This isn't compliant with the JSON spec,
   but is very handy for writing DejaGnu tests.
   Return true if EOF and populate *OUT, false otherwise.
   Previously EOF within a comment hit gcc_unreachable (an ICE);
   instead report it as a TOK_ERROR token.  */

bool
lexer::consume_multiline_comment (token *out)
{
  while (1)
    {
      unichar next_char;
      if (!get_char (next_char, nullptr))
	{
	  /* EOF before the comment was closed: report an error token.
	     u.string is owned by the token and freed by
	     lexer::consume, so it must be heap-allocated.  */
	  out->id = TOK_ERROR;
	  out->u.string = xstrdup ("EOF within multiline comment");
	  location_map::point p = get_next_point ();
	  out->range.m_start = p;
	  out->range.m_end = p;
	  return true;
	}
      if (next_char != '*')
	continue;
      /* Saw a '*'; check whether the next char closes the comment.  */
      if (!get_char (next_char, nullptr))
	{
	  out->id = TOK_ERROR;
	  out->u.string = xstrdup ("EOF within multiline comment");
	  location_map::point p = get_next_point ();
	  out->range.m_start = p;
	  out->range.m_end = p;
	  return true;
	}
      if (next_char == '/')
	return false;
    }
}
/* Attempt to lex the input buffer, writing the next token to OUT.
   On errors, TOK_ERROR (or TOK_EOF) is written to OUT.  */

void
lexer::lex_token (token *out)
{
  /* Skip to next non-whitespace char.  */
  unichar next_char;
  location_map::point start_point;
  while (1)
    {
      if (!get_char (next_char, &start_point))
	{
	  /* Out of input: synthesize an EOF token located just past
	     the final character.  */
	  out->id = TOK_EOF;
	  location_map::point p = get_next_point ();
	  out->range.m_start = p;
	  out->range.m_end = p;
	  return;
	}
      if (m_support_comments)
	if (next_char == '/')
	  {
	    location_map::point point;
	    unichar next_next_char;
	    if (get_char (next_next_char, &point))
	      {
		switch (next_next_char)
		  {
		  case '/':
		    /* "//" comment: skip to end of line; a true
		       return means EOF was hit and OUT was filled.  */
		    if (consume_single_line_comment (out))
		      return;
		    continue;
		  case '*':
		    /* "/*" comment: skip to the closing delimiter.  */
		    if (consume_multiline_comment (out))
		      return;
		    continue;
		  default:
		    /* A stray single '/'.  Break out of loop, so that we
		       handle it below as an unexpected character.
		       NOTE(review): next_next_char has been consumed
		       and is not ungotten here, so it appears to be
		       dropped on this (error) path — confirm this is
		       acceptable for diagnostics.  */
		    goto non_whitespace;
		  }
	      }
	  }
      if (next_char != ' '
	  && next_char != '\t'
	  && next_char != '\n'
	  && next_char != '\r')
	break;
    }

 non_whitespace:

  /* Default to a single-character range; multi-char tokens extend
     m_end as they are lexed.  */
  out->range.m_start = start_point;
  out->range.m_end = start_point;

  switch (next_char)
    {
    case '[':
      out->id = TOK_OPEN_SQUARE;
      break;

    case '{':
      out->id = TOK_OPEN_CURLY;
      break;

    case ']':
      out->id = TOK_CLOSE_SQUARE;
      break;

    case '}':
      out->id = TOK_CLOSE_CURLY;
      break;

    case ':':
      out->id = TOK_COLON;
      break;

    case ',':
      out->id = TOK_COMMA;
      break;

    case '"':
      lex_string (out);
      break;

    case '-':
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      lex_number (out, next_char);
      break;

    case 't':
      /* Handle literal "true".  */
      if (rest_of_literal (out, "rue"))
	{
	  out->id = TOK_TRUE;
	  break;
	}
      else
	goto err;

    case 'f':
      /* Handle literal "false".  */
      if (rest_of_literal (out, "alse"))
	{
	  out->id = TOK_FALSE;
	  break;
	}
      else
	goto err;

    case 'n':
      /* Handle literal "null".  */
      if (rest_of_literal (out, "ull"))
	{
	  out->id = TOK_NULL;
	  break;
	}
      else
	goto err;

    err:
    default:
      out->id = TOK_ERROR;
      out->u.string = xasprintf ("unexpected character: '%c'", next_char);
      break;
    }
}
/* Having consumed an open-quote character from the lexer's buffer, attempt
   to lex the rest of a JSON string, writing the result to OUT (or TOK_ERROR)
   if an error occurred.
   The accumulated code points are re-encoded as UTF-8 into a
   heap-allocated buffer owned by the token (freed by lexer::consume).
   (ECMA-404 section 9; RFC 7159 section 7).  */

void
lexer::lex_string (token *out)
{
  /* The decoded code points of the string body.  */
  auto_vec<unichar> content;
  bool still_going = true;
  while (still_going)
    {
      unichar uc;
      if (!get_char (uc, &out->range.m_end))
	{
	  out->id = TOK_ERROR;
	  out->range.m_end = get_next_point ();
	  out->u.string = xstrdup ("EOF within string");
	  return;
	}
      switch (uc)
	{
	case '"':
	  /* Closing quote: stop accumulating.  */
	  still_going = false;
	  break;

	case '\\':
	  {
	    unichar next_char;
	    if (!get_char (next_char, &out->range.m_end))
	      {
		out->id = TOK_ERROR;
		out->range.m_end = get_next_point ();
		out->u.string = xstrdup ("EOF within string");;
		return;
	      }
	    switch (next_char)
	      {
	      /* Simple escapes that map to themselves.  */
	      case '"':
	      case '\\':
	      case '/':
		content.safe_push (next_char);
		break;

	      /* Named control-character escapes.  */
	      case 'b':
		content.safe_push ('\b');
		break;
	      case 'f':
		content.safe_push ('\f');
		break;
	      case 'n':
		content.safe_push ('\n');
		break;
	      case 'r':
		content.safe_push ('\r');
		break;
	      case 't':
		content.safe_push ('\t');
		break;

	      case 'u':
		{
		  /* \uXXXX: exactly four hex digits.
		     NOTE(review): surrogate pairs (\uD83D\uDE00 etc.)
		     are not combined here; each half is pushed as its
		     own code point — confirm whether that matters for
		     callers.  */
		  unichar result = 0;
		  for (int i = 0; i < 4; i++)
		    {
		      unichar hexdigit;
		      if (!get_char (hexdigit, &out->range.m_end))
			{
			  out->id = TOK_ERROR;
			  out->range.m_end = get_next_point ();
			  out->u.string = xstrdup ("EOF within string");
			  return;
			}
		      result <<= 4;
		      if (hexdigit >= '0' && hexdigit <= '9')
			result += hexdigit - '0';
		      else if (hexdigit >= 'a' && hexdigit <= 'f')
			result += (hexdigit - 'a') + 10;
		      else if (hexdigit >= 'A' && hexdigit <= 'F')
			result += (hexdigit - 'A') + 10;
		      else
			{
			  out->id = TOK_ERROR;
			  out->range.m_start = out->range.m_end;
			  out->u.string = xstrdup ("bogus hex char");
			  return;
			}
		    }
		  content.safe_push (result);
		}
		break;

	      default:
		out->id = TOK_ERROR;
		out->u.string = xstrdup ("unrecognized escape char");
		return;
	      }
	  }
	  break;

	default:
	  /* Reject unescaped control characters U+0000 through U+001F
	     (ECMA-404 section 9 para 1; RFC 7159 section 7 para 1).  */
	  if (uc <= 0x1f)
	    {
	      out->id = TOK_ERROR;
	      out->range.m_start = out->range.m_end;
	      out->u.string = xstrdup ("unescaped control char");
	      return;
	    }

	  /* Otherwise, add regular unicode code point.  */
	  content.safe_push (uc);
	  break;
	}
    }

  out->id = TOK_STRING;

  /* Re-encode the accumulated code points as UTF-8.  */
  auto_vec<char> utf8_buf;
  // Adapted from libcpp/charset.c:one_cppchar_to_utf8
  for (unsigned i = 0; i < content.length (); i++)
    {
      static const uchar masks[6]  = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
      static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
      size_t nbytes;
      uchar buf[6], *p = &buf[6];
      unichar c = content[i];

      nbytes = 1;
      if (c < 0x80)
	*--p = c;
      else
	{
	  /* Emit continuation bytes from least-significant bits first
	     (building the sequence backwards), then the lead byte.  */
	  do
	    {
	      *--p = ((c & 0x3F) | 0x80);
	      c >>= 6;
	      nbytes++;
	    }
	  while (c >= 0x3F || (c & limits[nbytes-1]));
	  *--p = (c | masks[nbytes-1]);
	}
      while (p < &buf[6])
	utf8_buf.safe_push (*p++);
    }

  /* Copy into a NUL-terminated heap buffer owned by the token.  */
  out->u.string = XNEWVEC (char, utf8_buf.length () + 1);
  for (unsigned i = 0; i < utf8_buf.length (); i++)
    out->u.string[i] = utf8_buf[i];
  out->u.string[utf8_buf.length ()] = '\0';
}
/* Having consumed FIRST_CHAR, an initial digit or '-' character from
   the lexer's buffer attempt to lex the rest of a JSON number, writing
   the result to OUT (or TOK_ERROR) if an error occurred.
   Fix: the error paths below used to assign out->range.m_start twice
   in a row; the second assignment now sets m_end, giving the error
   token a well-formed single-point range (matching the style used by
   lex_string's error paths).
   (ECMA-404 section 8; RFC 7159 section 6).  */

void
lexer::lex_number (token *out, unichar first_char)
{
  bool negate = false;
  double value = 0.0;

  /* Leading '-': consume it and require at least one digit after it.  */
  if (first_char == '-')
    {
      negate = true;
      if (!get_char (first_char, &out->range.m_end))
	{
	  out->id = TOK_ERROR;
	  out->range.m_start = out->range.m_end;
	  out->u.string = xstrdup ("expected digit");
	  return;
	}
    }

  if (first_char == '0')
    value = 0.0;
  else if (!ISDIGIT (first_char))
    {
      out->id = TOK_ERROR;
      out->range.m_start = out->range.m_end;
      out->u.string = xstrdup ("expected digit");
      return;
    }
  else
    {
      /* Got a nonzero digit; expect zero or more digits.  */
      value = first_char - '0';
      while (1)
	{
	  unichar uc;
	  location_map::point point;
	  if (!get_char (uc, &point))
	    break;
	  if (ISDIGIT (uc))
	    {
	      value *= 10;
	      value += uc - '0';
	      out->range.m_end = point;
	      continue;
	    }
	  else
	    {
	      unget_char ();
	      break;
	    }
	}
    }

  /* Optional '.', followed by one or more decimals.  */
  unichar next_char;
  location_map::point point;
  if (get_char (next_char, &point))
    {
      if (next_char == '.')
	{
	  /* Parse decimal digits.  */
	  bool had_digit = false;
	  double digit_factor = 0.1;
	  while (get_char (next_char, &point))
	    {
	      if (!ISDIGIT (next_char))
		{
		  unget_char ();
		  break;
		}
	      value += (next_char - '0') * digit_factor;
	      digit_factor *= 0.1;
	      had_digit = true;
	      out->range.m_end = point;
	    }
	  if (!had_digit)
	    {
	      out->id = TOK_ERROR;
	      out->range.m_start = point;
	      out->range.m_end = point;
	      out->u.string = xstrdup ("expected digit");
	      return;
	    }
	}
      else
	unget_char ();
    }

  /* Parse 'e' and 'E'.  */
  unichar exponent_char;
  if (get_char (exponent_char, &point))
    {
      if (exponent_char == 'e' || exponent_char == 'E')
	{
	  /* Optional +/-.  */
	  unichar sign_char;
	  int exponent = 0;
	  bool negate_exponent = false;
	  bool had_exponent_digit = false;
	  if (!get_char (sign_char, &point))
	    {
	      out->id = TOK_ERROR;
	      out->range.m_start = point;
	      out->range.m_end = point;
	      out->u.string = xstrdup ("EOF within exponent");
	      return;
	    }
	  if (sign_char == '-')
	    negate_exponent = true;
	  else if (sign_char == '+')
	    ;
	  else if (ISDIGIT (sign_char))
	    {
	      exponent = sign_char - '0';
	      had_exponent_digit = true;
	    }
	  else
	    {
	      out->id = TOK_ERROR;
	      out->range.m_start = point;
	      out->range.m_end = point;
	      out->u.string
		= xstrdup ("expected '-','+' or digit within exponent");
	      return;
	    }
	  out->range.m_end = point;

	  /* One or more digits (we might have seen the digit above,
	     though).  */
	  while (1)
	    {
	      unichar uc;
	      location_map::point point;
	      if (!get_char (uc, &point))
		break;
	      if (ISDIGIT (uc))
		{
		  exponent *= 10;
		  exponent += uc - '0';
		  had_exponent_digit = true;
		  out->range.m_end = point;
		  continue;
		}
	      else
		{
		  unget_char ();
		  break;
		}
	    }
	  if (!had_exponent_digit)
	    {
	      out->id = TOK_ERROR;
	      out->range.m_start = point;
	      out->range.m_end = point;
	      out->u.string = xstrdup ("expected digit within exponent");
	      return;
	    }
	  if (negate_exponent)
	    exponent = -exponent;
	  value = value * pow (10, exponent);
	}
      else
	unget_char ();
    }

  if (negate)
    value = -value;

  /* Values that round-trip exactly through "long" become integer
     tokens; everything else becomes a float token.
     NOTE(review): for values outside the range of "long" the
     conversion in this comparison is implementation-dependent —
     confirm inputs of that magnitude are acceptable here.  */
  if (value == (long)value)
    {
      out->id = TOK_INTEGER_NUMBER;
      out->u.integer_number = value;
    }
  else
    {
      out->id = TOK_FLOAT_NUMBER;
      out->u.float_number = value;
    }
}
/* Determine if the next characters to be lexed match SUFFIX.
   SUFFIX must be pure ASCII and not contain newlines.
   If so, consume the characters and return true, extending OUT's
   range (and the lexer's position/column) by the suffix length.
   Otherwise, return false without consuming anything.  */

bool
lexer::rest_of_literal (token *out, const char *suffix)
{
  int suffix_idx = 0;
  int buf_idx = m_next_char_idx;
  while (1)
    {
      if (suffix[suffix_idx] == '\0')
	{
	  /* Full match: commit by advancing the read position and
	     widening the token's range.  Since SUFFIX has no
	     newlines, a simple column adjustment suffices.  */
	  m_next_char_idx += suffix_idx;
	  m_next_char_column += suffix_idx;
	  out->range.m_end.m_unichar_idx += suffix_idx;
	  out->range.m_end.m_column += suffix_idx;
	  return true;
	}
      if (buf_idx >= (int)m_buffer.length ())
	return false;
      /* This assumes that suffix is ASCII.  */
      if (m_buffer[buf_idx] != (unichar)suffix[suffix_idx])
	return false;
      buf_idx++;
      suffix_idx++;
    }
}
/* Create a new error instance for MSG, using the location of the next
character for the location of the error. */
std::unique_ptr<error>
lexer::make_error (const char *msg)
{
location_map::point p;
p.m_unichar_idx = m_next_char_idx;
p.m_line = m_next_char_line;
p.m_column = m_next_char_column;
location_map::range r;
r.m_start = p;
r.m_end = p;
return std::make_unique<error> (r, xstrdup (msg));
}
/* parser's ctor.
   OUT_LOC_MAP is an optional (possibly null) sink for source ranges;
   it is borrowed, not owned.  */
parser::parser (location_map *out_loc_map,
		bool support_comments)
: m_lexer (support_comments), m_loc_map (out_loc_map)
{
}
/* parser's dtor.
   Notify the location map (if any) that parsing is complete.  */
parser::~parser ()
{
  if (m_loc_map)
    m_loc_map->on_finished_parsing ();
}
/* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this parser's
   lexer's buffer.
   Return null on success, or the decoding error on failure.  */

std::unique_ptr<error>
parser::add_utf8 (size_t length, const char *utf8_buf)
{
  return m_lexer.add_utf8 (length, utf8_buf);
}
/* Parse a JSON value (object, array, number, string, or literal).
   DEPTH is the current nesting depth, used to bound recursion.
   (ECMA-404 section 5; RFC 7159 section 3).  */

parser_result_t
parser::parse_value (int depth)
{
  const token *tok = m_lexer.peek ();

  /* Avoid stack overflow with deeply-nested inputs; RFC 7159 section 9
     states: "An implementation may set limits on the maximum depth
     of nesting.".
     Ideally we'd avoid this limit (e.g. by rewriting parse_value,
     parse_object, and parse_array into a single function with a vec of
     state).  */
  const int MAX_DEPTH = 100;
  if (depth >= MAX_DEPTH)
    return error_at (tok->range, "maximum nesting depth exceeded: %i",
		     MAX_DEPTH);

  /* In each leaf case below: build the value from the peeked token's
     payload, consume the token, then record its range (the token
     pointer's range field was copied before consume invalidates it).  */
  switch (tok->id)
    {
    case TOK_OPEN_CURLY:
      return parse_object (depth);

    case TOK_STRING:
      {
	auto val = std::make_unique<string> (tok->u.string);
	m_lexer.consume ();
	maybe_record_range (val.get (), tok->range);
	return parser_result_t (std::move (val));
      }

    case TOK_OPEN_SQUARE:
      return parse_array (depth);

    case TOK_FLOAT_NUMBER:
      {
	auto val = std::make_unique<float_number> (tok->u.float_number);
	m_lexer.consume ();
	maybe_record_range (val.get (), tok->range);
	return parser_result_t (std::move (val));
      }

    case TOK_INTEGER_NUMBER:
      {
	auto val = std::make_unique<integer_number> (tok->u.integer_number);
	m_lexer.consume ();
	maybe_record_range (val.get (), tok->range);
	return parser_result_t (std::move (val));
      }

    case TOK_TRUE:
      {
	auto val = std::make_unique<literal> (JSON_TRUE);
	m_lexer.consume ();
	maybe_record_range (val.get (), tok->range);
	return parser_result_t (std::move (val));
      }

    case TOK_FALSE:
      {
	auto val = std::make_unique<literal> (JSON_FALSE);
	m_lexer.consume ();
	maybe_record_range (val.get (), tok->range);
	return parser_result_t (std::move (val));
      }

    case TOK_NULL:
      {
	auto val = std::make_unique<literal> (JSON_NULL);
	m_lexer.consume ();
	maybe_record_range (val.get (), tok->range);
	return parser_result_t (std::move (val));
      }

    case TOK_ERROR:
      return error_at (tok->range, "invalid JSON token: %s", tok->u.string);

    default:
      return error_at (tok->range, "expected a JSON value but got %s",
		       token_id_name[tok->id]);
    }
}
/* Parse a JSON object.
   DEPTH is the current nesting depth.
   (ECMA-404 section 6; RFC 7159 section 4).  */

parser_result_t
parser::parse_object (int depth)
{
  location_map::point start = get_next_token_start ();

  /* Propagate any failure from consuming the opening brace, for
     consistency with parse_array (parse_object is currently only
     reached when the next token is TOK_OPEN_CURLY, but don't rely
     on that here).  */
  if (auto err = require (TOK_OPEN_CURLY))
    return parser_result_t (std::move (err));

  auto obj = std::make_unique<object> ();

  const token *tok = m_lexer.peek ();
  if (tok->id == TOK_CLOSE_CURLY)
    {
      /* Empty object.  */
      location_map::point end = get_next_token_end ();
      maybe_record_range (obj.get (), start, end);
      if (auto err = require (TOK_CLOSE_CURLY))
	return parser_result_t (std::move (err));
      return parser_result_t (std::move (obj));
    }

  if (tok->id != TOK_STRING)
    return error_at (tok->range,
		     "expected string for object key after '{'; got %s",
		     token_id_name[tok->id]);

  /* One KEY ':' VALUE pair per iteration, separated by commas.  */
  while (true)
    {
      tok = m_lexer.peek ();
      if (tok->id != TOK_STRING)
	return error_at (tok->range,
			 "expected string for object key after ','; got %s",
			 token_id_name[tok->id]);
      /* Copy the key before consuming the token (which frees it).  */
      label_text key = label_text::take (xstrdup (tok->u.string));
      m_lexer.consume ();

      if (auto err = require (TOK_COLON))
	return parser_result_t (std::move (err));

      parser_result_t r = parse_value (depth + 1);
      if (r.m_err)
	return r;
      if (!r.m_val)
	return parser_result_t (std::move (obj));

      /* We don't enforce uniqueness for keys.  */
      obj->set (key.get (), std::move (r.m_val));

      location_map::point end = get_next_token_end ();

      /* Expect either a comma (more members) or the closing brace.  */
      result<enum token_id, std::unique_ptr<error>> result
	(require_one_of (TOK_COMMA, TOK_CLOSE_CURLY));
      if (result.m_err)
	return parser_result_t (std::move (result.m_err));
      if (result.m_val == TOK_COMMA)
	continue;
      else
	{
	  /* TOK_CLOSE_CURLY.  */
	  maybe_record_range (obj.get (), start, end);
	  return parser_result_t (std::move (obj));
	}
    }
}
/* Parse a JSON array.
   DEPTH is the current nesting depth.
   (ECMA-404 section 7; RFC 7159 section 5).  */

parser_result_t
parser::parse_array (int depth)
{
  location_map::point start = get_next_token_start ();
  if (auto err = require (TOK_OPEN_SQUARE))
    return parser_result_t (std::move (err));

  auto arr = std::make_unique<array> ();

  const token *tok = m_lexer.peek ();
  if (tok->id == TOK_CLOSE_SQUARE)
    {
      /* Empty array; the ']' was peeked, so a plain consume suffices.  */
      location_map::point end = get_next_token_end ();
      maybe_record_range (arr.get (), start, end);
      m_lexer.consume ();
      return parser_result_t (std::move (arr));
    }

  /* One element per iteration, separated by commas.  */
  while (true)
    {
      parser_result_t r = parse_value (depth + 1);
      if (r.m_err)
	return r;

      arr->append (std::move (r.m_val));

      location_map::point end = get_next_token_end ();

      /* Expect either a comma (more elements) or the closing bracket.  */
      result<enum token_id, std::unique_ptr<error>> result
	(require_one_of (TOK_COMMA, TOK_CLOSE_SQUARE));
      if (result.m_err)
	return parser_result_t (std::move (result.m_err));
      if (result.m_val == TOK_COMMA)
	continue;
      else
	{
	  /* TOK_CLOSE_SQUARE.  */
	  maybe_record_range (arr.get (), start, end);
	  return parser_result_t (std::move (arr));
	}
    }
}
/* Get the start point of the next token (without consuming it).  */

location_map::point
parser::get_next_token_start ()
{
  return m_lexer.peek ()->range.m_start;
}
/* Get the end point of the next token (without consuming it).  */

location_map::point
parser::get_next_token_end ()
{
  return m_lexer.peek ()->range.m_end;
}
/* Require an EOF, or fail if there is surplus input.
   Return null on success, or the error describing the surplus token.  */

std::unique_ptr<error>
parser::require_eof ()
{
  return require (TOK_EOF);
}
/* Consume the next token, issuing an error if it is not of kind TOK_ID.
   Return null on success, non-null on mismatch.  */

std::unique_ptr<error>
parser::require (enum token_id tok_id)
{
  const token *tok = m_lexer.peek ();

  /* Matched: consume it and report success.  */
  if (tok->id == tok_id)
    {
      m_lexer.consume ();
      return nullptr;
    }

  /* Mismatch: build a diagnostic, including the lexer's own error
     text if the bad token came from a lexing failure.  */
  if (tok->id == TOK_ERROR)
    return error_at (tok->range,
		     "expected %s; got bad token: %s",
		     token_id_name[tok_id], tok->u.string);
  return error_at (tok->range,
		   "expected %s; got %s", token_id_name[tok_id],
		   token_id_name[tok->id]);
}
/* Consume the next token, issuing an error if it is not of
   kind TOK_ID_A or TOK_ID_B.
   Return which kind it was.  */

result<enum token_id, std::unique_ptr<error>>
parser::require_one_of (enum token_id tok_id_a, enum token_id tok_id_b)
{
  const token *tok = m_lexer.peek ();
  const enum token_id actual = tok->id;

  /* Matched one of the two acceptable kinds: consume it and
     report which one was seen.  */
  if (actual == tok_id_a || actual == tok_id_b)
    {
      m_lexer.consume ();
      return result<enum token_id, std::unique_ptr<error>> (actual);
    }

  /* Mismatch: build a diagnostic, including the lexer's own error
     text if the bad token came from a lexing failure.  */
  if (actual == TOK_ERROR)
    return error_at (tok->range, "expected %s or %s; got bad token: %s",
		     token_id_name[tok_id_a], token_id_name[tok_id_b],
		     tok->u.string);
  return error_at (tok->range, "expected %s or %s; got %s",
		   token_id_name[tok_id_a], token_id_name[tok_id_b],
		   token_id_name[actual]);
}
/* Generate a parsing error at range R, with a printf-style message.
   The formatted message is heap-allocated; ownership passes to the
   returned error.  */

std::unique_ptr<error>
parser::error_at (const location_map::range &r, const char *fmt, ...)
{
  va_list ap;
  va_start (ap, fmt);
  char *formatted_msg = xvasprintf (fmt, ap);
  va_end (ap);

  return std::make_unique<error> (r, formatted_msg);
}
/* Record that JV has range R within the input file,
   if we have a location map to record it in.  */

void
parser::maybe_record_range (json::value *jv, const location_map::range &r)
{
  if (m_loc_map)
    m_loc_map->record_range_for_value (jv, r);
}
/* Record that JV has range START to END within the input file,
   if we have a location map to record it in.  */

void
parser::maybe_record_range (json::value *jv,
			    const location_map::point &start,
			    const location_map::point &end)
{
  location_map::range r;
  r.m_start = start;
  r.m_end = end;
  /* Delegate to the range-based overload, which performs
     the m_loc_map null check.  */
  maybe_record_range (jv, r);
}
/* Attempt to parse the UTF-8 encoded buffer at UTF8_BUF
   of the given LENGTH.
   If ALLOW_COMMENTS is true, then allow C and C++ style-comments in the
   buffer, as an extension to JSON, otherwise forbid them.
   If successful, return an json::value in the result.
   if there was a problem, return a json::error in the result.
   If OUT_LOC_MAP is non-NULL, notify *OUT_LOC_MAP about
   source locations of nodes seen during parsing.  */

parser_result_t
json::parse_utf8_string (size_t length,
			 const char *utf8_buf,
			 bool allow_comments,
			 location_map *out_loc_map)
{
  parser p (out_loc_map, allow_comments);
  /* Decode; bail out on malformed UTF-8.  */
  if (auto err = p.add_utf8 (length, utf8_buf))
    return parser_result_t (std::move (err));
  /* Parse a single top-level value...  */
  parser_result_t r = p.parse_value (0);
  if (r.m_err)
    return r;
  /* ...and reject any trailing input after it.  */
  if (auto err = p.require_eof ())
    return parser_result_t (std::move (err));
  return r;
}
/* Attempt to parse the nil-terminated UTF-8 encoded buffer at
   UTF8_BUF.
   If ALLOW_COMMENTS is true, then allow C and C++ style-comments in the
   buffer, as an extension to JSON, otherwise forbid them.
   If successful, return an json::value in the result;
   if there was a problem, return a json::error in the result.
   If OUT_LOC_MAP is non-NULL, notify *OUT_LOC_MAP about
   source locations of nodes seen during parsing.  */

json::parser_result_t
json::parse_utf8_string (const char *utf8,
			 bool allow_comments,
			 location_map *out_loc_map)
{
  /* Delegate to the length-taking overload.  */
  return parse_utf8_string (strlen (utf8), utf8, allow_comments,
			    out_loc_map);
}
#if CHECKING_P
namespace selftest {
/* Selftests. */
#define ASSERT_PRINT_EQ(JV, FORMATTED, EXPECTED_JSON) \
assert_print_eq (SELFTEST_LOCATION, JV, FORMATTED, EXPECTED_JSON)
/* Implementation detail of ASSERT_RANGE_EQ.
   Assert, at LOC, that ACTUAL_POINT has the given index, line,
   and column.  */

static void
assert_point_eq (const location &loc,
		 const location_map::point &actual_point,
		 size_t exp_unichar_idx, int exp_line, int exp_column)
{
  ASSERT_EQ_AT (loc, actual_point.m_unichar_idx, exp_unichar_idx);
  ASSERT_EQ_AT (loc, actual_point.m_line, exp_line);
  ASSERT_EQ_AT (loc, actual_point.m_column, exp_column);
}
/* Implementation detail of ASSERT_RANGE_EQ.
   Assert, at LOC, that both endpoints of ACTUAL_RANGE match the
   expected index/line/column triples.  */

static void
assert_range_eq (const location &loc,
		 const location_map::range &actual_range,
		 /* Expected location.  */
		 size_t start_unichar_idx, int start_line, int start_column,
		 size_t end_unichar_idx, int end_line, int end_column)
{
  assert_point_eq (loc, actual_range.m_start,
		   start_unichar_idx, start_line, start_column);
  assert_point_eq (loc, actual_range.m_end,
		   end_unichar_idx, end_line, end_column);
}
/* Assert that ACTUAL_RANGE starts at
(START_UNICHAR_IDX, START_LINE, START_COLUMN)
and ends at (END_UNICHAR_IDX, END_LINE, END_COLUMN). */
#define ASSERT_RANGE_EQ(ACTUAL_RANGE, \
START_UNICHAR_IDX, START_LINE, START_COLUMN, \
END_UNICHAR_IDX, END_LINE, END_COLUMN) \
assert_range_eq ((SELFTEST_LOCATION), (ACTUAL_RANGE), \
(START_UNICHAR_IDX), (START_LINE), (START_COLUMN), \
(END_UNICHAR_IDX), (END_LINE), (END_COLUMN))
/* Implementation detail of ASSERT_ERR_EQ.
   Assert, at LOC, that ACTUAL_ERR is non-null, has the expected
   range, and has message EXPECTED_MSG.  */

static void
assert_err_eq (const location &loc,
	       const json::error *actual_err,
	       /* Expected location.  */
	       size_t start_unichar_idx, int start_line, int start_column,
	       size_t end_unichar_idx, int end_line, int end_column,
	       const char *expected_msg)
{
  ASSERT_TRUE_AT (loc, actual_err);
  /* Reuse assert_range_eq rather than duplicating the six
     per-field comparisons.  */
  assert_range_eq (loc, actual_err->get_range (),
		   start_unichar_idx, start_line, start_column,
		   end_unichar_idx, end_line, end_column);
  ASSERT_STREQ_AT (loc, actual_err->get_msg (), expected_msg);
}
/* Assert that ACTUAL_ERR is a non-NULL json::error *,
with message EXPECTED_MSG, and that its location starts
at (START_UNICHAR_IDX, START_LINE, START_COLUMN)
and ends at (END_UNICHAR_IDX, END_LINE, END_COLUMN). */
#define ASSERT_ERR_EQ(ACTUAL_ERR, \
START_UNICHAR_IDX, START_LINE, START_COLUMN, \
END_UNICHAR_IDX, END_LINE, END_COLUMN, \
EXPECTED_MSG) \
assert_err_eq ((SELFTEST_LOCATION), (ACTUAL_ERR), \
(START_UNICHAR_IDX), (START_LINE), (START_COLUMN), \
(END_UNICHAR_IDX), (END_LINE), (END_COLUMN), \
(EXPECTED_MSG))
/* Verify that the JSON lexer works as expected: feed it a three-line
   input containing numbers, punctuation, literal names and a string,
   and check each token's kind, value (where applicable), and
   (unichar index, line, column) range.
   Comment support is disabled (lexer ctor arg is false).  */
static void
test_lexer ()
{
  lexer l (false);
  const char *str
    /* 0 1 2 3 4 . */
    /* 01234567890123456789012345678901234567890123456789. */
    = (" 1066 -1 \n"
       " -273.15 1e6\n"
       " [ ] null true false { } \"foo\" \n");
  auto err = l.add_utf8 (strlen (str), str);
  ASSERT_EQ (err, nullptr);
  /* Line 1.  line_offset is the unichar index at which the line
     starts within the whole input.  */
  {
    const size_t line_offset = 0;
    /* Expect token: "1066" in columns 4-7. */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
      ASSERT_EQ (tok->u.integer_number, 1066);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 4, 1, 4,
		       line_offset + 7, 1, 7);
      l.consume ();
    }
    /* Expect token: "-1" in columns 11-12. */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
      ASSERT_EQ (tok->u.integer_number, -1);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 11, 1, 11,
		       line_offset + 12, 1, 12);
      l.consume ();
    }
  }
  /* Line 2. */
  {
    const size_t line_offset = 16;
    /* Expect token: "-273.15" in columns 4-10. */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_FLOAT_NUMBER);
      /* Compare as ints to avoid exact floating-point equality.  */
      ASSERT_EQ (int(tok->u.float_number), int(-273.15));
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 4, 2, 4,
		       line_offset + 10, 2, 10);
      l.consume ();
    }
    /* Expect token: "1e6" in columns 12-14.
       Note that an exponent without a '.' or negative exponent
       lexes as an integer.  */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
      ASSERT_EQ (tok->u.integer_number, 1000000);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 12, 2, 12,
		       line_offset + 14, 2, 14);
      l.consume ();
    }
  }
  /* Line 3: punctuation, literal names, and a string.  */
  {
    const size_t line_offset = 32;
    /* Expect token: "[". */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_OPEN_SQUARE);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 2, 3, 2,
		       line_offset + 2, 3, 2);
      l.consume ();
    }
    /* Expect token: "]". */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_CLOSE_SQUARE);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 6, 3, 6,
		       line_offset + 6, 3, 6);
      l.consume ();
    }
    /* Expect token: "null". */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_NULL);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 8, 3, 8,
		       line_offset + 11, 3, 11);
      l.consume ();
    }
    /* Expect token: "true". */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_TRUE);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 15, 3, 15,
		       line_offset + 18, 3, 18);
      l.consume ();
    }
    /* Expect token: "false". */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_FALSE);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 21, 3, 21,
		       line_offset + 25, 3, 25);
      l.consume ();
    }
    /* Expect token: "{". */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_OPEN_CURLY);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 28, 3, 28,
		       line_offset + 28, 3, 28);
      l.consume ();
    }
    /* Expect token: "}". */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_CLOSE_CURLY);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 31, 3, 31,
		       line_offset + 31, 3, 31);
      l.consume ();
    }
    /* Expect token: "\"foo\"".  The range covers the quotes.  */
    {
      const token *tok = l.peek ();
      ASSERT_EQ (tok->id, TOK_STRING);
      ASSERT_RANGE_EQ (tok->range,
		       line_offset + 34, 3, 34,
		       line_offset + 38, 3, 38);
      l.consume ();
    }
  }
}
/* Verify that the JSON lexer complains about single-line comments
when comments are disabled. */
static void
test_lexing_unsupported_single_line_comment ()
{
lexer l (false);
const char *str
/* 0 1 2 3 4 . */
/* 01234567890123456789012345678901234567890123456789. */
= (" 1066 // Hello world\n");
auto err = l.add_utf8 (strlen (str), str);
ASSERT_EQ (err, nullptr);
/* Line 1. */
{
const size_t line_offset = 0;
const int line_1 = 1;
/* Expect token: "1066" in columns 4-7. */
{
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
ASSERT_EQ (tok->u.integer_number, 1066);
ASSERT_RANGE_EQ (tok->range,
line_offset + 4, line_1, 4,
line_offset + 7, line_1, 7);
l.consume ();
}
/* Expect error. */
{
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_ERROR);
ASSERT_STREQ (tok->u.string, "unexpected character: '/'");
ASSERT_RANGE_EQ (tok->range,
line_offset + 11, line_1, 11,
line_offset + 11, line_1, 11);
l.consume ();
}
}
}
/* Verify that the JSON lexer complains about multiline comments
when comments are disabled. */
static void
test_lexing_unsupported_multiline_comment ()
{
lexer l (false);
const char *str
/* 0 1 2 3 4 . */
/* 01234567890123456789012345678901234567890123456789. */
= (" 1066 /* Hello world\n"
" continuation of comment\n"
" end of comment */ 42\n");
auto err = l.add_utf8 (strlen (str), str);
ASSERT_EQ (err, nullptr);
/* Line 1. */
{
const size_t line_offset = 0;
const int line_1 = 1;
/* Expect token: "1066" in line 1, columns 4-7. */
{
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
ASSERT_EQ (tok->u.integer_number, 1066);
ASSERT_RANGE_EQ (tok->range,
line_offset + 4, line_1, 4,
line_offset + 7, line_1, 7);
l.consume ();
}
/* Expect error. */
{
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_ERROR);
ASSERT_STREQ (tok->u.string, "unexpected character: '/'");
ASSERT_RANGE_EQ (tok->range,
line_offset + 11, line_1, 11,
line_offset + 11, line_1, 11);
l.consume ();
}
}
}
/* Verify that the JSON lexer handles single-line comments
when comments are enabled. */
static void
test_lexing_supported_single_line_comment ()
{
lexer l (true);
const char *str
/* 0 1 2 3 4 . */
/* 01234567890123456789012345678901234567890123456789. */
= (" 1066 // Hello world\n"
" 42 // etc\n");
auto err = l.add_utf8 (strlen (str), str);
ASSERT_EQ (err, nullptr);
const size_t line_1_offset = 0;
const size_t line_2_offset = 26;
const size_t line_3_offset = line_2_offset + 17;
/* Expect token: "1066" in line 1, columns 4-7. */
{
const int line_1 = 1;
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
ASSERT_EQ (tok->u.integer_number, 1066);
ASSERT_RANGE_EQ (tok->range,
line_1_offset + 4, line_1, 4,
line_1_offset + 7, line_1, 7);
l.consume ();
}
/* Expect token: "42" in line 2, columns 5-6. */
{
const int line_2 = 2;
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
ASSERT_EQ (tok->u.integer_number, 42);
ASSERT_RANGE_EQ (tok->range,
line_2_offset + 5, line_2, 5,
line_2_offset + 6, line_2, 6);
l.consume ();
}
/* Expect EOF. */
{
const int line_3 = 3;
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_EOF);
ASSERT_RANGE_EQ (tok->range,
line_3_offset + 0, line_3, 0,
line_3_offset + 0, line_3, 0);
l.consume ();
}
}
/* Verify that the JSON lexer handles multiline comments
when comments are enabled. */
static void
test_lexing_supported_multiline_comment ()
{
lexer l (true);
const char *str
/* 0 1 2 3 4 . */
/* 01234567890123456789012345678901234567890123456789. */
= (" 1066 /* Hello world\n"
" continuation of comment\n"
" end of comment */ 42\n");
auto err = l.add_utf8 (strlen (str), str);
ASSERT_EQ (err, nullptr);
const size_t line_1_offset = 0;
const size_t line_2_offset = 26;
const size_t line_3_offset = line_2_offset + 25;
const size_t line_4_offset = line_3_offset + 23;
/* Expect token: "1066" in line 1, columns 4-7. */
{
const int line_1 = 1;
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
ASSERT_EQ (tok->u.integer_number, 1066);
ASSERT_RANGE_EQ (tok->range,
line_1_offset + 4, line_1, 4,
line_1_offset + 7, line_1, 7);
l.consume ();
}
/* Expect token: "42" in line 3, columns 20-21. */
{
const int line_3 = 3;
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
ASSERT_EQ (tok->u.integer_number, 42);
ASSERT_RANGE_EQ (tok->range,
line_3_offset + 20, line_3, 20,
line_3_offset + 21, line_3, 21);
l.consume ();
}
/* Expect EOF. */
{
const int line_4 = 4;
const token *tok = l.peek ();
ASSERT_EQ (tok->id, TOK_EOF);
ASSERT_RANGE_EQ (tok->range,
line_4_offset + 0, line_4, 0,
line_4_offset + 0, line_4, 0);
l.consume ();
}
}
/* Helper class for writing JSON parsing testcases.
   Attempts to parse a string in ctor, and captures the result (either
   a json::value or a json::error), and a location map.
   The parse result (value or error) is owned by m_result; the recorded
   ranges are owned by m_loc_map, so both live as long as the testcase.  */
struct parser_testcase
{
public:
  /* Parse UTF8_STRING, optionally ALLOW_COMMENTS, recording
     per-value source ranges into m_loc_map.  */
  parser_testcase (const char *utf8_string, bool allow_comments = false)
  : m_loc_map (),
    m_result (parse_utf8_string (utf8_string, allow_comments, &m_loc_map))
  {
  }
  /* The parsed value, or nullptr if parsing failed.  */
  const json::value *get_value () const { return m_result.m_val.get (); }
  /* The parse error, or nullptr if parsing succeeded.  */
  const json::error *get_error () const { return m_result.m_err.get (); }
  /* The source range recorded for JV during parsing,
     or nullptr if none was recorded.  */
  const location_map::range *
  get_range_for_value (const json::value *jv) const
  {
    return m_loc_map.get_range_for_value (jv);
  }
private:
  /* Concrete implementation of location_map for use in
     JSON parsing selftests.  */
  class test_location_map : public location_map
  {
  public:
    /* Callback from the parser: remember R as the range for JV.  */
    void record_range_for_value (json::value *jv, const range &r) final override
    {
      m_map.put (jv, r);
    }
    range *get_range_for_value (const json::value *jv) const
    {
      /* const_cast since hash_map::get is non-const.  */
      return const_cast<hash_map<const json::value *, range> &> (m_map)
	.get (jv);
    }
  private:
    hash_map<const json::value *, range> m_map;
  };
  /* Declared before m_result so it outlives the parse in the
     ctor's init list.  */
  test_location_map m_loc_map;
  json::parser_result_t m_result;
};
/* Verify that parse_utf8_string works as expected on string values:
   a simple ASCII string, a string containing escaped quotes,
   raw UTF-8 input, and \u-escaped unicode.  */
static void
test_parse_string ()
{
  const int line_1 = 1;
  /* Simple ASCII string; range covers the quotes (columns 0-4).  */
  {
    parser_testcase tc ("\"foo\"");
    ASSERT_EQ (tc.get_error (), nullptr);
    const json::value *jv = tc.get_value ();
    ASSERT_EQ (jv->get_kind (), JSON_STRING);
    ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (), "foo");
    ASSERT_PRINT_EQ (*jv, true, "\"foo\"");
    auto range = tc.get_range_for_value (jv);
    ASSERT_TRUE (range);
    ASSERT_RANGE_EQ (*range,
		     0, line_1, 0,
		     4, line_1, 4);
  }
  /* Escaped quotes should be unescaped in get_string,
     but round-trip on output.  */
  {
    const char *contains_quotes = "\"before \\\"quoted\\\" after\"";
    parser_testcase tc (contains_quotes);
    ASSERT_EQ (tc.get_error (), nullptr);
    const json::value *jv = tc.get_value ();
    ASSERT_EQ (jv->get_kind (), JSON_STRING);
    ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (),
		  "before \"quoted\" after");
    ASSERT_PRINT_EQ (*jv, true, contains_quotes);
    auto range = tc.get_range_for_value (jv);
    ASSERT_TRUE (range);
    ASSERT_RANGE_EQ (*range,
		     0, line_1, 0,
		     24, line_1, 24);
  }
  /* Test of non-ASCII input. This string is the Japanese word "mojibake",
     written as C octal-escaped UTF-8. */
  const char *mojibake = (/* Opening quote. */
			  "\""
			  /* U+6587 CJK UNIFIED IDEOGRAPH-6587
			     UTF-8: 0xE6 0x96 0x87
			     C octal escaped UTF-8: \346\226\207. */
			  "\346\226\207"
			  /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
			     UTF-8: 0xE5 0xAD 0x97
			     C octal escaped UTF-8: \345\255\227. */
			  "\345\255\227"
			  /* U+5316 CJK UNIFIED IDEOGRAPH-5316
			     UTF-8: 0xE5 0x8C 0x96
			     C octal escaped UTF-8: \345\214\226. */
			  "\345\214\226"
			  /* U+3051 HIRAGANA LETTER KE
			     UTF-8: 0xE3 0x81 0x91
			     C octal escaped UTF-8: \343\201\221. */
			  "\343\201\221"
			  /* Closing quote. */
			  "\"");
  {
    parser_testcase tc (mojibake);
    ASSERT_EQ (tc.get_error (), nullptr);
    const json::value *jv = tc.get_value ();
    ASSERT_EQ (jv->get_kind (), JSON_STRING);
    /* Result of get_string should be UTF-8 encoded, without quotes. */
    ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (),
		  "\346\226\207" "\345\255\227" "\345\214\226" "\343\201\221");
    /* Result of dump should be UTF-8 encoded, with quotes. */
    ASSERT_PRINT_EQ (*jv, false, mojibake);
    /* Ranges are in unichars, not bytes: 4 characters plus quotes. */
    auto range = tc.get_range_for_value (jv);
    ASSERT_TRUE (range);
    ASSERT_RANGE_EQ (*range,
		     0, line_1, 0,
		     5, line_1, 5);
  }
  /* Test of \u-escaped unicode. This is "mojibake" again, as above. */
  {
    const char *escaped_unicode = "\"\\u6587\\u5b57\\u5316\\u3051\"";
    parser_testcase tc (escaped_unicode);
    ASSERT_EQ (tc.get_error (), nullptr);
    const json::value *jv = tc.get_value ();
    ASSERT_EQ (jv->get_kind (), JSON_STRING);
    /* Result of get_string should be UTF-8 encoded, without quotes. */
    ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (),
		  "\346\226\207" "\345\255\227" "\345\214\226" "\343\201\221");
    /* Result of dump should be UTF-8 encoded, with quotes. */
    ASSERT_PRINT_EQ (*jv, false, mojibake);
    /* Here the input range is in terms of the escape sequences. */
    auto range = tc.get_range_for_value (jv);
    ASSERT_TRUE (range);
    ASSERT_RANGE_EQ (*range,
		     0, line_1, 0,
		     25, line_1, 25);
  }
}
/* Verify that we can parse various kinds of JSON numbers. */
static void
test_parse_number ()
{
const int line_1 = 1;
{
parser_testcase tc ("42");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
ASSERT_EQ (as_a <const json::integer_number *> (jv)->get (), 42.0);
ASSERT_PRINT_EQ (*jv, true, "42");
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
1, line_1, 1);
}
/* Negative number. */
{
parser_testcase tc ("-17");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
ASSERT_EQ (as_a<const json::integer_number *> (jv)->get (), -17.0);
ASSERT_PRINT_EQ (*jv, true, "-17");
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
2, line_1, 2);
}
/* Decimal. */
{
parser_testcase tc ("3.141");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_EQ (JSON_FLOAT, jv->get_kind ());
ASSERT_NEAR (3.141, ((const json::float_number *)jv)->get (), 0.001);
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
4, line_1, 4);
}
/* Exponents. */
{
{
parser_testcase tc ("3.141e+0");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_EQ (jv->get_kind (), JSON_FLOAT);
ASSERT_NEAR (as_a <const json::float_number *> (jv)->get (), 3.141, 0.1);
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
7, line_1, 7);
}
{
parser_testcase tc ("42e2");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
ASSERT_EQ (as_a <const json::integer_number *> (jv)->get (), 4200);
ASSERT_PRINT_EQ (*jv, true, "4200");
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
3, line_1, 3);
}
{
parser_testcase tc ("42e-1");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_EQ (jv->get_kind (), JSON_FLOAT);
ASSERT_NEAR (as_a <const json::float_number *> (jv)->get (), 4.2, 0.1);
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
4, line_1, 4);
}
}
}
/* Verify that JSON array parsing works. */
static void
test_parse_array ()
{
const int line_1 = 1;
parser_testcase tc ("[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_EQ (jv->get_kind (), JSON_ARRAY);
const json::array *arr = as_a <const json::array *> (jv);
ASSERT_EQ (arr->length (), 10);
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
29, line_1, 29);
for (int i = 0; i < 10; i++)
{
json::value *element = arr->get (i);
ASSERT_EQ (element->get_kind (), JSON_INTEGER);
ASSERT_EQ (as_a <json::integer_number *> (element)->get (), i);
range = tc.get_range_for_value (element);
ASSERT_TRUE (range);
const int offset = 1 + (i * 3);
ASSERT_RANGE_EQ (*range,
offset, line_1, offset,
offset, line_1, offset);
}
ASSERT_PRINT_EQ (*jv, false, "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]");
}
/* Verify that JSON object parsing works: an object with a string
   member and an array member, checking the ranges recorded for the
   object and for each nested value.  */
static void
test_parse_object ()
{
  const int line_1 = 1;
  /* 0 1 2 3 . */
  /* 01 2345 678 9012 345 6789 0123456789012. */
  parser_testcase tc ("{\"foo\": \"bar\", \"baz\": [42, null]}");
  ASSERT_EQ (tc.get_error (), nullptr);
  const json::value *jv = tc.get_value ();
  ASSERT_NE (jv, nullptr);
  ASSERT_EQ (jv->get_kind (), JSON_OBJECT);
  auto range = tc.get_range_for_value (jv);
  ASSERT_TRUE (range);
  ASSERT_RANGE_EQ (*range,
		   0, line_1, 0,
		   32, line_1, 32);
  /* Use as_a<> rather than static_cast, for consistency with the
     rest of this file.  */
  const json::object *jo = as_a <const json::object *> (jv);
  /* "foo": the range covers the value "bar" (including quotes),
     not the key.  */
  json::value *foo_value = jo->get ("foo");
  ASSERT_NE (foo_value, nullptr);
  ASSERT_EQ (foo_value->get_kind (), JSON_STRING);
  ASSERT_STREQ (as_a <json::string *> (foo_value)->get_string (), "bar");
  range = tc.get_range_for_value (foo_value);
  ASSERT_TRUE (range);
  ASSERT_RANGE_EQ (*range,
		   8, line_1, 8,
		   12, line_1, 12);
  /* "baz": an array value; its range covers the brackets.  */
  json::value *baz_value = jo->get ("baz");
  ASSERT_NE (baz_value, nullptr);
  ASSERT_EQ (baz_value->get_kind (), JSON_ARRAY);
  range = tc.get_range_for_value (baz_value);
  ASSERT_TRUE (range);
  ASSERT_RANGE_EQ (*range,
		   22, line_1, 22,
		   31, line_1, 31);
  json::array *baz_array = as_a <json::array *> (baz_value);
  ASSERT_EQ (baz_array->length (), 2);
  json::value *element0 = baz_array->get (0);
  ASSERT_EQ (as_a <json::integer_number *> (element0)->get (), 42);
  range = tc.get_range_for_value (element0);
  ASSERT_TRUE (range);
  ASSERT_RANGE_EQ (*range,
		   23, line_1, 23,
		   24, line_1, 24);
  json::value *element1 = baz_array->get (1);
  ASSERT_EQ (element1->get_kind (), JSON_NULL);
  range = tc.get_range_for_value (element1);
  ASSERT_TRUE (range);
  ASSERT_RANGE_EQ (*range,
		   27, line_1, 27,
		   30, line_1, 30);
}
/* Verify that the JSON literals "true", "false" and "null" are parsed
correctly. */
static void
test_parse_literals ()
{
const int line_1 = 1;
{
parser_testcase tc ("true");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_NE (jv, nullptr);
ASSERT_EQ (jv->get_kind (), JSON_TRUE);
ASSERT_PRINT_EQ (*jv, false, "true");
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
3, line_1, 3);
}
{
parser_testcase tc ("false");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_NE (jv, nullptr);
ASSERT_EQ (jv->get_kind (), JSON_FALSE);
ASSERT_PRINT_EQ (*jv, false, "false");
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
4, line_1, 4);
}
{
parser_testcase tc ("null");
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_NE (jv, nullptr);
ASSERT_EQ (jv->get_kind (), JSON_NULL);
ASSERT_PRINT_EQ (*jv, false, "null");
auto range = tc.get_range_for_value (jv);
ASSERT_TRUE (range);
ASSERT_RANGE_EQ (*range,
0, line_1, 0,
3, line_1, 3);
}
}
/* Verify that we can parse a simple two-line JSON-RPC request,
   and that the recorded range for the top-level object spans
   from line 1 to line 2.  */
static void
test_parse_jsonrpc ()
{
  /* Removed an unused "std::unique_ptr<error> err;" local;
     errors are captured by parser_testcase.  */
  const char *request
    /* 0 1 2 3 4. */
    /* 01 23456789 012 3456 789 0123456 789 012345678 90. */
    = ("{\"jsonrpc\": \"2.0\", \"method\": \"subtract\",\n"
       /* 0 1 2 3 4. */
       /* 0 1234567 8901234567890 1234 56789012345678 90. */
       " \"params\": [42, 23], \"id\": 1}");
  const int line_1 = 1;
  const int line_2 = 2;
  /* Unichar index at which line 2 begins.  */
  const size_t line_2_offset = 41;
  parser_testcase tc (request);
  ASSERT_EQ (tc.get_error (), nullptr);
  const json::value *jv = tc.get_value ();
  ASSERT_NE (jv, nullptr);
  auto range = tc.get_range_for_value (jv);
  ASSERT_TRUE (range);
  ASSERT_RANGE_EQ (*range,
		   0, line_1, 0,
		   line_2_offset + 28, line_2, 28);
}
/* Verify that we can parse an empty JSON object "{}", and that its
   range covers both braces.  */
static void
test_parse_empty_object ()
{
  const int line_1 = 1;
  /* Removed an unused "std::unique_ptr<error> err;" local;
     errors are captured by parser_testcase.  */
  parser_testcase tc ("{}");
  ASSERT_EQ (tc.get_error (), nullptr);
  const json::value *jv = tc.get_value ();
  ASSERT_NE (jv, nullptr);
  ASSERT_EQ (jv->get_kind (), JSON_OBJECT);
  ASSERT_PRINT_EQ (*jv, true, "{}");
  auto range = tc.get_range_for_value (jv);
  ASSERT_TRUE (range);
  ASSERT_RANGE_EQ (*range,
		   0, line_1, 0,
		   1, line_1, 1);
}
/* Verify that comment-parsing can be enabled or disabled. */
static void
test_parsing_comments ()
{
const char *str = ("// foo\n"
"/*...\n"
"...*/ 42 // bar\n"
"/* etc */\n");
/* Parsing with comment support disabled. */
{
parser_testcase tc (str);
ASSERT_NE (tc.get_error (), nullptr);
ASSERT_STREQ (tc.get_error ()->get_msg (),
"invalid JSON token: unexpected character: '/'");
ASSERT_EQ (tc.get_value (), nullptr);
}
/* Parsing with comment support enabled. */
{
parser_testcase tc (str, true);
ASSERT_EQ (tc.get_error (), nullptr);
const json::value *jv = tc.get_value ();
ASSERT_NE (jv, nullptr);
ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
ASSERT_EQ (((const json::integer_number *)jv)->get (), 42);
}
}
/* Verify that we can parse an empty JSON string. */
static void
test_error_empty_string ()
{
const int line_1 = 1;
parser_testcase tc ("");
ASSERT_ERR_EQ (tc.get_error (),
0, line_1, 0,
0, line_1, 0,
"expected a JSON value but got EOF");
ASSERT_EQ (tc.get_value (), nullptr);
}
/* Verify that JSON parsing gracefully handles an invalid token. */
static void
test_error_bad_token ()
{
const int line_1 = 1;
parser_testcase tc (" not valid ");
ASSERT_ERR_EQ (tc.get_error (),
2, line_1, 2,
2, line_1, 2,
"invalid JSON token: unexpected character: 'n'");
ASSERT_EQ (tc.get_value (), nullptr);
}
/* Verify that JSON parsing gracefully handles a missing comma
within an object. */
static void
test_error_object_with_missing_comma ()
{
const int line_1 = 1;
/* 0 1 2. */
/* 01 2345 6789012 3456 7890. */
const char *json = "{\"foo\" : 42 \"bar\"";
parser_testcase tc (json);
ASSERT_ERR_EQ (tc.get_error (),
12, line_1, 12,
16, line_1, 16,
"expected ',' or '}'; got string");
ASSERT_EQ (tc.get_value (), nullptr);
}
/* Verify that JSON parsing gracefully handles a missing comma
within an array. */
static void
test_error_array_with_missing_comma ()
{
const int line_1 = 1;
/* 01234567. */
const char *json = "[0, 1 42]";
parser_testcase tc (json);
ASSERT_ERR_EQ (tc.get_error (),
6, line_1, 6,
7, line_1, 7,
"expected ',' or ']'; got number");
ASSERT_EQ (tc.get_value (), nullptr);
}
/* Run all of the selftests within this file.
   Lexer tests first, then parser tests, then error-handling tests.  */
void
json_parser_cc_tests ()
{
  /* Lexing.  */
  test_lexer ();
  test_lexing_unsupported_single_line_comment ();
  test_lexing_unsupported_multiline_comment ();
  test_lexing_supported_single_line_comment ();
  test_lexing_supported_multiline_comment ();
  /* Parsing of valid inputs.  */
  test_parse_string ();
  test_parse_number ();
  test_parse_array ();
  test_parse_object ();
  test_parse_literals ();
  test_parse_jsonrpc ();
  test_parse_empty_object ();
  test_parsing_comments ();
  /* Error handling.  */
  test_error_empty_string ();
  test_error_bad_token ();
  test_error_object_with_missing_comma ();
  test_error_array_with_missing_comma ();
}
} // namespace selftest
#endif /* #if CHECKING_P */