gcc/cobol/scan_ante.h - gcc.git - Git at Google

 /*
  * Copyright (c) 2021-2025 Symas Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of the Symas Corporation nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 /*
  * Flex override
  */
 static void /* yynoreturn */ yy_fatal_error ( const char* msg  );

 static void inline
 die_fatal_error( const char msg[] ) {
   cbl_internal_error("scan.o: %s",  msg);
   yy_fatal_error(msg);
 }

 #define YY_FATAL_ERROR(msg) die_fatal_error((msg))

 /*
  * External functions
  */

 void parser_enter_file(const char *filename);
 void parser_leave_file();

 bool is_fixed_format();
 bool include_debug();
 int lexer_input( char buf[], int max_size, FILE *input );

 const char * keyword_str( int token );

 int repository_function_tok( const char name[] );

 void cobol_set_indicator_column( int column );

 void next_sentence_label(cbl_label_t*);

 int repeat_count( const char picture[] );

 size_t program_level();

 int ydfparse(void);

 FILE * copy_mode_start();

 /*
  * Public functions and data
  */

 cbl_label_t *next_sentence;

 static bool echo_on = false;

 void
 lexer_echo( bool tf ) {
   echo_on = tf;
 }

 bool
 lexer_echo() {
   return echo_on;
 }

 // IBM says a picture can be up to 50 bytes, not 1000 words.
 // ISO says a picture can be up to 63 bytes.  We allow for a NUL terminator.
 static char orig_picture[PICTURE_MAX];
 static char orig_number[80];

 const char *
 original_picture() {
   const char *out = xstrdup(orig_picture);
   assert(orig_picture[0] != '\0');
   return out;
 }

 char *
 original_number( char input[] = NULL ) {
   if( input ) {
     if(sizeof(orig_number) < strlen(input) ) return NULL;
     strcpy(orig_number, input);
     return input;
   }
   char *out = xstrdup(orig_number);
   assert(orig_number[0] != '\0');
   return out;
 }

 /*
  * Local functions
  */
 static const char * start_condition_str( int sc );
 static const char * start_condition_is();

 static bool nonspace( char ch ) { return !ISSPACE(ch); }

 static int
 numstr_of( const char string[], radix_t radix = decimal_e ) {
   yylval.numstr.radix = radix;
   ydflval.string = yylval.numstr.string = xstrdup(string);
   char *comma = strchr(yylval.numstr.string, ',');
   if( comma && comma[1] == '\0' ) *comma = '\0';
   if( ! original_number(yylval.numstr.string) ) {
     error_msg(yylloc, "input inconceivably long");
     return NO_CONDITION;
   }

   const char *input = yylval.numstr.string;
   auto eoinput = input + strlen(input);
   auto p = std::find_if( input, eoinput,
                          []( char ch ) { return ch == 'e' || ch == 'E';} );

   if( p < eoinput ) {
     if( eoinput == std::find(input, eoinput, symbol_decimal_point()) ) {
       // no decimal point: 1E0 is a valid user-defined name
       ydflval.string = yylval.string = yylval.numstr.string;
       return NAME;
     }
     assert(input < p);
     // "The literal to the left of the 'E' represents the significand. It may
     //  be signed and shall include a decimal point. The significand shall be
     //  from 1 to 36 digits in length."
     if( p == std::find(input, p, symbol_decimal_point()) ) {
       return NO_CONDITION;
     }
     auto nx = std::count_if(input, p, fisdigit);
     if( 36 < nx ) {
       error_msg(yylloc, "significand of %s has more than 36 digits (%td)", input, nx);
       return NO_CONDITION;
     }

     // "The literal to the right of the 'E' represents the exponent. It may be
     //  signed and shall have a maximum of four digits and no decimal point. "
     // "The maximum permitted value and minimum permitted value of the
     //  exponent is implementor-defined." (We allow 9999.)
     nx = std::count_if(p, eoinput, fisdigit);
     if( 4 < nx ) {
       error_msg(yylloc, "exponent %qs more than 4 digits", ++p);
       return NO_CONDITION;
     }
     if( eoinput != std::find(p, eoinput, symbol_decimal_point()) ) {
       error_msg(yylloc, "exponent %qs includes decimal point", ++p);
       return NO_CONDITION;
     }

     // "If all the digits in the significand are zero, then all the digits of
     //  the exponent shall also be zero and neither significand nor exponent
     //  shall have a negative sign."
     bool zero_signficand = std::all_of( input, p,
                                         []( char ch ) {
                                           return !ISDIGIT(ch) || ch == '0'; } );
     if( zero_signficand ) {
       if( p != std::find(input, p, '-') ) {
         error_msg(yylloc, "zero significand of %s "
                  "cannot be negative", input);
         return NO_CONDITION;
       }
       if( eoinput != std::find(p, eoinput, '-') ) {
         error_msg(yylloc, "exponent of zero significand of %s "
                  "cannot be negative", input);
         return NO_CONDITION;
       }
     }
   }
   if( 1 < std::count(input, eoinput, symbol_decimal_point()) ) {
     error_msg(yylloc, "invalid numeric literal %qs", ++p);
     return NO_CONDITION;
   }

   return NUMSTR;
 }

 static char *
 null_trim( char name[] ) {
   auto p = std::find_if( name, name + strlen(name), fisspace );
   if( p < name + strlen(name) ) *p = '\0';
   return name;
 }

 /*
  * CDF management
  */
 static int final_token;

 static inline const char *
 boolalpha( bool tf ) { return tf? "True" : "False"; }

 struct cdf_status_t {
   int lineno;
   const char *filename;
   int  token;
   bool parsing;
   cdf_status_t()
     : lineno(yylineno), filename(cobol_filename())
     , token(0), parsing(true)
   {}
   cdf_status_t( int token, bool parsing )
     : lineno(yylineno), filename(cobol_filename())
     , token(token), parsing(parsing)
   {}
   bool toggle() { return parsing = ! parsing; }

   const char * str() const {
     static char line[132];
     snprintf(line, sizeof(line), "%s:%d: %s, parsing %s",
              filename, lineno, keyword_str(token), boolalpha(parsing));
     return line;
   }
   static const char * as_string( const cdf_status_t& status ) {
     return status.str();
   }
 };

 /*
  * Scanning status is true if tokens are being parsed and false if not (because
  * CDF is skipping some code).  Because CDF status is nested, status is true
  * only if the whole stack is true. That is, if B is stacked on A, and A is
  * false, then all of B is skipped, regardless of >>IF and >>ELSE for B.
  */
 static bool run_cdf( int token );

 static class parsing_status_t : public std::stack<cdf_status_t> {
   typedef int (parser_t)(void);
   struct parsing_state_t {
     bool at_eof, expect_field_level;
     int pending_token;
     parser_t *parser;
     parsing_state_t()
       : at_eof(false)
       , expect_field_level(true)
       , pending_token(0)
       , parser(yyparse)
     {}
   } state, shadow;

  public:
   bool on() const { // true only if all true
     bool parsing = std::all_of( c.begin(), c.end(),
                                []( const auto& status ) { return status.parsing; } );
     return parsing;
   }

   bool feed_a_parser() const {
     return on() || state.parser == ydfparse;
   }

   void need_level( bool tf ) {     state.expect_field_level = tf; }
   bool need_level() const { return state.expect_field_level; }

   void parser_save( parser_t * new_parser ) {
     shadow = state;
     state.parser = new_parser;
   }
   void parser_restore() {
     state.parser = shadow.parser;
   }

   void inject_token( int token ) { state.pending_token = token; }
   int pending_token() {
     int token = state.pending_token;
     state.pending_token = 0;
     return token;
   }

   void at_eof( bool tf ) { state.at_eof = shadow.at_eof = tf; assert(tf); }
   bool at_eof() const { return state.at_eof; }

   bool in_cdf() const { return state.parser == ydfparse; }
   bool normal() const { return on() && state.parser == yyparse; }

   void splat() const {
     int i=0;
     for( const auto& status : c ) {
       yywarn( "%d %s", ++i, status.str() );
     }
   }
 } parsing;

 // Used only by parser, so scanner_normal() obviously true.
 void field_done() { orig_picture[0] = '\0'; parsing.need_level(true); }

 static int scanner_token() {
   if( parsing.empty() ) {
     error_msg(yylloc, "%<>>ELSE%> or %<>>END-IF%> without %<>>IF%>");
     return NO_CONDITION;
   }
   return parsing.top().token;
 }

 bool scanner_parsing() { return parsing.on(); }
 bool scanner_normal()  { return parsing.normal(); }

 void scanner_parsing( int token, bool tf ) {
   parsing.push( cdf_status_t(token, tf) );
   if( yydebug ) {
     yywarn("%s: parsing now %s, depth %zu",
             keyword_str(token), boolalpha(parsing.on()), parsing.size());
     parsing.splat();
   }
 }
 void scanner_parsing_toggle() {
   if( parsing.empty() ) {
     error_msg(yylloc, "%<>>ELSE%> without %<>>IF%>");
     return;
   }
   parsing.top().toggle();
   if( yydebug ) {
     yywarn("%s: parsing now %s",
             keyword_str(CDF_ELSE), boolalpha(parsing.on()));
   }
 }
 void scanner_parsing_pop() {
   if( parsing.empty() ) {
     error_msg(yylloc, "%<>>END-IF%> without %<>>IF%>");
     return;
   }
   parsing.pop();
   if( yydebug ) {
     yywarn("%s: parsing now %s, depth %zu",
             keyword_str(CDF_END_IF), boolalpha(parsing.on()),
 	   parsing.size());
     parsing.splat();
   }
 }


 static bool level_needed() {
   return scanner_normal() && parsing.need_level();
 }

 static void level_found() {
   if( scanner_normal() ) parsing.need_level(false);
 }

 /*
  * Trim the scanned location by the amount about to re-scanned.
  * Must be a macro because it expands yyless.
  */
 #define myless(N)				\
   do {						\
     auto n(N);					\
     trim_location(n);				\
     yyless(n);					\
   } while(0)

 class enter_leave_t {
   typedef void( parser_enter_file_f)(const char *filename);
   typedef void (parser_leave_file_f)();
   parser_enter_file_f *entering;
   parser_leave_file_f *leaving;
   const char *filename;

  public:
   enter_leave_t() : entering(NULL), leaving(NULL), filename(NULL) {}
   enter_leave_t(  parser_enter_file_f *entering, const char *filename )
     : entering(entering), leaving(NULL), filename(filename)
   {}
   explicit enter_leave_t(parser_leave_file_f *leaving)
     : entering(NULL), leaving(leaving), filename(NULL) {}

   void notify() {
     if( entering ) {
       cobol_filename(filename, 0);
       if( yy_flex_debug ) dbgmsg("starting line %4d of %s",
                                  yylineno, filename);
       entering(filename);
       gcc_assert(leaving == NULL);
     }
     if( leaving ) {
       cobol_filename_restore();
       if( yy_flex_debug ) dbgmsg("resuming line %4d of %s",
                                  yylineno, cobol_filename());
       leaving();
       gcc_assert(entering == NULL);
     }
   }
 };

 static class input_file_status_t {
   std::queue <enter_leave_t> inputs;
  public:
   void enter(const char *filename) {
     inputs.push( enter_leave_t(parser_enter_file, filename) );
   }
   void leave() {
     inputs.push( enter_leave_t(parser_leave_file) );
   }
   void notify() {
     while( ! inputs.empty() ) {
       auto enter_leave = inputs.front();
       enter_leave.notify();
       inputs.pop();
     }
   }
 } input_file_status;

 void input_file_status_notify() { input_file_status.notify(); }

 /*
  * parse.y and cdf.y each define a 4-integer struct to hold a token's location.
  * parse.y uses   YYLTYPE  yylloc;
  * cdf.y   uses YDFLLTYPE ydflloc;
  *
  * The structs have identical definitions with different types and of course
  * names.  We define "conversion" between them for convenience.
  *
  * Each parser expects its location value to be updated whenever it calls
  * yylex().  Therefore, here in the lexer we set both locations as each token
  * is scanned, so that both parsers see the same location.
  */
 static YDFLTYPE
 ydfltype_of( const YYLTYPE& loc ) {
   YDFLTYPE output {
     loc.first_line,   loc.first_column,
     loc.last_line,    loc.last_column };
   return output;
 }

 /*
  * After the input filename and yylineno are set, update the location of the
  * scanned token.
  */
 static void
 update_location( const YYLTYPE *ploc = nullptr ) {
   YYLTYPE loc = {
     yylloc.last_line, yylloc.last_column,
     yylineno,         yylloc.last_column + yyleng
   };
   if( ploc ) loc = *ploc;

   const char *p = static_cast<char*>(memrchr(yytext, '\n', yyleng));
   if( p ) {
     loc.last_column = (yytext + yyleng) - p;
   }

   yylloc = loc;
   ydflloc = ydfltype_of(yylloc);

   dbgmsg("  SC: %s location (%d,%d) to (%d,%d)",
          start_condition_is(),
          yylloc.first_line, yylloc.first_column,
          yylloc.last_line,  yylloc.last_column);
 }

 static void
 reset_location() {
   static const YYLTYPE loc { yylineno, 1, yylineno, 1 };
   update_location(&loc);
 }

 #define YY_USER_ACTION update_location();

 static void
 trim_location( int nkeep) {
   gcc_assert( 0 <= nkeep && nkeep <= yyleng );
   struct { char *p, *pend;
     size_t size() const { return pend - p; }
   } rescan = { yytext + nkeep, yytext + yyleng };

   auto nline = std::count(rescan.p, rescan.pend, '\n');
   dbgmsg("%s:%d: yyless(%d), rescan '%.*s' (" HOST_SIZE_T_PRINT_UNSIGNED
          " lines, " HOST_SIZE_T_PRINT_UNSIGNED " bytes)",
          __func__, __LINE__,
          nkeep,
          int(rescan.size()), rescan.p,
          (fmt_size_t)nline, (fmt_size_t)rescan.size());
   if( nline ) {
     gcc_assert( yylloc.first_line + nline <= yylloc.last_line );
     yylloc.last_line -= int(nline);
     gcc_assert( yylloc.first_line <= yylloc.last_line );
     char *p = static_cast<char*>(memrchr(rescan.p, '\n', rescan.size()));
     yylloc.last_column = rescan.pend - ++p;
     return;
   }

   gcc_assert( int(rescan.size()) < yylloc.last_column );
   yylloc.last_column -= rescan.size();
   if( yylloc.last_column < yylloc.first_column ) {
     yylloc.first_column = 1;
   }

   location_dump(__func__, __LINE__, "yylloc", yylloc);
 }

 static void
 update_location_col( const char str[], int correction = 0) {
   auto col = yylloc.last_column - strlen(str) + correction;
   if( col > 0 ) {
     yylloc.first_column = col;
   }
   location_dump(__func__, __LINE__, "yylloc", yylloc);
 }

 #define not_implemented(...) cbl_unimplemented_at(yylloc, __VA_ARGS__)

 #define YY_USER_INIT do {			\
     static YYLTYPE ones = {1,1, 1,1};		\
     yylloc = ones;                              \
     ydflloc = ydfltype_of(yylloc);              \
   } while(0)

 /*
  * YY_DECL is the generated lexer.  The parser calls yylex().  yylex() invokes
  * next_token(), which calls this lexer function.  The Flex-generated code
  * updates neither yylval nor yylloc.  That job is left to the actions.
  *
  * The parser relies on yylex to set yylval and yylloc each time it is
  * called. It maintains a separate copy for each term, and uses
  * YYLLOC_DEFAULT() to update the location of nonterminals.
  */
 #define YY_DECL int lexer(void)

 # define YY_INPUT(buf, result, max_size)                        \
 {                                                               \
   if( 0 == (result = lexer_input(buf, max_size, yyin)) )        \
     result = YY_NULL;                                           \
 }

 #define bcomputable(T, C)                               \
     yylval.computational.type=T,                        \
     yylval.computational.capacity=C,                    \
     yylval.computational.signable=true, BINARY_INTEGER
 #define scomputable(T, C)                               \
     yylval.computational.type=T,                        \
     yylval.computational.capacity=C,                    \
     yylval.computational.signable=true, COMPUTATIONAL
 #define ucomputable(T, C)                               \
     yylval.computational.type=T,                        \
     yylval.computational.capacity=C,                    \
     yylval.computational.signable=false, COMPUTATIONAL

 static char *tmpstring = NULL;

 #define PROGRAM current_program_index()

 // map of alias => canonical
 static std::map <std::string, std::string> keyword_aliases;

 const std::string&
 keyword_alias_add( const std::string& keyword, const std::string& alias ) {
   auto p = keyword_aliases.find(alias);
   if( p != keyword_aliases.end() ) return p->second; // error: do not overwrite
   return keyword_aliases[alias] = keyword;
 }

 /*
  * Because numeric USAGE types don't have distinct tokens and may have aliases,
  * we keep a table of their canonical names, which we use if we encounter an
  * alias.
  */
 struct bint_t {
   int token;
   cbl_field_type_t type;
   uint32_t capacity;
   bool signable;
 };
 static const std::map <std::string, bint_t > binary_integers {
   { "COMP-X", { COMPUTATIONAL, FldNumericBin5, 0xFF, false } },
   { "COMP-6", { COMPUTATIONAL, FldPacked, 0, false } },
   { "COMP-5", { COMPUTATIONAL, FldNumericBin5, 0, false } },
   { "COMP-4", { COMPUTATIONAL, FldNumericBinary, 0, true } },
   { "COMP-2", { COMPUTATIONAL, FldFloat, 8, false } },
   { "COMP-1", { COMPUTATIONAL, FldFloat, 4, false } },
   { "COMP", { COMPUTATIONAL, FldNumericBinary, 0, false } },
   { "COMPUTATIONAL-X", { COMPUTATIONAL, FldNumericBin5, 0xFF, false } },
   { "COMPUTATIONAL-6", { COMPUTATIONAL, FldPacked, 0, false } },
   { "COMPUTATIONAL-5", { COMPUTATIONAL, FldNumericBin5, 0, false } },
   { "COMPUTATIONAL-4", { COMPUTATIONAL, FldNumericBinary, 0, true } },
   { "COMPUTATIONAL-2", { COMPUTATIONAL, FldFloat, 8, false } },
   { "COMPUTATIONAL-1", { COMPUTATIONAL, FldFloat, 4, false } },
   { "COMPUTATIONAL", { COMPUTATIONAL, FldNumericBinary, 0, false } },
   { "BINARY", { BINARY_INTEGER, FldNumericBinary, 0, true } },
   { "BINARY-CHAR", { BINARY_INTEGER, FldNumericBin5, 1, true } },
   { "BINARY-SHORT", { BINARY_INTEGER, FldNumericBin5, 2, true } },
   { "BINARY-LONG", { BINARY_INTEGER, FldNumericBin5, 4, true } },
   { "BINARY-DOUBLE", { BINARY_INTEGER, FldNumericBin5, 8, true } },
   { "BINARY-LONG-LONG", { BINARY_INTEGER, FldNumericBin5, 8, true } },
   { "FLOAT-BINARY-32", { COMPUTATIONAL, FldFloat, 4, false } },
   { "FLOAT-BINARY-64", { COMPUTATIONAL, FldFloat, 8, false } },
   { "FLOAT-BINARY-128", { COMPUTATIONAL, FldFloat, 16, false } },
   { "FLOAT-EXTENDED", { COMPUTATIONAL, FldFloat, 16, false } },
   { "FLOAT-LONG", { COMPUTATIONAL, FldFloat, 8, false } },
   { "FLOAT-SHORT", { COMPUTATIONAL, FldFloat, 4, false } },
 };

 static int
 binary_integer_usage( const char name[]) {
   // uname can't be cbl_name_t, because at this point name[] might have more
   // than sizeof(cbl_name_t) characters.  The length check comes later.
   char *uname = xstrdup(name);
   std::transform(name, name + strlen(name), uname, ftoupper);

   dbgmsg("%s:%d: checking %s in %zu keyword_aliases",
 	 __func__, __LINE__, uname,
 	 keyword_aliases.size() );

   std::string key = uname;
   auto alias = keyword_aliases.find(key);
   if( alias != keyword_aliases.end() ) key = alias->second;

   auto p = binary_integers.find(key);
   if( p == binary_integers.end() ) return 0;

   yylval.computational.type = p->second.type;
   yylval.computational.capacity = p->second.capacity;
   yylval.computational.signable = p->second.signable;
   dbgmsg("%s:%d: %s has type %d", __func__, __LINE__,
 	 uname, p->second.type );
   free(uname);
   return p->second.token;
 }

 static void
 verify_ws( const YYLTYPE& loc, const char input[], char ch ) {
   if( ! fisspace(ch) ) {
     if( ! (dialect_mf() || dialect_gnu()) ) {
       dialect_error(loc, "separator space required in %qs", input);
     }
   }
 }
 #define verify_ws(C) verify_ws(yylloc, yytext, C)

 int
 binary_integer_usage_of( const char name[] ) {
   cbl_name_t uname = {};
   std::transform(name, name + strlen(name), uname, ftoupper);

   auto p = binary_integers.find(uname);
   if( p != binary_integers.end() ) {
     int token = p->second.token;
     switch( token ) {
     case COMPUTATIONAL:
     case BINARY_INTEGER:
       return token;
     default:
       gcc_unreachable();
       assert(false);
     }
   }
   return 0;
 }

 static uint32_t
 level_of( const char input[] ) {
   unsigned int output = 0;

   if( input[0] == '0' ) input++;

   if( 1 != sscanf(input, "%u", &output) ) {
     yywarn( "%s:%d: invalid level '%s'", __func__, __LINE__, input );
   }

   return output;
 }

 static inline int
 ndigit(int len) {
   char *input = TOUPPER(yytext[0]) == 'V'? yytext + 1 : yytext;
   int n = repeat_count(input);
   return n == -1? len : n;
 }

 static int
 picset( int token ) {
   static const char * const eop = orig_picture + sizeof(orig_picture);
   char *p = orig_picture + strlen(orig_picture);

   if( eop < p + yyleng ) {
     error_msg(yylloc, "PICTURE exceeds maximum size of %zu bytes",
              sizeof(orig_picture) - 1);
   }
   snprintf( p, eop - p, "%s", yytext );
   return token;
 }

 /**
 ## Script and data to produce picture_t::followers.
 ## Based on ISO Table 10.
 #! /usr/bin/awk -f

 BEGIN  {
   str = "B0/ , . + +- +- CR/DB cs cs Z* Z* + + cs cs 9 AX S V P P 1 N E"
   split(str, cols)
 }

 $1 ~ /CR|DB|cs/ { next }

 0 && !nlines++ {
   for( i=0; i < length(cols); i++ ) {
     print i, cols[i], "'" $i "'"
   }
 }

 $field == "x" {
   if( ! nout++ ) {
     printf "%2d: %5s: \"", field, cols[field - 1]
   }

   gsub(/^ +| +$/, "", $1)
   printf "%s", $1
 }

 END {
   if( ! nout++ ) {
     printf "%2d: %5s: \"", field, cols[field - 1]
   }
   print "\""
 }

 B  x x x - x - - x - x x x x x x x x - x - x - x
 0  x x x - x - - x - x x x x x x x x - x - x - x
 /  x x x - x - - x - x x x x x x x x - x - x - x
 ,  x x x - x - - x - x x x x x x x - - x - x
 .  x x - - x - - x - x - x - x - x
 +  - - - - - - - - - - - - - - - - - - - - - - - x
 +
 –
 +  x x x - - - - x x x x - - x x x - - x x x
 CR x x x - - - - x x x x - - x x x - - x x x
 DB x x x - - - - x x x x - - x x x - - x x x
 cs - - - - x
 cs x x x - x - - - - x x - - - - x - - x x x

 Z  x x - - x - - x - x
 *  x x - - x - - x - x
 Z  x x x - x - - x - x x - - - - - - - x - x
 *  x x x - x - - x - x x - - - - - - - x - x
 +  x x - - - - - x - - - x
 –  x x - - - - - x - - - x
 +  x x x - - - - x - - - x x - - - - - x
 –  x x x - - - - x - - - x x - - - - - x
 cs x x - - x - - - - - - - - x
 cs x x x - x - - - - - - - - x x - - - x

 9  x x x x x - - x - x - x - x - x x x x - x - - x
 A  x - - - - - - - - - - - - - - x x
 X  x - - - - - - - - - - - - - - x x
 S
 V  x x - - x - - x - x - x - x - x - x - x
 P  x x - - x - - x - x - x - x - x - x - x
 P  - - - - x - - x - - - - - - - - - x x - x
 1  - - - - - - - - - - - - - - - - - - - - - x
 N  x - - - - - - - - - - - - - - - - - - - - - x
 E  x x x - x - - - - - - - - - - x
 **/

 class picture_t {
   static const char dot = '.', comma = ',';

   typedef std::vector<std::string> followings_t;
   static const std::map <char, followings_t> followers;

   const char * const begin;
   const char *p, *pend;
   size_t pos;
   struct exclusions_t { // Nonzero if set, > 1 is false.
     // crdb means CR/DB or +/-.
     // pluses means 2 or more consecutive '+'.
     // minuses means 2 or more consecutive '-'.
     // "21) The symbol 'Z' and the symbol '*' are mutually exclusive "
     // stars means '*' or Z.
     unsigned short int crdb, currency, dot, pluses, minuses, stars, zzz;
     exclusions_t()
       : crdb(0), currency(0), dot(0), pluses(0), minuses(0), stars(0)
     {}
   } exclusions;
   YYLTYPE loc;

   bool is_crdb() const { // input must be uppercase for CR/DB
     if( p[0] == 'C' || p[0] == 'D' ) {
       char input[3] = { p[0], p[1] };
       return ( 0 == strcmp(input, "CR") || 0 == strcmp(input, "DB") );
     }
     return false;
   }

   const char * match_paren( const char *paren ) const {
     gcc_assert(paren[0] == '(');  // start with opening paren
     paren = std::find_if( paren, pend,
                           []( char ch ) {
                             return ch == '(' || ch == ')';
                           } );
     if( *paren == '(' ) return nullptr; // no nesting
     if( paren == pend ) return nullptr;
     return ++paren;
   }

   const char * next_not( char ch ) const {
     return std::find_if( p, pend,
                          [ch = TOUPPER(ch)]( char next ) {
                            return ch != next;
                          } );
   }

   const char * valid_next( const char *p, const std::string& valid ) const {
     if( p == pend || p + 1 == pend ) return pend;
     if( p[1] == '(' ) {
       return match_paren(++p);
     }
     auto pv = std::find(valid.begin(), valid.end(), TOUPPER(p[1]));
     return pv != valid.end()? ++p : nullptr;
   }
   const char * valid_next( const char *p,
                            bool first = true, char ch = '\0' ) const {
     if( p == pend || p + 1 == pend ) return pend;
     if( p[0] == '(' ) {
       if( (p = match_paren(p)) == nullptr ) return nullptr;
     }
     if( p[0] == '(' ) return nullptr;  // consecutive parentheses

     int index = first? 0 : 1;
     if( !ch ) ch = *p;   // use current character unless overridden
     auto valid = followers.find(TOUPPER(ch));
     if( valid == followers.end() ) {
       YYLTYPE loc(yylloc);
       loc.first_column += int(p - begin);
       error_msg( loc, "PICTURE: strange character %qc, giving up", ch );
       return nullptr;
     }
     return valid_next(p, valid->second[index]);
   }

   const char * start() { // start modifies exclusions, but not p
     auto pnext = p;

     switch(TOUPPER(p[0])) {
     case comma: case dot:
       // use decimal_is_comma()
       //  4:     .: "B0/,+Z*+-9E"
       exclusions.dot++;
       pnext = valid_next(p, "B0/,+Z*+-9E");
       break;
     case '+': case '-':
       //  6:    +-: "B0/,.Z*Z*9VPPE"
       exclusions.crdb++;
       pnext = next_not(p[0]);
       if( p + 1 < pnext ) {
         exclusions.pluses++;
       }
       pnext = valid_next(--pnext, "B0/,.Z*Z*9VPPE");
       break;
     case 'Z': case '*':
       exclusions.stars++;
       pnext = next_not(p[0]);
       break;
     case 'S':
       // 19:     S: "9VP"
       pnext = valid_next(p, "9VP");
       break;
     }

     /*
      * "For fixed editing sign control, the currency symbol, when used, shall
      * be either the leftmost symbol in character-string-1, optionally preceded
      * by one of the symbols '+' or '-' "
      */
     if( pnext ) {
       if( p == pnext || p[0] == '+' || p[0] == '-' ) {
         if( symbol_currency(*pnext) ) {
           exclusions.currency++;
           pnext = next_not(*pnext);
           pnext = valid_next(--pnext, true, '$');
         }
       }
     }

     return pnext;
   }

   const char * next() { // modify state; do not modify position
     auto pnext = p;
     auto loc(picture_t::loc);
     loc.first_column += int(p - begin);

     if( is_crdb() ) {
       if( exclusions.crdb++ ) {
         error_msg( loc, "PICTURE: CR/DB and %c/%c may appear only once", '+', '-' );
         return nullptr;
       }
       if( p + 2 != pend ) {
         error_msg( loc, "PICTURE: CR/DB must appear at the end" );
         return nullptr;
       }
       return pend;
     }

     if( symbol_currency(p[0]) ) {
       if( false && exclusions.currency++ ) { // not enforced
         error_msg( loc, "PICTURE: CURRENCY SYMBOL sequence may appear at most once" );
         return nullptr;
       }
       return valid_next(p, ! exclusions.dot, '$');
     }

     switch(TOUPPER(p[0])) {
     case '(':
       return match_paren(p);
       break;
     case 'B': case '0': case '/':
       pnext = valid_next(p);
       break;
     case comma:
       if( decimal_is_comma() ) {
         if( exclusions.dot++ ) {
           error_msg( loc, "PICTURE: %qc: may appear at most once", p[0] );
           return nullptr;
         }
         pnext = valid_next(p, true, dot);
       } else {
         pnext = valid_next(p);
       }
       break;
     case dot:
       if( p + 1 == pend ) {
         pnext = pend;
       } else {
         if( decimal_is_comma() ) {
           pnext = valid_next(p, true, comma );
         } else {
           if( exclusions.dot++ ) {
             error_msg( loc, "PICTURE: %qc: may appear at most once", p[0] );
             return nullptr;
           }
           pnext = valid_next(p);
         }
       }
       break;

     case '+': case '-':
       // 7 is trailing sign; 13 & 14 are numeric.  Leading sign handled by start().
       if( p + 1 == pend ) {
         if( exclusions.crdb++ ) {
           error_msg( loc, "PICTURE: %c/%c may appear at most once as a sign", '+', '-' );
           return nullptr;
         }
         pnext = pend;
       } else {
         pnext = next_not(p[0]);
         if( p + 1 < pnext ) {
           if( false && exclusions.pluses++ )  { // not enforced
             error_msg( loc, "PICTURE: %qc: sequence may appear at most once", p[0] );
             return nullptr;
           }
         }
         pnext = valid_next(pnext, ! exclusions.dot);
       }
       break;

     case 'Z': case '*':
       if( false && exclusions.stars++ ) { // not enforced
         error_msg( loc, "PICTURE: %qc: sequence may appear at most once", p[0] );
         return nullptr;
       }
       if( (pnext = next_not(p[0])) == nullptr ) return pnext;
       pnext = valid_next(pnext, ! exclusions.dot);
       break;
     case 'P':
       pnext = valid_next(pnext, ! exclusions.dot);
       break;
     case '9':
     case 'A': case 'X':
     case 'V':
     case '1':
     case 'N':
       pnext = valid_next(p);
       break;
     case 'E':
       pnext = valid_next(p, "+9");
       if( pnext && *pnext == '+' ) {
         pnext = valid_next(p, "9");
       }
       break;
     default:
       error_msg( loc, "PICTURE: %qc: invalid character", p[0] );
       return nullptr;
     }
     return pnext;
   }

  public:
   picture_t( const char *p, int len )
     : begin(p)
     , p(p), pend(p + len)
     , loc(yylloc)
   {
     assert(TOUPPER(*p) == 'P'); // as in PICTURE (or PICTURE IS)
     // move p to start of picture string
     while( (p = std::find_if(p, pend, fisspace)) != pend ) {
       this->p = p = std::find_if(p, pend,
                                  []( char ch ) { return ! fisspace(ch); } );
     }
     assert(this->p != pend);
     pos = this->p - begin;
   }

   bool is_valid() {
     if( !p ) return false;
     if( (p = start()) == nullptr ) {
       return false;
     }

     while( p && p < pend) {
       p = next();
     }
     return p == pend;
   }

   int starts_at() const { return pos; }
 };

 /*
  * The Followers map gives 1 or 2 lists of valid characters following a
  * character, the one in the key. If there are two lists, the correct one is
  * determined by the caller based on the state of the picture string, i.e.,
  * what has been seen before.
  */
 const std::map <char, picture_t::followings_t> picture_t::followers {
   /*   B0/ */ { 'B', {"B0/,.Z*+-9AXVPNE" } },
   /*   B0/ */ { '0', {"B0/,.Z*+-9AXVPNE" } },
   /*   B0/ */ { '/', {"B0/,.Z*+-9AXVPNE" } },
   /*     , */ { ',', {"B0/,.Z*+-9VPE"} },
   /*     . */ { '.', {"B0/,Z*+-9E"} },
   /*     +    { '+', "9" }, */
   /*    +- */ { '+', {"B0/,.Z*9VPE", "" } },
   /*    +- */ { '-', {"B0/,.Z*9VPE", "" } },
   /* CR/DB    { 'C', "" }, */
   /*    cs    { 'c', "B0/,.Z*+-9VP" }, */
   /*    cs    { 'c', "+" }, */
   /*    Z* */ { 'Z', {"B0/,.+Z*9VP", "B0/,+Z*"} },
   /*    Z* */ { '*', {"B0/,.+Z*9VP", "B0/,+Z*"} },
   /*     + */ { '+', {"B0/,.+-9VP",  "B0/,+-"} },
   /*    cs */ { '$', {"B0/,.+9VP",   "B0/,+"} },
   /*     9 */ { '9', {"B0/,.+9AXVPE"} },
   /*    AX */ { 'A', {"B0/9AX"} },
   /*    AX */ { 'X', {"B0/9AX"} },
   /*     S */ { 'S', {"9VP"} },
   /*     V */ { 'V', {"B0/,+Z*+-9P"} },
   /*     P */ { 'P', {"+VP", "B0/,+Z*9P"} },
   /*     1 */ { '1', {"1"} },
   /*     N */ { 'N', {"B0/N"} },
   /*     E */ { 'E', {"+9"} },
 };

 /*
  * Although picture_t::is_valid return a bool, it's not used. The validation
  * routines emit messages where the error is detected. The entire string is
  * subsequently parsed by the parser, which might otherwise accept an invalid
  * string, but will usually emit a message of its own.
  */
 static int
 validate_picture() {
   picture_t picture(yytext, yyleng);
   picture.is_valid();
   return  picture.starts_at();
 }

 static inline bool
 is_integer_token( int *pvalue = NULL ) {
   int v, n = 0;
   if( pvalue == NULL ) pvalue = &v;
   return 1 == sscanf(yytext, "%d%n", pvalue, &n) && n == yyleng;
 }

 static bool need_nume = false;
 bool need_nume_set( bool tf ) {
   dbgmsg( "need_nume now %s", tf? "true" : "false" );
   return need_nume = tf;
 }

 static int datetime_format_of( const char input[] );

 static int symbol_function_token( const char name[] ) {
   const auto e = symbol_function( 0, name );
   return e ? symbol_index(e) : 0;
 }

 bool in_procedure_division(void );

 static symbol_elem_t *
 symbol_exists( const char name[] ) {
   typedef std::map <std::string, size_t> name_cache_t;
   static std::map <size_t, name_cache_t> cachemap;

   cbl_name_t lname;
   std::transform( name, name + strlen(name) + 1, lname, tolower );
   auto& cache = cachemap[PROGRAM];

   if( in_procedure_division() && cache.empty() ) {
     for( auto e = symbols_begin(PROGRAM) + 1;
          PROGRAM == e->program && e < symbols_end(); e++ ) {
       if( e->type == SymFile ) {
         cbl_file_t *f(cbl_file_of(e));
         cbl_name_t lname;
         std::transform( f->name, f->name + strlen(f->name) + 1, lname, tolower );
         cache[lname] = symbol_index(e);
         continue;
       }
       if( e->type == SymField ) {
         auto f(cbl_field_of(e));
         cbl_name_t lname;
         std::transform( f->name, f->name + strlen(f->name) + 1, lname, tolower );
         cache[lname] = symbol_index(e);
       }
     }
     cache.erase("");
   }
   auto p = cache.find(lname);

   if( p == cache.end() ) {
     symbol_elem_t * e = symbol_field( PROGRAM, 0, name );
     return e;
   }

   return symbol_at(p->second);
 }

 static int
 typed_name( const char name[] ) {
   if( 0 == PROGRAM ) return NAME;
   if( need_nume ) { need_nume_set(false); return NUME; }

   int token = repository_function_tok(name);
   switch(token) {
   case 0:
     break;
   case FUNCTION_UDF_0:
     yylval.number = symbol_function_token(name);
     __attribute__((fallthrough));
   default:
     return token;
   }

   struct symbol_elem_t *e = symbol_special( PROGRAM, name );
   if( e ) return  cbl_special_name_of(e)->token;

   if( (token = redefined_token(name)) ) { return token; }

   e = symbol_exists( name );

   auto type = e && e->type == SymField? cbl_field_of(e)->type : FldInvalid;

   switch(type) {
   case FldLiteralA:
     {
       auto f = cbl_field_of(e);
       if( is_constant(f) ) {
 	if(  f->data.initial ) {
 	  int token = cbl_figconst_tok(f->data.initial);
 	  if( token ) return token;
 	}
         int token = datetime_format_of(f->data.initial);
         if( token ) {
           yylval.string = xstrdup(f->data.initial);
           return token;
         }
       }
     }
     __attribute__((fallthrough));
   case FldLiteralN:
     {
       const auto f = cbl_field_of(e);
       if( type == FldLiteralN ) {
         yylval.numstr.radix =
           f->has_attr(hex_encoded_e)? hexadecimal_e : decimal_e;
         yylval.numstr.string = xstrdup(f->data.initial);
         return NUMSTR;
       }
       if( !f->has_attr(record_key_e) ) { // not a key-name literal
         yylval.literal.set(f);
         ydflval.string = yylval.literal.data;
         return LITERAL;
       }
     }
     __attribute__((fallthrough));
   case FldInvalid:
   case FldGroup:
   case FldForward:
   case FldIndex:
   case FldAlphanumeric:
   case FldPacked:
   case FldNumericDisplay:
   case FldNumericEdited:
   case FldAlphaEdited:
   case FldNumericBinary:
   case FldFloat:
   case FldNumericBin5:
   case FldPointer:
     return NAME;
   case FldSwitch:
     return SWITCH;
   case FldClass:
     return cbl_field_of(e)->level == 88? NAME88 : CLASS_NAME;
     break;
   default:
     yywarn("%s:%d: invalid symbol type %s for symbol %qs",
           __func__, __LINE__, cbl_field_type_str(type), name);
     return NAME;
   }
   return cbl_field_of(e)->level == 88? NAME88 : NAME;
 }

 int
 retype_name_token() {
   return typed_name(ydflval.string);
 }

 static char *
 tmpstring_append( int len ) {
   const char *extant = tmpstring == NULL ? "" : tmpstring;
   char *s = xasprintf("%s%.*s", extant, len, yytext);
   free(tmpstring);
   return tmpstring = s;
 }

 #define pop_return yy_pop_state(); return

 static bool is_not = false;

 static uint64_t
 integer_of( const char input[], bool is_hex = false) {
   uint64_t output = 0;
   const char *fmt = is_hex? "%ul" : "%hl";

   if( input[0] == '0' ) input++;

   if( 1 != sscanf(input, fmt, &output) ) {
     yywarn( "%s:%d: invalid integer '%s'", __func__, __LINE__, input );
   }

   return output;
 }