gcc/cobol/lexio.cc - gcc.git - Git at Google

 /*
  * Copyright (c) 2021-2025 Symas Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of the Symas Corporation nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "config.h"
 #include <ext/stdio_filebuf.h>
 #include "cobol-system.h"
 #include "cbldiag.h"
 #include "util.h"
 #include "copybook.h"
 #include "lexio.h"

 extern int yy_flex_debug;

 source_format_t& cdf_source_format();

 void
 source_format_t::infer( const char *bol, bool want_reference_format ) {
   if( bol ) {
     left = 7;
     if( want_reference_format ) {
       right = 73;
     }
   }
   dbgmsg("%s:%d: %s format detected", __func__, __LINE__,
          description());
 }


 // public source format test functions
 bool is_fixed_format() { return cdf_source_format().is_fixed(); }
 bool is_reference_format() { return cdf_source_format().is_reffmt(); }

 static bool debug_mode = false;

 /*
  * The "debug mode" is a little odd, because we have to make sure a
  * leading "D" doesn't start the verb DISPLAY (for example).  If
  * debug_mode is on, debug lines are included in the parse.  If
  * debug_mode is off but we're not in fixed_format, lines starting
  * with "D" are also included.
  *
  * So, the line is excluded if: fixed format and not debug mode
  * Else, it's included.
 */

 static inline int left_margin() {
   return cdf_source_format().left_margin();
 }
 static inline int right_margin() {
   return cdf_source_format().right_margin();
 }

 /*
  * When setting the indicator column explicity:
  *   To get strict fixed 72-column lines, use a negative column number.
  *   When setting back to 0 (free), the right margin is also reset to 0.
  */
 void
 cobol_set_indicator_column( int column );

 bool include_debug()      { return is_fixed_format() && debug_mode; }
 bool set_debug( bool tf ) { return debug_mode = tf && is_fixed_format(); }

 static bool nonblank( const char ch ) { return !isblank(ch); }

 static inline char *
 start_of_line( char *bol, char *eol ) {
   bol = std::find_if(bol, eol, nonblank);
   gcc_assert(bol < eol); // must exist
   return bol;
 }

 static inline char *
 continues_at( char *bol, char *eol ) {
   if( cdf_source_format().is_free() ) return NULL;  // cannot continue in free format
   bol += left_margin();
   if( *bol != '-' ) return NULL; // not a continuation line
   return start_of_line(++bol, eol);
 }

 // Return pointer to indicator column. Test ch if provided.
 // NULL means no indicator column or tested value not present.
 static inline char *
 indicated( char *bol, const char *eol, char ch = '\0' ) {
   if( cdf_source_format().left_margin() == 0 && *bol != '*' ) {
     return NULL;  // no indicator column in free format, except for comments
   }
   gcc_assert(bol != NULL);
   auto ind = bol + left_margin();
   if( eol <= ind ) return NULL; // left margin would be after end of line
   // If TAB is in the line-number region, nothing is in the indicator column.
   bool has_tab = std::any_of(bol, ind,
                              [](const char ch) { return ch == '\t'; } );
   if( has_tab ) return NULL;
   if( (bol += left_margin()) > eol ) return NULL;
   return ch == '\0' || ch == *bol? bol : NULL;
 }

 static char *
 remove_inline_comment( char *bol, char *eol ) {
   char *nl = std::find(bol, eol, '\n');

   if( bol < nl ) {
     static char ends = '\0';
     std::swap(*nl, ends);
     char *comment = strstr(bol, "*>");
     if( comment ) {
       std::fill(comment, nl, SPACE);
     }
     std::swap(*nl, ends);
   }
   return eol;
 }

 static void
 erase_line( char *src, char *esrc ) {
   dbgmsg( "%s: erasing %.*s from input", __func__, int(esrc-src), src);
   erase_source(src, esrc);
 }

 static size_t
 count_newlines( const char *beg, const char *end ) {
   return std::count(beg, end, '\n');
 }

 size_t
 filespan_t::tab_check( const char *src, const char *esrc ) {
   static const char tab = '\t';

   const char *data = src + left_margin();
   if( data < esrc ) { // not a blank line
     const char *tab_at = std::find(src, data, tab);
     if( tab_at < data ) {
       return (tab_at - src) + 1;
     }
   }
   return 0;
 }

 static const auto extended_icase = regex::extended | regex::icase;

 std::stack< std::list<replace_t> > replace_directives;

 static bool
 is_word_or_quote( char ch ) {
   return ch == '"' || ch == '\'' || ISALNUM(ch);
 }
 /*
  * If the replacement is not leading/trailing, the edges of the
  * matched pattern must delimit a Cobol word.  If not, add a space to
  * the replacement.
  */
 static void
 maybe_add_space(const span_t& pattern, replace_t& recognized) {
   static const char blank[] = " ";
   const char *befter[2] = { "", "" };
   gcc_assert(0 < recognized.before.size());

   // start of pattern and end of preceding text
   if( pattern.p[0] == '.' && is_word_or_quote(recognized.before.p[-1]) ) {
     befter[0] = blank;
   }
   // end of pattern, and start of succeeding text
   if( pattern.pend[-1] == '.' && is_word_or_quote(recognized.before.pend[0]) ) {
     befter[1] = blank;
   }

   if( befter[0] == blank || befter[1] == blank ) {
     const char *s = xasprintf( "%s%.*s%s",
                                befter[0],
                                recognized.after.size(), recognized.after.p,
                                befter[1] );
     recognized.after = span_t(s, s + strlen(s));
   }
 }

 /*
  * Keep track of the next pending replacement for each active REPLACE
  * directive.  For the current line, apply patterns that begins on the
  * line.  (It may match input extending beyond the current eol.)
  *
  * As each replacement is identified, append it to the passsed list of
  * pending replacements.  For these elements:
  *
  *     before is a span in mfile
  *     after is dynamically allocated
  */
 static void
 recognize_replacements( filespan_t mfile, std::list<replace_t>& pending_replacements ) {
   static const char *top_of_stack_cache, *applies_to;

   struct future_replacement_t {
     replace_t directive;
     span_t found;
     future_replacement_t( const replace_t& replace, span_t found )
       : directive(replace), found(found)
     {}
     bool operator<( const future_replacement_t& that ) const {
       return found.p < that.found.p;
     }
   };

   static std::list<future_replacement_t> futures;

   if( replace_directives.empty() ) return;

   if( ! (top_of_stack_cache == replace_directives.top().front().before.p
          &&
          applies_to == mfile.data) )
   {
     futures.clear();
     top_of_stack_cache = replace_directives.top().front().before.p;
     applies_to = mfile.data;
   }

   if( futures.empty() ) {
     /*
      * From the current point in the file, find the next match for each
      * pattern at the top of the replacement stack.
      */
     for( const auto& directive : replace_directives.top() ) {
       regex re(directive.before.p, extended_icase);
       cmatch cm;

       span_t found(mfile.eodata, mfile.eodata);

       if( regex_search( mfile.ccur(),
                         const_cast<const char *>(mfile.eodata),
                         cm, re) ) {
         gcc_assert(cm[1].matched);
         found = span_t( cm[1].first, cm[1].second );
         if( yy_flex_debug ) {
           size_t n = count_newlines(mfile.data, found.p);
           dbgmsg("%s:%d first '%.*s' is on line " HOST_SIZE_T_PRINT_UNSIGNED
                  " (offset " HOST_SIZE_T_PRINT_UNSIGNED ")", __func__, __LINE__,
                  directive.before.size(), directive.before.p,
                  (fmt_size_t)++n, (fmt_size_t)(found.p - mfile.data));
         }
       } else {
         dbgmsg("%s:%d not found: '%s' in \n'%.*s'", __func__, __LINE__,
               directive.before.p, int(strlen(directive.before.p)), mfile.cur);
       }
       futures.push_back( future_replacement_t(directive, found) );
     }
   }

   gcc_assert(!futures.empty());
   gcc_assert(futures.size() == replace_directives.top().size());

   replace_t recognized;

   auto pnext = std::min_element(futures.begin(), futures.end());

   for( const char *bol = mfile.cur; // more than one replacement may apply to a line
        bol <= pnext->found.p && pnext->found.p < mfile.eol; ) {
     auto& next(*pnext);
     recognized = replace_t( next.found, span_t(strlen(next.directive.after.p),
                                                next.directive.after.p) );
     maybe_add_space(next.directive.before, recognized);
     pending_replacements.push_back(recognized);
     bol = next.found.pend;

     if( yy_flex_debug ) {
       size_t n = std::count(const_cast<const char *>(mfile.data),
                             recognized.before.p, '\n');
       dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED
               " @ " HOST_SIZE_T_PRINT_UNSIGNED ": '%s'\n/%.*s/%.*s/",
               __func__, __LINE__,
               (fmt_size_t)++n, (fmt_size_t)(next.found.p - mfile.data),
               next.directive.before.p,
               int(recognized.before.size()), recognized.before.p,
               int(recognized.after.size()), recognized.after.p );
     }

     // Update the futures element for this pattern
     cmatch cm;

     next.found = span_t(mfile.eodata, mfile.eodata);

     regex re(next.directive.before.p, extended_icase);
     if( regex_search(bol, const_cast<const char *>(mfile.eodata), cm, re) ) {
       gcc_assert(cm[1].matched);
       next.found = span_t( cm[1].first, cm[1].second );
       size_t n = std::count(const_cast<const char *>(mfile.data),
                             next.found.p, '\n');
       if( false )
         dbgmsg("%s:%d next '%.*s' will be on line " HOST_SIZE_T_PRINT_UNSIGNED
                " (offset " HOST_SIZE_T_PRINT_UNSIGNED ")", __func__, __LINE__,
                next.directive.before.size(), next.directive.before.p,
                (fmt_size_t)++n, (fmt_size_t)(next.found.p - mfile.data));
     }
     pnext = std::min_element(futures.begin(), futures.end());
   }
 }

 static void
 check_push_pop_directive( filespan_t& mfile ) {
   char eol = '\0';
   const char *p = std::find(mfile.cur, mfile.eol, '>');
   if( ! (p < mfile.eol && p[1] == *p ) ) return;

   const char pattern[] =
     ">>[[:blank:]]*(push|pop)[[:blank:]]+"
     "("
       "all|"
       "call-convention|"
       "cobol-words|"
       "define|"
       "source[[:blank:]]+format|"
       "turn"
     ")";
   static regex re(pattern, extended_icase);

   // show contents of marked subexpressions within each match
   cmatch cm;

   std::swap(*mfile.eol, eol); // see implementation for excuses
   bool ok = regex_search(p, const_cast<const char *>(mfile.eol), cm, re);
   std::swap(*mfile.eol, eol);

   if( ok ) {
     gcc_assert(cm.size() > 1);
     bool push = TOUPPER(cm[1].first[1]) == 'U';
     switch( TOUPPER(cm[2].first[0]) ) {
     case 'A': // ALL
       push? cdf_push() : cdf_pop();
       break;
     case 'C':
       switch( TOUPPER(cm[2].first[1]) ) {
       case 'A': // CALL-CONVENTION
         push? cdf_push_call_convention() : cdf_pop_call_convention();
         break;
       case 'O': // COBOL-WORDS
         push? cdf_push_current_tokens() : cdf_pop_current_tokens();
         break;
       default:
         gcc_unreachable();
       }
       break;
     case 'D': // DEFINE
       push? cdf_push_dictionary() : cdf_pop_dictionary();
       break;
     case 'S': // SOURCE FORMAT
       push? cdf_push_source_format() : cdf_pop_source_format();
       break;
     case 'T': // TURN
       push? cdf_push_enabled_exceptions() : cdf_pop_enabled_exceptions();
       break;
     default:
       gcc_unreachable();
     }
     erase_line(const_cast<char*>(cm[0].first),
                const_cast<char*>(cm[0].second));
   }
 }

 static void
 check_source_format_directive( filespan_t& mfile ) {
   char eol = '\0';
   const char *p = std::find(mfile.cur, mfile.eol, '>');
   if( ! (p < mfile.eol && p[1] == *p ) ) return;

   const char pattern[] =
     ">>[[:blank:]]*source[[:blank:]]+"
     "(format[[:blank:]]+)?"
     "(is[[:blank:]]+)?"
     "(fixed|free)";
   static regex re(pattern, extended_icase);

   // show contents of marked subexpressions within each match
   cmatch cm;

   std::swap(*mfile.eol, eol); // see implementation for excuses
   bool ok = regex_search(p, const_cast<const char *>(mfile.eol), cm, re);
   std::swap(*mfile.eol, eol);

   if( ok ) {
     gcc_assert(cm.size() > 1);
     switch( cm[3].length() ) {
     case 4:
       cobol_set_indicator_column(0);
       break;
     case 5:
       cobol_set_indicator_column(-7);
       break;
     default:
       gcc_assert(cm[3].length() == 4 || cm[3].length() == 5);
       break;
     }

     dbgmsg( "%s:%d: %s format set, on line " HOST_SIZE_T_PRINT_UNSIGNED,
             __func__, __LINE__,
             cdf_source_format().description(),
             (fmt_size_t)mfile.lineno() );
     char *bol = cdf_source_format().is_fixed()? mfile.cur : const_cast<char*>(cm[0].first);
     gcc_assert(cm[0].second <= mfile.eol);
     erase_line(bol, const_cast<char*>(cm[0].second));
   }
 }

 struct buffer_t : public bytespan_t {
   char *pos;    // current output position

   buffer_t( char *data, char *eodata )
     : bytespan_t(data, eodata)
     , pos(data)
   {
     if(pos) *pos = '\0';
   }

   size_t nline() const {
     gcc_assert(data <= pos);
     return std::count(data, pos, '\n');
   }
   size_t free_space() const { gcc_assert(pos <= eodata); return eodata - pos; }

   bool pad_lines( size_t goal ) {
     while( nline() < goal ) {
       if( pos == eodata ) return false;
       *pos++ = '\n';
     }
     return true;
   }

   void show() const {
     gcc_assert(data <= pos);
     dbgmsg("flex input buffer: '%.*s'\n[xelf]", int(pos - data), data);
   }
   void dump() const {
 #ifdef GETENV_OK
     if( getenv("lexer_input") ) show();
 #endif
   }
 };

 static inline bool is_p( char ch ) { return TOUPPER(ch) == 'P'; }

 static bool
 is_program_id( const char *p, const char *eol ) {
   static const std::string program_id("PROGRAM-ID");
   auto eop = p + program_id.size();
   if( eop < eol ) {
     // PROGRAM-ID must be followed by a dot, perhaps with intervening whitespace.
     for( const char *dot=eop; dot < eol && *dot != '.'; dot++ ) {
       if( !ISSPACE(*dot) ) return false;
     }
     std::string line (p, eop);
     std::transform(line.begin(), line.end(), line.begin(), ::toupper);
     return line == program_id;
   }
   return false;
 }

 const char * esc( size_t len, const char input[] );

 static bool
 is_word_char( char ch ) {
   switch(ch) {
   case '$':
   case '-':
   case '_':
     return true;
   }
   return ISALNUM(ch);
 }

 static bool
 is_numeric_char( char ch ) {
   return ISDIGIT(ch)
     || TOUPPER(ch) == 'E'
     || ch == '.'
     || ch == ','
     ;
 }

 static bool
 is_numeric_term( span_t term ) {
   gcc_assert(term.p);
   if( term.p[0] == '+' || term.p[0] == '-' ) term.p++;
   auto p = std::find_if( term.p, term.pend,
                          []( char ch ) {
                            return ! is_numeric_char(ch);
                          } );
   return p == term.pend;
 }

 struct replacing_term_t {
   bool matched, done;
   span_t leading_trailing, term, stmt;

   explicit replacing_term_t(const char input[])
     : matched(false), done(false), stmt(span_t(input, input))
     {}
 };

 extern YYLTYPE yylloc;

 static const char *
 last_newline (const char *p, const char *pend ) {
   size_t len = pend - p;
   return static_cast<const char *>( memrchr( p, '\n', len ) );
 }
 /*
  * For some statement parsed with regex_search, set yyloc to indicate the line
  * and column spans of the term.  Assume stmt begins at the start of a line.
  */
 static void
 update_yylloc( const csub_match& stmt, const csub_match& term ) {
   gcc_assert(stmt.first <= term.first && term.second <= stmt.second);

   class dump_loc_on_exit {
    public:
     dump_loc_on_exit() {
       if( gcobol_getenv( "update_yylloc" ) )
         location_dump( "update_yylloc", __LINE__, "begin", yylloc);
     }
     ~dump_loc_on_exit() {
       if( gcobol_getenv( "update_yylloc" ) )
         location_dump( "update_yylloc", __LINE__, "end  ", yylloc);
     }
   } dloe;

   size_t nline = std::count( stmt.first, term.second, '\n' );
   size_t n = std::count( term.first, term.second, '\n' );

   if( nline ) {
     yylloc.last_line += nline;
     yylloc.first_line = yylloc.last_line - n;
   }

   /*
    * Set the column span for the term.
    */
   const char *p = last_newline(stmt.first, stmt.second);
   if( !p ) { // no newlines in entire statement
     yylloc.first_column = (term.first - stmt.first) + 1;
     yylloc.last_column =  (term.second - stmt.first) + 1;
     return;
   }

   p = last_newline(stmt.first, term.first);
   if( !p ) { // no newlines before term
     yylloc.first_column = (term.first - stmt.first) + 1;
     p = last_newline(term.first, term.second);
     gcc_assert(p); // newline must be in term
     yylloc.last_column = (term.second - p) + 1;
     return;
   }

   const char *bol = p; // bol points to last newline before term

   yylloc.first_column = term.first - bol;
   p = last_newline(term.first, term.second);
   if( p ) { // term has newlines, too
     yylloc.last_column =  (p - term.first);
   } else {
     yylloc.last_column = yylloc.first_column + term.length();
   }
 }

 static replacing_term_t
 parse_replacing_term( const char *stmt, const char *estmt ) {
   gcc_assert(stmt); gcc_assert(estmt); gcc_assert(stmt <= estmt);
   replacing_term_t output(stmt);

   static const char pattern[] =
     "^([[:space:];,]+(LEADING|TRAILING|BY))?"    // 1, 2
     "[[:space:];,]+" // leading space between pairs
     "("                                        // 3
             "(\""  "([\"]{2}|[^\"])*" "\")"    // 4, 5
         "|" "('"   "([']{2}|[^'])*"   "')"     // 6, 7
         "|" "("    "[+-]?[[:alnum:]]+([_-]+[[:alnum:]]+)*" ")" // 8, 9
         "|" "(==(" "(=?[^=]+)*" ")==)"         // 10, 11, 12
     ")"
     "(([[:space:];,]+[\"'=[:alnum:]+-]{1,2})" "|" "[[:space:];,]*([.]))?"  // 13, 14, 15
     ;

   static regex re(pattern, extended_icase);
   cmatch cm;

   if( ! regex_search( stmt, estmt, cm, re) ) return output;

   bool replacing_term = cm[2].matched && TOUPPER(cm[2].first[0]) == 'B';

   if( cm[2].matched && ! replacing_term ) {
     output.leading_trailing = cm[2];
   }

   // Apply such that quoted matches supersede word matches.
   if( cm[11].matched ) output.term = cm[11];
   if( cm[ 8].matched ) output.term = cm[ 8];
   if( cm[ 6].matched ) output.term = cm[ 6];
   if( cm[ 4].matched ) output.term = cm[ 4];

   // The matched segment extends to the end of the matched term, or to
   // the dot at end of statement.  Include the pseudotext ==, if found.
   output.stmt = span_t(cm[0].first, output.term.pend);
   if( cm[10].matched ) output.stmt.pend = cm[10].second;

   if( cm[15].matched && ISSPACE(cm[15].second[0]) ) { // matched end of statement
     output.done = output.matched = true;
     output.stmt = cm[0];
     gcc_assert(output.stmt.pend[-1] == '.');
     dbgmsg("%s:%d: done at '%.*s'", __func__, __LINE__,
            output.term.size(), output.term.p);
     return output;
   }

   if( is_numeric_term(output.term) ) {
     output.matched = output.stmt.p < output.term.p;
     gcc_assert(output.matched);
     // look for fractional part
     if( is_numeric_char(*output.term.pend) && ISDIGIT(output.term.pend[1]) ) {
       gcc_assert(!ISDIGIT(*output.term.pend));
       auto p = std::find_if(++output.term.pend, estmt,
                             []( char ch ) { return !ISDIGIT(ch); } );
       output.stmt.pend = output.term.pend = p;
       output.done = '.' == output.stmt.pend[0] && ISSPACE(output.stmt.pend[1]);
       if( output.done ) output.stmt.pend++;
     }
     dbgmsg("%s:%d: %s '%.*s'", __func__, __LINE__,
            output.done? "done at" : "term is",
            output.term.size(), output.term.p);
     return output;
   }

   if( yy_flex_debug ) { // should be looking only for words
     dbgmsg("%s:%d: not done, working with '%.*s'", __func__, __LINE__,
           cm[0].length(), cm[0].first);
     int i=0;
     for( auto m : cm ) {
       if( m.matched )
         dbgmsg("%4d) '%.*s'", i, m.length(), m.first);
       i++;
     }
   }

   if( !cm[8].matched ) {
     output.matched = output.stmt.p < output.term.p;
     gcc_assert(output.matched);
     dbgmsg("%s:%d: term is '%.*s'", __func__, __LINE__,
            output.term.size(), output.term.p);
     return output;
   }

   bool extraneous_replacing = 'R' == TOUPPER(cm[8].first[0]); // maybe
   if( extraneous_replacing ) { // prove it
     static const char replacing[] = "REPLACING";
     for( size_t i=0; i < strlen(replacing); i++ ) {
       if( replacing[i] != TOUPPER(cm[8].first[i]) ) {
         extraneous_replacing = false;
         break;
       }
     }
     if( extraneous_replacing ) {
       update_yylloc( cm[0], cm[8] );
       yywarn("syntax error: invalid '%.*s'", cm[8].length(), cm[8].first);
       output.matched = false;
       return output;
     }
   }

   gcc_assert(cm[8].matched);
   gcc_assert(0 < output.term.size());

   dbgmsg("%s:%d: more words starting at '%.80s'", __func__, __LINE__,
          output.term.pend);

   static const char term_pattern[] =
     "^[[:space:]]+"
     "(" "(IN|OF)[[:space:]]+" ")"                            // 1, 2
     "(" "[+-]?[[:alnum:]]+([$_-]+[[:alnum:]]+)*" ")"         // 3, 4
     "(" "[[:space:]]*[(]" ")?"                               // 5
     "(([[:space:]]+[\"'=[:alnum:]+-]{1,2})" "|" "[[:space:]]*([.]))?"  // 6, 7, 8
     ;
   static const char paren_pattern[] =
     "^[[:space:]]*"
     "(" "[()][^()]*[()]" ")"    // 1
     "(([[:space:]]+[\"'=[:alnum:]+-]{1,2})" "|" "[[:space:]]*([.]))?"  // 2, 3, 4
     ;

   regex term_re(term_pattern, extended_icase);
   regex paren_re(paren_pattern, extended_icase);
   ssize_t nsub = 0;

   while( regex_search( output.term.pend, estmt, cm, term_re) ) {
     output.stmt.pend = output.term.pend = cm[3].second; // found a word
     if( cm[5].matched ) break; // found left parenthesis
     const csub_match& done(cm[8]);
     if( done.matched ) {
       output.done = output.matched = output.stmt.p < output.term.p;
       gcc_assert(output.done);
       goto matched;
     }
   }

   // match subscripts, if any
   while( regex_search( output.term.pend, estmt, cm, paren_re) ) {
     output.stmt.pend = output.term.pend = cm[1].second;
     if( cm[1].first[0]   == '(' ) nsub++;
     if( cm[1].first[0]   == ')' ) nsub--;
     if( cm[1].second[-1] == '(' ) nsub++;
     if( cm[1].second[-1] == ')' ) nsub--;

     const csub_match& done(cm[4]);
     if( done.matched ) {
       output.matched = output.stmt.p < output.term.p;
       output.stmt.pend = done.second;
       output.done = output.stmt.pend[-1] == '.';
       goto matched;
     }

     if( nsub == 0 ) break;
   }

  matched:
   output.matched = output.stmt.p < output.term.p;

   if( yy_flex_debug ) {
     const char *status = "unmatched";
     if( output.matched ) status = output.done? "done" : "matched";
     dbgmsg("%s:%d: %s term is '%.*s'", __func__, __LINE__, status,
            output.term.size(), output.term.p? output.term.p : "");
   }
   return output;
 }

 struct replacing_pair_t {
   span_t leading_trailing, stmt;
   replace_t replace;

   bool matched() const { return 0 < stmt.size(); }
   bool done() const { return matched() && stmt.pend[-1] == '.'; }
 };
 static replacing_pair_t
 parse_replacing_pair( const char *stmt, const char *estmt ) {
   replacing_pair_t pair;

   pair.replace = replace_t();
   auto parsed = parse_replacing_term( stmt, estmt ); // before
   if( parsed.matched ) {
     if( parsed.term.size() == 0 ) return pair; // failure: empty before string
     pair.leading_trailing = parsed.leading_trailing;
     pair.stmt = parsed.stmt;
     pair.replace.before = parsed.term;

     if( !parsed.done ) {
       parsed = parse_replacing_term( pair.stmt.pend, estmt ); // after
       if( parsed.matched ) {
         pair.stmt.pend = parsed.stmt.pend;
         pair.replace.after = parsed.term;
       } else {
         dbgmsg("%s:%d: not matched '%.*s'", __func__, __LINE__,
                pair.stmt.size(), pair.stmt.p);
       }
     }
     if( yy_flex_debug ) {
       const char *status = "unmatched";
       if( pair.matched() ) status = pair.done()? "done" : "matched";
       dbgmsg("%s:%d: [%s] replacing '%.*s' with '%.*s'", __func__, __LINE__,
              status,
              pair.replace.before.size(), pair.replace.before.p,
              pair.replace.after.size(), pair.replace.after.p);
     }
   } else {
     for( auto p = stmt; (p = std::find(p, estmt, '.')) < estmt; p++ ) {
       if( ISSPACE(p[1]) ) {
         pair.stmt = span_t(stmt, ++p);
         break;
       }
     }
     if( pair.stmt.p ) {
       yywarn("CDF syntax error '%.*s'", (int)pair.stmt.size(), pair.stmt.p);
     }
     else {
       // This eliminated a compiler warning about "format-overflow"
       yywarn("CDF syntax error");
     }
     pair.stmt = span_t(size_t(0), stmt);
     pair.replace = replace_t();
   }
   return pair;
 }

 static std::pair<std::list<replace_t>, char *>
 parse_replace_pairs( const char *stmt, const char *estmt, bool is_copy_stmt ) {
   std::list<replace_t> pairs ;

   static const char     any_ch[] = ".";
   static const char    word_ch[] = "[[:alnum:]$_-]";
   static const char nonword_ch[] = "[^[:alnum:]\"'$_-]";

   // Pattern to find one REPLACE pseudo-text pair
   static const char replace_pattern[] =
     "([[:space:]]+(LEADING|TRAILING))?"    // 1, 2
     "[[:space:]]+"
     "==(" "(=?[^=]+)+" ")=="               // 3, 4
     "[[:space:]]+BY[[:space:]]+"
     "==(" "(=?[^=]+)*" ")=="               // 5, 6
     "(([[:space:]]+[\"'=[:alnum:]+-]{1,2})" "|" "[[:space:]]*([.]))?"  // 7, 8, 9
     ;

   regex pair_re(replace_pattern, extended_icase);
   cmatch cm;
   replacing_pair_t parsed;
   bool end_of_stmt = false;

   for( auto p = stmt; p < estmt && !end_of_stmt; p = parsed.stmt.pend ) {
     if( is_copy_stmt ) {
       parsed = parse_replacing_pair(p, estmt);
       if( parsed.replace.before.size() == 0 ) break; // empty before
       if( parsed.replace.after.p == NULL ) break;    // invalid after
       end_of_stmt = parsed.done();
     } else {
       if( ! regex_search( p, estmt, cm, pair_re) ) break;
       // Report findings.
       if( false && yy_flex_debug ) {
         for( size_t i=0; i < cm.size(); i++ ) {
           dbgmsg("%s: %s " HOST_SIZE_T_PRINT_UNSIGNED ": '%.*s'", __func__,
                  cm[i].matched? "Pair" : "pair",
                  (fmt_size_t)i,
                  cm[i].matched? int(cm[i].length()) : 0,
                  cm[i].matched? cm[i].first : "");
         }
       }
       gcc_assert(cm[3].matched);
       gcc_assert(cm[5].matched);
       parsed.leading_trailing = cm[2];
       parsed.replace.before   = cm[3];
       parsed.replace.after    = cm[5];

       parsed.stmt = cm[0];
       // If not done, exclude trailing portion from statement match.
       if( !parsed.done() && cm[8].matched ) {
         gcc_assert(!cm[9].matched);
         parsed.stmt.pend = cm[8].first;
       }
     }

     span_t& before(parsed.replace.before);
     const span_t& after(parsed.replace.after);

     const char *befter[2] = { nonword_ch, nonword_ch };
     gcc_assert(before.p < before.pend);
     if( !is_word_char(before.p[0]) )     befter[0] = any_ch;
     if( !is_word_char(before.pend[-1]) ) befter[1] = any_ch;

     const char *src = esc(before.size(), before.p);

     if( parsed.leading_trailing.size() > 0 ) {
       switch( TOUPPER(parsed.leading_trailing.p[0]) ) {
       case 'L': // leading
         befter[1] = word_ch;
         break;
       case 'T': // trailing
         befter[0] = word_ch;
         break;
       default:
         gcc_unreachable();
       }
       dbgmsg("%s:%d: dealing with %.*s", __func__, __LINE__,
              int(parsed.leading_trailing.size()), parsed.leading_trailing.p);
     }

     src = xasprintf("%s(%s)%s", befter[0], src, befter[1]);

     struct {  span_t before, after; }  output;
     output.before = span_t(strlen(src), src);
     output.after = after.dup();

     gcc_assert(!before.has_nul());
     pairs.push_back( replace_t( output.before, output.after ) );

     // COPY REPLACING matches end-of-statment here
     // REPLACE matched end-of-statement in caller, and estmt[-1] == '.'
     if( is_copy_stmt && parsed.stmt.pend[-1] == '.' ) break;
   }

   if( yy_flex_debug ) {
     dbgmsg( "%s:%d: %s: " HOST_SIZE_T_PRINT_UNSIGNED " pairs parsed from  '%.*s'",
             __func__, __LINE__,
             parsed.done() ? "done" : "not done",
             (fmt_size_t)pairs.size(), parsed.stmt.size(), parsed.stmt.p );
     int i = 0;
     for( const auto& replace : pairs ) {
       dbgmsg("%s:%d:%4d: '%s' => '%s'", __func__, __LINE__,
              ++i, replace.before.p, replace.after.p);
     }
   }
   if( !parsed.done() ) {
     pairs.clear();
     return std::make_pair(pairs, const_cast<char*>(stmt));
   }

   return std::make_pair(pairs, const_cast<char*>(parsed.stmt.pend));
 }

 struct copy_descr_t {
   bool parsed;
   int fd;
   size_t nreplace;
   span_t partial_line, erased_lines;

   copy_descr_t( const char *line, const char *eol)
     : parsed(false), fd(-1), nreplace(0), partial_line(line, eol) {}
 };

 static YYLTYPE
 location_in( const filespan_t& mfile, const csub_match& cm ) {
   YYLTYPE loc {
     int(mfile.lineno() + 1), int(mfile.colno() + 1),
     int(mfile.lineno() + 1), int(mfile.colno() + 1)
   };
   gcc_assert(mfile.cur <= cm.first && cm.second <= mfile.eodata);
   auto nline = std::count(cm.first, cm.second, '\n');
   if( nline ) {
     gcc_assert(nline < loc.first_line);
     loc.first_line -= nline;
     auto p = static_cast<const char*>(memrchr(cm.first, '\n', cm.length()));
     loc.last_column = (cm.second) - p;
   }
   location_dump(__func__, __LINE__, "copy?", loc);
   return loc;
 }

 static copy_descr_t
 parse_copy_directive( filespan_t& mfile ) {
   static const char *most_recent_buffer;
   static span_t copy_stmt(mfile.eodata, mfile.eodata);

   static const char pattern[] =
     "COPY" "[[:space:]]+"
     /* 1 */    "("
     /*2,3*/            "\"("  "([\"]{2}|[^\"])+" ")\""
     /*4,5*/        "|" "'("   "([']{2}|[^'])+"   ")[']"
     /*6,7*/        "|" "("    "[[:alnum:]]+([_-]+[[:alnum:]]+)*" ")"
     /*   */    ")"
     /* 8 */    "("
     /* 9 */        "[[:space:]]+(OF|IN)[[:space:]]+"
     /* 10*/        "("
     /*11,12*/              "(\""  "([\"]{2}|[^\"])+" "\")"
     /*13,14*/          "|" "('"   "([']{2}|[^'])+"   "')"
     /*15,16*/          "|" "("    "[[:alnum:]]+([_-]+[[:alnum:]]+)*" ")"
     /*   */        ")"
     /*   */    ")?"
     /*17,18*/  "([[:space:]]+SUPPRESS([[:space:]]+PRINTING)?)?"
     /*19,20 */ "(" "([[:space:]]*[.])" "|" "[[:space:]]+REPLACING" ")"
     ;

   static regex re(pattern, extended_icase);
   cmatch cm;
   copy_descr_t outcome(mfile.cur, mfile.cur);

   // COPY appears in current buffer?
   if( most_recent_buffer != mfile.data || copy_stmt.p < mfile.cur ) {
     most_recent_buffer = mfile.data;
     copy_stmt.p = mfile.eodata;

     if( regex_search(mfile.ccur(),
                      const_cast<const char *>(mfile.eodata), cm, re) ) {
       copy_stmt = span_t( cm[0].first, cm[0].second );
       if( yy_flex_debug ) {
         size_t nnl = 1 + count_newlines(mfile.data, copy_stmt.p);
         size_t nst = 1 + count_newlines(copy_stmt.p, copy_stmt.pend);
         dbgmsg("%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED
                ": COPY directive is " HOST_SIZE_T_PRINT_UNSIGNED " lines '%.*s'",
                __func__, __LINE__,
                (fmt_size_t)nnl, (fmt_size_t)nst, copy_stmt.size(), copy_stmt.p);
       }
     }
   }

   // If COPY appears on the current line, parse it completely this time.
   if( mfile.cur <= copy_stmt.p &&
                    copy_stmt.p < mfile.eol ) {
     outcome.parsed = regex_search(copy_stmt.p, copy_stmt.pend, cm, re);
     gcc_assert(outcome.parsed);
     outcome.partial_line = span_t(mfile.cur, copy_stmt.p);

     if( yy_flex_debug ) {
       dbgmsg(HOST_SIZE_T_PRINT_UNSIGNED " expressions",
              (fmt_size_t)std::count(pattern, pattern + sizeof(pattern), '('));
       int i = 0;
       for( const auto& m : cm ) {
         if( m.matched )
           dbgmsg("%s:%d: %2d: '%.*s'", __func__, __LINE__,
                 i, int(m.length()), m.first);
         i++;
       }
     }

     auto& copybook_name = cm[1];
     auto& library_name = cm[10];

     bool replacing = !cm[20].matched;

     if( library_name.matched ) {
       YYLTYPE loc = location_in( mfile, library_name );
       copybook.library( loc, xstrndup(library_name.first, library_name.length()) );
     }
     YYLTYPE loc = location_in( mfile, copybook_name );
     outcome.fd = copybook.open( loc, xstrndup(copybook_name.first,
                                               copybook_name.length()) );
     if( outcome.fd == -1 ) { // let parser report missing copybook
       dbgmsg("%s:%d: (no copybook '%s' found)", __func__, __LINE__, copybook.source());
       return outcome;
     }

     if( replacing ) {
       std::pair<std::list<replace_t>, char*>
         result = parse_replace_pairs( cm[0].second, mfile.eodata, true );

       const std::list<replace_t>& replacements(result.first);
       outcome.parsed = (outcome.nreplace = replacements.size()) > 0;
       if( outcome.parsed ) {
         replace_directives.push(replacements);
       }
       copy_stmt.pend = result.second;

       // Maybe we don't need these.  We'll see.
       for( const auto& r : replacements ) {
         copybook.replacement(pseudo_e, r.before.dup().p, r.after.dup().p);
       }
     }

     // If the parse failed, pass it through to the parser for analysis.
     if( outcome.parsed ) {
       erase_line( const_cast<char*>(copy_stmt.p),
                   const_cast<char*>(copy_stmt.pend));
       outcome.erased_lines = copy_stmt;
     }

     mfile.eol = const_cast<char*>(copy_stmt.pend);
     mfile.next_line();
   }
   return outcome;
 }

 static char *
 parse_replace_last_off( const filespan_t& mfile ) {
   static const char pattern[] =
     "REPLACE" "[[:space:]]+"
     "(LAST[[:space:]]+)?OFF[[:space:]]*[.]"
     ;
   static regex re(pattern, extended_icase);
   cmatch cm;

   // REPLACE [LAST] OFF?
   bool found = regex_search(mfile.ccur(),
                             const_cast<const char *>(mfile.eodata), cm, re);
   gcc_assert(found); // caller ensures

   gcc_assert(cm.size() == 2);
   // LAST OFF removes most recent REPLACE
   if( cm[1].matched ) {
     gcc_assert(TOUPPER(cm[1].first[0]) == 'L');
     if( ! replace_directives.empty() ) {
       replace_directives.pop();
     }
   } else { // OFF clears the REPLACE stack
     while( ! replace_directives.empty() ) {
       replace_directives.pop();
     }
   }

   dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED ": parsed '%.*s', ",
           __func__, __LINE__,
           (fmt_size_t)mfile.lineno(), int(cm[0].length()), cm[0].first );

   // Remove statement from input
   erase_line(const_cast<char*>(cm[0].first),
              const_cast<char*>(cm[0].second));

   return const_cast<char*>(cm[0].second);
 }

 static span_t
 parse_replace_text( filespan_t& mfile ) {
   static const char pattern[] =
     /* 0 */    "REPLACE"
     /* 1 */    "([[:space:]]+ALSO)?"
     /* 2 */    "("
     /*3,4*/      "([[:space:]]+(LEADING|TRAILING))?"
     /* 5 */      "([[:space:]]+"
     /* 6 */         "==" "(=?[^=]+)+" "=="
     /*   */         "[[:space:]]+BY[[:space:]]+"
     /* 7 */         "==" "(=?[^=]+)*" "=="
     /*   */      ")"
     /*   */    ")+[[:space:]]*[.]"
     ;
   static regex re(pattern, extended_icase);
   cmatch cm;
   const size_t current_lineno(mfile.lineno());

   if( false && yy_flex_debug ) {
     auto pend = mfile.eol;
     gcc_assert(mfile.line_length() > 2);
     if( pend[-1] == '\n' ) pend -= 2;
     auto len = int(pend - mfile.cur);
     dbgmsg("%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED ": parsing '%.*s",
            __func__, __LINE__,
            (fmt_size_t)current_lineno, len, mfile.cur);
   }

   if( ! regex_search(mfile.ccur(), mfile.eodata, cm, re) ) {
     dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED
             ": not a REPLACE statement:\n'%.*s'",
             __func__, __LINE__, (fmt_size_t)current_lineno,
             int(mfile.line_length()), mfile.cur );
     return span_t();
   }

   // Report findings.
     if( yy_flex_debug ) {
       dbgmsg(HOST_SIZE_T_PRINT_UNSIGNED " expressions",
              (fmt_size_t)std::count(pattern, pattern + sizeof(pattern), '('));
       int i = 0;
       for( const auto& m : cm ) {
         if( m.matched )
           dbgmsg("%s:%d: %2d: '%.*s'", __func__, __LINE__,
                 i, int(m.length()), m.first);
         i++;
       }
     }

   gcc_assert(cm.size() > 7);

   // Update active REPLACE stack
   if( ! cm[1].matched ) { // ALSO pushes, else clear stack and push one.
     while( !replace_directives.empty() ) {
       replace_directives.pop();
     }
   } else {
     gcc_assert(TOUPPER(cm[1].first[0]) == 'A');
   }

   span_t replace_stmt(cm[0].first, cm[0].second);

   std::pair<std::list<replace_t>, char*>
         result = parse_replace_pairs(replace_stmt.p, replace_stmt.pend, false);
   const std::list<replace_t>& replacements(result.first);
   replace_directives.push( replacements );

   if( yy_flex_debug ) {
     dbgmsg( "%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED ": " HOST_SIZE_T_PRINT_UNSIGNED
             " pairs parsed from  '%.*s'", __func__, __LINE__,
             (fmt_size_t)current_lineno, (fmt_size_t)replacements.size(),
             int(replace_stmt.size()), replace_stmt.p );
     for( const auto& replace : replacements ) {
       int i = 0;
       dbgmsg("%s:%d:%4d: '%s' => '%s'", __func__, __LINE__,
             ++i, replace.before.p, replace.after.p);
     }
   }

   // Remove statement from input
   erase_line(const_cast<char*>(replace_stmt.p),
              const_cast<char*>(replace_stmt.pend));

   return replace_stmt;
 }

 static span_t
 parse_replace_directive( filespan_t& mfile ) {
   static const char *most_recent_buffer, *next_directive;
   static bool off_coming_up;
   static const char pattern[] =
     "REPLACE" "[[:space:]]+" "(LAST|OFF|ALSO|LEADING|TRAILING|==)";

   static regex re(pattern, extended_icase);
   cmatch cm;

   // REPLACE appears in current buffer?
   if( most_recent_buffer != mfile.data || next_directive < mfile.cur ) {
     most_recent_buffer = mfile.data;
     next_directive = mfile.eodata;

     if( regex_search(mfile.ccur(),
                      const_cast<const char *>(mfile.eodata), cm, re) ) {
       gcc_assert(cm[1].matched);
       next_directive = cm[0].first;

       switch( TOUPPER(cm[1].first[0]) ) {
       case 'L':
         off_coming_up = 'A' == TOUPPER(cm[1].first[1]); // LAST OFF, else LEADING
         break;
       case 'O': // OFF
         off_coming_up = true;
         break;
       case 'A': case 'T': case '=': // [ALSO] [ eading/Trailing] == ...
         off_coming_up = false;
         break;
       default:
         gcc_unreachable();
       }
     }
   }

   span_t erased;
   // REPLACE appears on current line?
   if( mfile.cur <= next_directive &&
                    next_directive < mfile.eol ) {
     if( off_coming_up ) {
       parse_replace_last_off(mfile);
     } else {
       erased = parse_replace_text(mfile);
     }
   }
   return erased;
 }

 /*
  * Maintain the number of newlines by counting those that will be
  * overwritten, and appending them to the appended line.  Return the
  * new EOL pointer.
  *
  * The newlines accumulate past eodata, at the start of the blank
  * lines created by the caller.
  */
 char *
 bytespan_t::append( const char *input, const char *eoinput ) {
   gcc_assert(data < eodata);

 #define LEXIO 0
 #if LEXIO
   auto nq = std::count_if(data, eodata, isquote);
   dbgmsg("%s:%3d:  input ------ '%.*s'", __func__, __LINE__, int(eoinput - input), input);
   dbgmsg("%s:%3d:  precondition '%.*s' (" HOST_SIZE_T_PRINT_UNSIGNED ": %s)",
          __func__, __LINE__,
          int(size()), data, (fmt_size_t)nq, in_string()? "in string" : "not in string");
 #endif
   if( !in_string() ) { // Remove trailing space unless it's part of a literal.
     while(data < eodata && ISSPACE(eodata[-1])) eodata--;
     gcc_assert(ISSPACE(eodata[0]));
     gcc_assert(data == eodata || !ISSPACE(eodata[-1]));
   }
   // skip leading blanks
   while( input < eoinput && ISSPACE(*input) ) input++;
   if( isquote(*input) ) input++;

   size_t len = eoinput - input;
   char * pend = eodata + len;

   int nnl =   std::count(eodata,   pend, '\n');   // newlines to be overwritten
   gcc_assert(0 == std::count(input, eoinput, '\n'));  // newlines in input

   memmove(eodata, input, len);
   nnl += std::count(pend, pend + nnl, '\n'); // other newlines to be overwritten
   std::fill(pend, pend + nnl, '\n');

   eodata = pend;

 #if LEXIO
   dbgmsg("%s:%3d: postcondition '%.*s'", __func__, __LINE__, int(size() + len) - 1, data);
 #endif

   return eodata;
 }

 const char * cobol_filename();

 static filespan_t&
 mapped_file( FILE *input ) {
   static std::map<int, filespan_t> inputs;

   int fd = fileno(input);
   gcc_assert(fd > 0);

   filespan_t& mfile = inputs[fd];
   if( mfile.data ) {
     return mfile;
   }

   struct stat sb;
   if( 0 != fstat(fd, &sb) ) {
     cbl_err( "%s: could not stat fd %d", __func__, fd );
   }

   mfile.use_nada();

   if( sb.st_size > 0 ) {
     static const int flags = MAP_PRIVATE;

     void *p = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, flags, fd, 0);
     if( p == MAP_FAILED ) {
       cbl_err( "%s: could not map fd %d", __func__, fd );
     }

     mfile.lineno_reset();
     mfile.data = mfile.cur = mfile.eol = mfile.eodata = static_cast<char*>(p);
     mfile.eodata += sb.st_size;
   }
   return mfile;
 }

 char filespan_t::empty_file[8] = "      \n";

 static void unmap_file( filespan_t& mfile  ) {
   if( ! mfile.nada() ) {
     munmap(mfile.data, mfile.size() - 1);
   }
   mfile = filespan_t();
 }

 extern int yylineno;

 static void
 print_lexer_input( const char *buf, const char *ebuf ) {
   const char *eol, *lexio = getenv("lexio");
   int i;
   static int nbuf = 1;
   static FILE *output = NULL;

   if( !lexio ) return;
   if( !output ) {
     output = fopen( lexio, "w" );
     if( !output ) output = stderr;
   }

   fprintf( output, "*> buffer %d\n", nbuf );
   for( i = 0, eol = std::find(buf, ebuf, '\n');
        eol != ebuf; buf = eol, eol = std::find(buf, ebuf, '\n'), i++ ) {
     eol++;
     fprintf( output, "%5d %.*s", yylineno + i, int(eol - buf), buf );
   }
   if( buf < ebuf ) {
     fprintf( output, "%5d %.*s", yylineno + i, int(eol - buf), buf );
   }
   fprintf( output, "*> endbuf %d\n", nbuf++ );
   fflush(output);
 }

 /*
  * Fill about as much of the lexer's buffer as possible, except skip
  * leading blanks on blank lines.
  */
 int
 lexer_input( char buf[], int max_size, FILE *input ) {
   filespan_t& mfile( mapped_file(input) );

   if( mfile.cur == mfile.eodata ) {
     if( mfile.cur ) unmap_file(mfile);
     return 0;
   }

   gcc_assert( mfile.data <= mfile.cur && mfile.cur < mfile.eodata );

   char *next = std::min(mfile.eodata, mfile.cur + max_size);
   buffer_t output(buf, buf + max_size); // initializes pos

   // Fill output, keeping only NL for blank lines.
   for( auto p = mfile.cur; p < next; *output.pos++ = *p++ ) {
     static bool at_bol = false;
     if( at_bol ) {
       auto nonblank_l = std::find_if( p, next,
                                     []( char ch ) {
                                       return !isblank(ch); } );
       if( nonblank_l + 1 < next ) {
         if( *nonblank_l == '\r' ) nonblank_l++; // Windows
         if( *nonblank_l == '\n' ) {
           p = nonblank_l;
           continue;
         }
       }
     }
     at_bol = *p == '\n';
   }

   gcc_assert(  output.pos <= output.eodata );
   output.eodata = output.pos;

   mfile.cur = next;
   gcc_assert(mfile.cur <= mfile.eodata);

   // Buffer full or input exhausted.
   print_lexer_input(output.data, output.eodata);

   return output.size();
 }

 static const char *
 find_filter( const char filter[] ) {

   if( 0 == access(filter, X_OK) ) {
     return filter;
   }

   const char *path = getenv("PATH");
   if( ! path ) return NULL;
   char *p = xstrdup(path), *eopath = p + strlen(p);

   while( *p != '\0' ) {
     auto pend = std::find( p, eopath, ':' );
     if( *pend == ':' ) *pend++ = '\0';

     char *name = xasprintf( "%s/%s", p, filter );

     if( 0 == access(name, X_OK) ) {
       return name;
     }
     p = pend;
   }
   return NULL;
 }

 bool verbose_file_reader = false;

 typedef std::pair <char *, std::list<std::string> > preprocessor_filter_t;
 static std::list<preprocessor_filter_t> preprocessor_filters;
 static std::list<const char *> included_files;

 /*
  * Keep a list of files added with -include on the command line.
  */
 bool
 include_file_add(const char filename[]) {
   struct stat sb;
   if( -1 == stat(filename, &sb) ) return false;
   included_files.push_back(filename);
   return true;
 }

 bool
 preprocess_filter_add( const char input[] ) {
   std::list <std::string> options;
   std::string filter(input);
   size_t pos = filter.find(",");

   if( pos != filter.npos ) {
     std::vector<char> others( filter.size() - pos, '\0' );
     std::copy( filter.begin() + pos + 1, filter.end(), others.begin() );
     filter.resize(pos);
     char *optstr = others.data();
     for( char *opt = optstr + 1; (opt = strtok(opt, ",")); opt = NULL ) {
       options.push_back(opt);
     }
   }

   auto filename = find_filter(filter.c_str());
   if( !filename ) {
     yywarn("preprocessor '%s/%s' not found", getcwd(NULL, 0), filter.c_str());
     return false;
   }
   preprocessor_filters.push_back( std::make_pair(xstrdup(filename), options) );
   return true;
 }

 void
 cdftext::echo_input( int input, const char filename[] ) {
   int fd;
   if( -1 == (fd = dup(input)) ) {
       yywarn( "could not open preprocessed file %s to echo to standard output",
                filename );
       return;
   }

   auto mfile = map_file(fd);

   if( -1 == write(STDOUT_FILENO, mfile.data, mfile.size()) ) {
     yywarn( "could not write preprocessed file %s to standard output",
           filename );
   }
   if( -1 == munmap(mfile.data, mfile.size()) ) {
     yywarn( "could not release mapped file" );
   }
   if( -1 == close(fd) ) {
     yywarn( "could not close mapped file" );
   }
 }

 static inline ino_t
 inode_of( int fd ) {
   struct stat sb;
   if( -1 == fstat(fd, &sb) ) {
     cbl_err( "could not stat fd %d", fd);
   }
   return sb.st_ino;
 }

 FILE *
 cdftext::lex_open( const char filename[] ) {
   int input = open_input( filename );
   if( input == -1 ) return NULL;

   int output = open_output();

   // Process any files supplied by the -include command-line option.
   for( auto name : included_files ) {
     if( -1 == (input = open(name, O_RDONLY)) ) {
       yyerrorvl(1, "", "cannot open -include file %s", name);
       continue;
     }
     cobol_filename(name, inode_of(input));
     filespan_t mfile( free_form_reference_format( input ) );

     process_file( mfile, output );

     cobol_filename_restore(); // process_file restores only for COPY
   }
   included_files.clear();

   cobol_filename(filename, inode_of(input));
   filespan_t mfile( free_form_reference_format( input ) );

   process_file( mfile, output );

   if( lexer_echo() ) {
     echo_input(output, filename);
   }

   for( auto filter_pair : preprocessor_filters ) {
     input  = output;
     output = open_output();

     char *filter = filter_pair.first;
     std::list<std::string>& options = filter_pair.second;

     std::vector <char*> argv(2 + options.size(), NULL);
     argv[0] = filter;

     auto last_argv = std::transform( options.begin(), options.end(), argv.begin() + 1,
                                      []( const std::string& opt ) {
                                        return xstrdup(opt.c_str());
                                      } );
     *last_argv = NULL;

     pid_t pid = fork();

     switch(pid){
     case -1: cbl_err( "%s", __func__);
       break;
     case 0: // child
       if( -1 == dup2(input, STDIN_FILENO) ) {
         cbl_err( "%s: could not dup input", __func__);
       }
       if( -1 == dup2(output, STDOUT_FILENO) ) {
         cbl_err( "%s: could not dup output", __func__);
       }
       if( -1 == lseek(STDIN_FILENO, SEEK_SET, 0) ) {
         cbl_err( "%s: could not seek to start of file", __func__);
       }
       int erc;
       if( -1 == (erc = execv(filter, argv.data())) ) {
         yywarn("could not execute %s", filter);
       }
       _exit(erc);
     }
     int status;
     auto kid = wait(&status);
     gcc_assert(pid == kid);
     if( kid == -1 ) cbl_err( "failed waiting for pid %ld", static_cast<long>(pid));

     if( WIFSIGNALED(status) ) {
       cbl_errx( "%s pid %ld terminated by %s",
                 filter, static_cast<long>(kid), strsignal(WTERMSIG(status)) );
     }
     if( WIFEXITED(status) ) {
       if( (status = WEXITSTATUS(status)) != 0 ) {
         cbl_errx( "%s exited with status %d",
              filter, status);
       }
     }
     yywarn( "applied %s", filter );
   }

   return fdopen( output, "r");
 }

 int
 cdftext::open_input( const char filename[] ) {
   int fd = open(filename, O_RDONLY);
   if( fd == -1 ) {
     dbgmsg( "could not open '%s': %s", filename, xstrerror(errno) );
   }

   verbose_file_reader = NULL != getenv("GCOBOL_TEMPDIR");

   if( verbose_file_reader ) {
     yywarn("verbose: opening %s for input", filename);
   }
   return fd;
 }

 int
 cdftext::open_output() {
   char *name = getenv("GCOBOL_TEMPDIR");

   if( name && 0 != strcmp(name, "/") ) {
     int fd;
     char * stem = xasprintf("%sXXXXXX", name);
     if( -1 == (fd = mkstemp(stem)) ) {
       cbl_err( "could not open temporary file '%s' (%s)",
                name, realpath(name, stem));
     }
     return fd;
   }

   FILE *fh = tmpfile();
   if( !fh ) {
     cbl_err("could not create temporary file");
   }

   return fileno(fh);
 }

 filespan_t
 cdftext::map_file( int fd ) {
   gcc_assert(fd > 0);

   filespan_t mfile;
   mfile.use_nada();

   struct stat sb;
   do {
     if( 0 != fstat(fd, &sb) ) {
       cbl_err( "%s: could not stat fd %d", __func__, fd );
     }
     if( S_ISFIFO(sb.st_mode) ) {
       // Copy FIFO to regular file that can be mapped.
       int input = open_output();
       std::swap(fd, input); // fd will continue to be the input
       static char block[4096 * 4];
       ssize_t n;
       while( (n = read(input, block, sizeof(block))) != 0 ) {
         ssize_t nout = write(fd, block, n);
         if( nout != n ) {
           cbl_err( "%s: could not prepare map file from FIFO %d",
               __func__, input);
         }
         if( false ) dbgmsg("%s: copied " HOST_SIZE_T_PRINT_DEC " bytes from FIFO",
                                 __func__, (fmt_size_t)nout);
       }
     }
   } while( S_ISFIFO(sb.st_mode) );

   if( sb.st_size > 0 ) {
     static const int flags = MAP_PRIVATE;

     void *p = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, flags, fd, 0);
     if( p == MAP_FAILED ) {
       cbl_err( "%s: could not map fd %d", __func__, fd );
     }

     mfile.lineno_reset();
     mfile.data = mfile.cur = mfile.eol = mfile.eodata = static_cast<char*>(p);
     mfile.eodata += sb.st_size;
   }

   return mfile;
 }

 bool lexio_dialect_mf();

 /*
  * A valid sequence area is 6 digits or blanks at the begining of the line that
  * contains PROGRAM-ID. Return NULL if no valid sequence area, else return
  * pointer to BOL.
  */
 static const char *
 valid_sequence_area( const char *data, const char *eodata ) {

   for( const char *p = data;
        (p = std::find_if(p, eodata, is_p)) != eodata;
        p++ )
   {
     auto eol = std::find(p, eodata, '\n');
     if( p == data || ISSPACE(p[-1]) ) {
       if( is_program_id(p, eol) ) {  // found program-id token
 	const char *bol = p;
 	for( ; data <= bol-1 && bol[-1] != '\n'; --bol )
 	  ;
 	if( 6 < p - bol ) {
 	  if( std::all_of(bol, bol+6, ::isdigit) ) {
 	    return bol;
 	  }
 	  if( std::all_of(bol, bol+6, ::isblank) ) {
 	    return bol;
 	  }
 	  break;
 	}
       }
     }
   }
   return nullptr;
 }

 /*
  * Reference Format -- valid COBOL between columns 8 and 72 -- has data after
  * column 72 on the PROGRAM-ID line. Extended Reference Format (that allows
  * longer lines) has no reason to follow the PROGRAM-ID with more stuff.
  */
 static bool
 infer_reference_format( const char *bol, const char *eodata ) {
   assert(bol);
   auto eol = std::find(bol, eodata, '\n');
   if( 72 < eol - bol ) {
     return ! std::all_of(bol + 72, eol, ::isspace);
   }
   return false;
 }

 filespan_t
 cdftext::free_form_reference_format( int input ) {
   filespan_t source_buffer = map_file(input);
   filespan_t mfile(source_buffer);

   /*
    * current_line_t describes the segment of mapped file that is the
    * "current line" being processed.  Its only use is for line
    * continuation, whether string literals or not.
    */
   struct current_line_t {
     size_t lineno;
     bytespan_t line;
     // construct with length zero
     explicit current_line_t( char data[] ) : lineno(0), line(data, data) {}
   } current( mfile.data );

   /*
    * Infer source code format.
    */
   if( cdf_source_format().inference_pending()  ) {
     const char *bol = valid_sequence_area(mfile.data, mfile.eodata);
     if( bol ) {
       cdf_source_format().infer( bol, infer_reference_format(bol, mfile.eodata) );
     }
   }

   while( mfile.next_line() ) {
     check_push_pop_directive(mfile);
     check_source_format_directive(mfile);
     remove_inline_comment(mfile.cur, mfile.eol);

     if( mfile.is_blank_line() ) continue;

     char *indcol = indicated(mfile.cur, mfile.eol); // true only for fixed
     //                                              // format

     if( is_fixed_format() && !indcol ) { // short line
       erase_source(mfile.cur, mfile.eol);
     }

     if( indcol ) {
       // Set to blank columns 1-6 and anything past the right margin.
       erase_source(mfile.cur, indcol);
       if( is_reference_format() ) {
         if( mfile.cur + right_margin() <  mfile.eol ) {
           auto p = std::find(mfile.cur + right_margin(), mfile.eol, '\n');
           erase_source(mfile.cur + right_margin(), p);
         }
       }

       switch( TOUPPER(*indcol) ) {
       case '-':
         gcc_assert(0 < current.line.size());
         /*
          * The "current line" -- the line being continued -- may be many
          * lines earlier (with many intervening newlines) or may intrude
          * on its succeeding line.  Erase the continuation line.
          */
         {
           char *pend = mfile.eol;
           if( right_margin() ) {
             pend = std::min(mfile.cur + right_margin(), mfile.eol);
           }
           // The appended segment has no newline because the erased line retains
           // one.
           pend = std::find(indcol + 1, pend, '\n');
           char *p = current.line.append(indcol + 1, pend );
           if( (p = std::max(p, mfile.cur)) < mfile.eol ) {
             erase_source(p, mfile.eol);
           }
         }
         continue;
       case SPACE:
         break;
       case 'D':
         /*
          * Pass the D to the lexer, because WITH DEBUGGING MODE is
          * parsed in the parser.  This assumes too strict a rule: that
          * all the source is in one format. In fact, DEBUGGING MODE
          * could be set on, and >>SOURCE-FORMAT can switch back and
          * forth. To solve that, we'd have to parse WITH DEBUGGING MODE
          * in free_form_reference_format(), which is a lot of work for
          * an obsolete feature.
          */
         break;
       case '*':
       case '/':
         if( indcol < mfile.eol - 1 ) {
           erase_source(indcol, mfile.eol);
         }
         continue;
       case '$':
         if( lexio_dialect_mf() ) {
           break;
         }
         __attribute__ ((fallthrough));
       default: // flag other characters in indicator area
         if( ! ISSPACE(indcol[0]) ) {
           yyerrorvl( mfile.lineno(), cobol_filename(),
                      "error: stray indicator '%c' (0x%x): \"%.*s\"",
                      indcol[0], indcol[0],
                      int(mfile.line_length() - 1), mfile.cur );
           *indcol = SPACE;
         }
         break;
       }
     }
     current.line.update(mfile.cur, mfile.eol, right_margin());
     current.lineno = mfile.lineno();
   } // next line

   return source_buffer;
 }

 /*
  * process_file is a recursive routine that opens and processes
  * included files.  It uses the input file stack in two ways: to check
  * copybook uniqueness, and (via the lexer) to keep track filenames
  * and line numbers.
  *
  * When reading copybook files, the copybook object enforces the rule
  * that no copybook may include itself, even indirectly. It does that
  * by relying on the unique_stack to deny a push.  Because the reader
  * makes no attempt to count lines, line numbers in the input stack
  * are all 1 at this point.
  *
  * When returning from the top-level recursion, the input stack has
  * the original file's name on top, with depth 1.  At that point, the
  * lexer begins tokenizing the input.
  *
  * The input stream sent to the lexer is delimited by #FILE tokens
  * denoting the source filename.  As far as the lexer is concerned,
  * there's only ever one file: the name passed to lex_open() when we
  * kicked things off.  But messages and the debugger need to know
  * which file and line each statment appeared in.
  *
  * The lexer uses the input stack to keep track of names and
  * numbers. The top of the input file stack is the current file
  * context, initially set to line 1. When the lexer sees a push, it
  * updates the top-of-stack with the current line number, yylineno,
  * and then pushes the copybook filename with line 1. When it sees a
  * pop, the current file is popped, of course; its line number no
  * longer matters. Then the top-of-stack is used to update the current
  * cobol filename and yylineno.
  */
 void
 cdftext::process_file( filespan_t mfile, int output, bool second_pass ) {
   static size_t nfiles = 0;

   __gnu_cxx::stdio_filebuf<char> outbuf(fdopen(output, "a"), std::ios::out);
   std::ostream out(&outbuf);
   std::ostream_iterator<char> ofs(out);

   // indicate current file
   static const char file_push[] = "\f#FILE PUSH ", file_pop[] = "\f#FILE POP\f";

   if( !included_files.empty() ) { ++nfiles; }; // force push/pop of included filename
   if( !second_pass && nfiles++ ) {
     static const char delimiter[] = "\f";
     const char *filename = cobol_filename();
     std::copy(file_push, file_push + strlen(file_push), ofs);
     std::copy(filename, filename + strlen(filename), ofs);
     std::copy(delimiter, delimiter + strlen(delimiter), ofs);
     out.flush();
   }

   // parse CDF directives
   while( mfile.next_line() ) {
     yylloc = mfile.as_location();
     auto copied = parse_copy_directive(mfile);
     if( copied.parsed && copied.fd != -1 ) {
       gcc_assert(copied.erased_lines.p);
       std::copy_if(copied.erased_lines.p, copied.erased_lines.pend, ofs,
                    []( char ch ) { return ch == '\n'; } );
       struct { int in, out; filespan_t mfile; } copy;
       dbgmsg("%s:%d: line " HOST_SIZE_T_PRINT_UNSIGNED ", opening %s on fd %d",
              __func__, __LINE__, (fmt_size_t)mfile.lineno(),
              copybook.source(), copybook.current()->fd);
       copy.in = copybook.current()->fd;
       copy.mfile = free_form_reference_format( copy.in );

       if( copied.partial_line.size() ) {
         std::copy(copied.partial_line.p, copied.partial_line.pend, ofs);
       }
       out.flush();

       if( copied.nreplace == 0 ) {
         // process with extant REPLACE directive
         process_file(copy.mfile, output);
       } else {
         copy.out = open_output();
         // process to intermediate, applying COPY ... REPLACING
         process_file(copy.mfile, copy.out);
         copy.mfile = map_file(copy.out);
         replace_directives.pop();
         // process intermediate with extant REPLACE directive
         process_file(copy.mfile, output, true);
         // COPY statement is erased from input if processed successfully
       }
       cobol_filename_restore();
     }

     auto erased = parse_replace_directive(mfile);
     if( erased.p ) {
       std::copy_if( erased.p, erased.pend, ofs,
                     []( char ch ) { return ch == '\n'; } );
     }
     if( replace_directives.empty() ) {
       std::copy(mfile.cur, mfile.eol, ofs);
       continue; // No active REPLACE directive.
     }

     std::list<span_t> segments = segment_line(mfile);

     for( const auto& segment : segments ) {
       std::copy(segment.p, segment.pend, ofs);
     }

     out.flush();
   }
   // end of file
   if( !second_pass && --nfiles ) {
     std::copy(file_pop, file_pop + strlen(file_pop), ofs);
     out.flush();
   }
   if( !included_files.empty() ) { --nfiles; };
 }

 std::list<span_t>
 cdftext::segment_line( filespan_t& mfile ) {
   std::list<span_t> output;

   gcc_assert( ! replace_directives.empty() );
   std::list<replace_t> pending;
   recognize_replacements( mfile, pending );

   if( pending.empty() ) {
     output.push_back( span_t(mfile.cur, mfile.eol) );
     return output;
   }

   /*
    * If the replacement changes the number of lines in the replaced text, we
    * need to reset the line number, because the next statement is on a
    * different line in the manipulated text than in the original.  Before each
    * replacement, set the original line number.  After each replacement, set
    * the line number after the elided text on the next line.
    */
   for( const replace_t& segment : pending ) {
     gcc_assert(mfile.cur <= segment.before.p);
     gcc_assert(segment.before.pend <= mfile.eodata);

     struct { unsigned long ante, post; } lineno = {
       gb4(mfile.lineno()), gb4(mfile.lineno() + segment.after.nlines())
     };
     const char *directive = lineno.ante == lineno.post?
       nullptr : xasprintf("\n#line %lu \"%s\"\n",
                           lineno.ante, cobol_filename());

     if( directive )
       output.push_back( span_t(strlen(directive), directive) );
     output.push_back( span_t(mfile.cur, segment.before.p) );
     output.push_back( span_t(segment.after.p, segment.after.pend ) );
     if( directive )
       output.push_back( span_t(strlen(directive), directive) );

     mfile.cur = const_cast<char*>(segment.before.pend);
   }

   if( mfile.eol < mfile.cur ) {
     if( (mfile.eol = std::find(mfile.cur, mfile.eodata, '\n')) < mfile.eodata ) {
       mfile.eol++;
     }
   }

   // last segment takes to EOL
   output.push_back( span_t(mfile.cur, mfile.eol) );

   return output;
 }