| /* |
| * Copyright (c) 2021-2025 Symas Corporation |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of the Symas Corporation nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifndef CHARMAPS_H |
| #define CHARMAPS_H |
| |
| #include <string> |
| #include <vector> |
| |
| #include <unistd.h> |
| |
| /* There are four distinct codeset domains in the COBOL compiler. |
| * |
| * First is the codeset of the console. Established by looking at what |
| * setlocale() reports, this can be either UTF-8 or some ASCII based code |
| * page. (We assume CP1252). Data coming from the console or the system, |
| * ACCEPT statements; redirected console input, getenv() and other system |
| * calls are in the "console" domain. |
| * |
| * Second is the internal single-byte-coded codeset of the data, in memory, |
| * being manipulated by the generated code of the cobol executable. The actual |
| * codeset of "internal" is either EBCDIC (in the form of Code Page 1140 or |
| * ASCII (Code Page 1252) |
| * |
| * Third is the C++ source code of the GCOBOL compiler; this comment is |
| * in that environment. We neither know, nor care, if this code is encoded in |
| * in UTF-8 (as is probable, in these enlighted days of 2022) or something like |
| * Code Page1252. We are going to regard it as "ascii" under the |
| * assumption that there is no reason for any character in the compiler's |
| * source code to have a code point outside of the plain vanilla 0x20 through |
| * 0x7F range. |
| * |
| * Fourth is the "raw" COBOL source code that is the input to the GCOBOL |
| * compiler. This domain can be either UTF-8 or something like CodePage1252. |
| * Which encoding is relevant; The literal string MOVE "<euro>1234" is seven |
| * bytes long in UTF-8, and five bytes long in CP1252. We start with an |
| * assumption that it is UTF-8 and switch to CP1252 upon encountering a byte |
| * sequence with values above 0x80 that can't be UTF-8. We have provision for |
| * forcing it to be one or the other. Codepoints in that domain are referenced |
| * as "raw". Codepoint in the "raw" domain don't last long; they are be |
| * converted to either "ascii" or "internal" early on, as necessary. |
| */ |
| |
| |
| /* Notes on character codesets: |
| |
| This library is implemented to handle "native" codesets of either ASCII (in |
| the form of a single-byte-coded codeset like code page 1252) or EBCDIC (in |
| the form of a single-byte-coded codeset like code page 1140). |
| |
| This C/C++ source code, however, is assumed to be an ASCII-based codeset, |
| so that a character constant like a space is assumed to encode as 0x20. |
| |
| Furthermore, we assume that the codeset of the COBOL source code being |
| compiled is also ASCII-based, even if it is actually UTF-8. Said another |
| way, characters encoded between zero and 127 are regarded as ASCII. |
| |
| This means that we are not going to try to compile EBCDIC COBOL source code; |
| any such will have to be externally converted to ASCII before feeding it |
| through this compiler on an ASCII based Linux system. |
| |
| This situation is rife for confusion here in the source code for the |
| library. |
| |
| To help reduce that confusion, we are going to eschew character constants |
| in the C/C++ source code. Instead, we use symbolic versions. In general, |
| "source_space" means 0x20, while "internal_space" will be either 0x20 |
| when using the ASCII-based native codeset, or it will be 0x40 when using |
| the EBCDIC-based native codeset. |
| |
| Maintaining one's sanity while learning and working with this C/C++ code |
| will require a firm grip on context. You'll have to keep track of whether |
| the character is being used to analyze the ASCII-based COBOL source, or |
| whether the character in question is part of the native COBOL cobol data |
| that is being analyzed or generated. |
| |
| For example, when a PICTURE string has in it a source_nine, the generated |
| result in the variable is based on character_zero. |
| |
| Stay alert! */ |
| |
| extern int __gg__decimal_point ; |
| extern int __gg__decimal_separator ; |
| extern int __gg__quote_character ; |
| extern int __gg__low_value_character ; |
| extern int __gg__high_value_character ; |
| extern std::vector<std::string> __gg__currency_signs ; |
| extern int __gg__default_currency_sign; |
| extern cbl_encoding_t __gg__display_encoding ; |
| extern cbl_encoding_t __gg__national_encoding ; |
| |
| #define NULLCH ('\0') |
| #define DEGENERATE_HIGH_VALUE 0xFF |
| #define DEGENERATE_LOW_VALUE 0x00 |
| |
| #define ascii_A ((uint8_t)('A')) |
| #define ascii_B ((uint8_t)('B')) |
| #define ascii_C ((uint8_t)('C')) |
| #define ascii_D ((uint8_t)('D')) |
| #define ascii_E ((uint8_t)('E')) |
| #define ascii_F ((uint8_t)('F')) |
| #define ascii_G ((uint8_t)('G')) |
| #define ascii_H ((uint8_t)('H')) |
| #define ascii_I ((uint8_t)('I')) |
| #define ascii_J ((uint8_t)('J')) |
| #define ascii_K ((uint8_t)('K')) |
| #define ascii_L ((uint8_t)('L')) |
| #define ascii_M ((uint8_t)('M')) |
| #define ascii_N ((uint8_t)('N')) |
| #define ascii_O ((uint8_t)('O')) |
| #define ascii_P ((uint8_t)('P')) |
| #define ascii_Q ((uint8_t)('Q')) |
| #define ascii_R ((uint8_t)('R')) |
| #define ascii_S ((uint8_t)('S')) |
| #define ascii_T ((uint8_t)('T')) |
| #define ascii_U ((uint8_t)('U')) |
| #define ascii_V ((uint8_t)('V')) |
| #define ascii_W ((uint8_t)('W')) |
| #define ascii_X ((uint8_t)('X')) |
| #define ascii_Y ((uint8_t)('Y')) |
| #define ascii_Z ((uint8_t)('Z')) |
| #define ascii_a ((uint8_t)('a')) |
| #define ascii_b ((uint8_t)('b')) |
| #define ascii_c ((uint8_t)('c')) |
| #define ascii_d ((uint8_t)('d')) |
| #define ascii_e ((uint8_t)('e')) |
| #define ascii_f ((uint8_t)('f')) |
| #define ascii_g ((uint8_t)('g')) |
| #define ascii_h ((uint8_t)('h')) |
| #define ascii_i ((uint8_t)('i')) |
| #define ascii_j ((uint8_t)('j')) |
| #define ascii_k ((uint8_t)('k')) |
| #define ascii_l ((uint8_t)('l')) |
| #define ascii_m ((uint8_t)('m')) |
| #define ascii_n ((uint8_t)('n')) |
| #define ascii_o ((uint8_t)('o')) |
| #define ascii_p ((uint8_t)('p')) |
| #define ascii_q ((uint8_t)('q')) |
| #define ascii_r ((uint8_t)('r')) |
| #define ascii_s ((uint8_t)('s')) |
| #define ascii_t ((uint8_t)('t')) |
| #define ascii_u ((uint8_t)('u')) |
| #define ascii_v ((uint8_t)('v')) |
| #define ascii_w ((uint8_t)('w')) |
| #define ascii_x ((uint8_t)('x')) |
| #define ascii_y ((uint8_t)('y')) |
| #define ascii_z ((uint8_t)('z')) |
| #define ascii_space ((uint8_t)(' ')) |
| #define ascii_zero ((uint8_t)('0')) |
| #define ascii_0 ((uint8_t)('0')) |
| #define ascii_1 ((uint8_t)('1')) |
| #define ascii_2 ((uint8_t)('2')) |
| #define ascii_3 ((uint8_t)('3')) |
| #define ascii_4 ((uint8_t)('4')) |
| #define ascii_5 ((uint8_t)('5')) |
| #define ascii_6 ((uint8_t)('6')) |
| #define ascii_7 ((uint8_t)('7')) |
| #define ascii_8 ((uint8_t)('8')) |
| #define ascii_9 ((uint8_t)('9')) |
| #define ascii_nine ((uint8_t)('9')) |
| #define ascii_period ((uint8_t)('.')) |
| #define ascii_colon ((uint8_t)(':')) |
| #define ascii_comma ((uint8_t)(',')) |
| #define ascii_dollar_sign ((uint8_t)('$')) |
| #define ascii_dquote ((uint8_t)('"')) |
| #define ascii_oparen ((uint8_t)('(')) |
| #define ascii_caret ((uint8_t)('^')) |
| #define ascii_slash ((uint8_t)('/')) |
| #define ascii_plus ((uint8_t)('+')) |
| #define ascii_minus ((uint8_t)('-')) |
| #define ascii_hyphen ((uint8_t)('-')) |
| #define ascii_underscore ((uint8_t)('_')) |
| #define ascii_asterisk ((uint8_t)('*')) |
| #define ascii_query ((uint8_t)('?')) |
| #define ascii_cr ((uint8_t)('\r')) |
| #define ascii_ff ((uint8_t)('\f')) |
| #define ascii_newline ((uint8_t)('\n')) |
| #define ascii_return ((uint8_t)('\r')) |
| |
| extern unsigned char __gg__data_space[1] ; |
| extern unsigned char __gg__data_low_values[1] ; |
| extern unsigned char __gg__data_zeros[1] ; |
| extern unsigned char __gg__data_high_values[1] ; |
| extern unsigned char __gg__data_quotes[1] ; |
| extern unsigned char __gg__data_upsi_0[2] ; |
| extern short __gg__data_return_code ; |
| |
| // These are the various hardcoded tables used for conversions. |
| extern const unsigned short __gg__one_to_one_values[256]; |
| extern const unsigned short __gg__cp1252_to_cp1140_values[256]; |
| extern const unsigned short __gg__cp1140_to_cp1252_values[256]; |
| |
| // These are the two standard collations. |
| extern const unsigned short __gg__cp1252_to_ebcdic_collation[256]; |
| extern const unsigned short __gg__ebcdic_to_cp1252_collation[256]; |
| |
| const char * __gg__encoding_iconv_name( cbl_encoding_t encoding ); |
| cbl_encoding_t __gg__encoding_iconv_type( const char *name ); |
| |
| char * __gg__iconverter(cbl_encoding_t from, |
| cbl_encoding_t to, |
| const char *str, |
| size_t length, |
| size_t *outlength); |
| |
| #define DEFAULT_SOURCE_ENCODING (iconv_CP1252_e) |
| |
| class charmap_t |
| { |
| private: |
| // This is the encoding of this character map |
| cbl_encoding_t m_encoding; |
| bool m_is_valid; |
| bool m_is_big_endian; |
| bool m_has_bom = false; |
| int m_stride; // Number of bytes between one character and the next |
| |
| enum |
| { |
| sign_type_ascii, |
| sign_type_ebcdic, |
| } m_numeric_sign_type; |
| |
| // This map retains the ASCII-to-encoded value in m_encoding, so that iconv |
| // need be called but once for each ASCII value. |
| std::unordered_map<int, int>m_map_of_encodings; |
| |
| public: |
| explicit charmap_t(cbl_encoding_t e) : m_encoding(e) |
| { |
| // We are constructing a new charmap_t from an arbitrary encoding. We |
| // need to figure out how wide it is, its endianness, whether or not |
| // it is EBCDIC-based, and so on. |
| |
| // We do that by converting "0" to the target encoding, and we analyze |
| // what we get back. |
| |
| size_t outlength = 0; |
| const char challenge[] = "0"; |
| const unsigned char *response = PTRCAST(unsigned char, |
| __gg__iconverter(DEFAULT_SOURCE_ENCODING, |
| m_encoding, |
| challenge, |
| 1, |
| &outlength)); |
| unsigned char char_0 = 0x00; |
| |
| m_is_valid = false; |
| m_has_bom = false; |
| m_is_big_endian = false; |
| |
| if( outlength == 1 ) |
| { |
| m_stride = 1; |
| // This is our happy place: A single-byte encoded character set. |
| char_0 = response[0]; |
| } |
| else if( outlength == 2 ) |
| { |
| m_stride = 2; |
| if( response[0] ) |
| { |
| char_0 = response[0]; |
| } |
| else if( response[1] ) |
| { |
| m_is_big_endian = true; |
| char_0 = response[1]; |
| } |
| } |
| else if( outlength == 4 ) |
| { |
| // Check for the Byte Order Mark (BOM) |
| if( response[0] == 0xFF && response[1] == 0xFE ) |
| { |
| m_stride = 2; |
| m_has_bom = true; |
| char_0 = response[2]; |
| } |
| else if( response[0] == 0xFE && response[1] == 0xFF ) |
| { |
| m_stride = 2; |
| m_has_bom = true; |
| m_is_big_endian = true; |
| char_0 = response[3]; |
| } |
| else if( response[0] ) |
| { |
| m_stride = 4; |
| char_0 = response[0]; |
| } |
| else |
| { |
| m_stride = 4; |
| m_is_big_endian = true; |
| char_0 = response[3]; |
| } |
| } |
| else if( outlength == 8 ) |
| { |
| m_stride = 4; |
| if( response[0] == 0xFF && response[1] == 0xFE ) |
| { |
| char_0 = response[4]; |
| } |
| else if( response[0] == 0xFE && response[1] == 0xFF ) |
| { |
| m_is_big_endian = true; |
| char_0 = response[7]; |
| } |
| } |
| |
| // With everything else established, we now check the zero character. |
| // We know about only 0x30 for ASCII and 0xF0 for EBCDIC. |
| if( char_0 == 0x30 ) |
| { |
| m_is_valid = true; |
| m_numeric_sign_type = sign_type_ascii; |
| } |
| else if( char_0 == 0xF0 ) |
| { |
| m_is_valid = true; |
| m_numeric_sign_type = sign_type_ebcdic; |
| } |
| } |
| |
| bool is_valid() const{return m_is_valid ;} |
| bool is_big_endian() const{return m_is_big_endian;} |
| bool has_bom() const{return m_has_bom ;} |
| int stride() const{return m_stride ;} |
| |
| int mapped_character(int ch) |
| { |
| // The assumption is that anybody calling this routine is providing |
| // a single-byte character in the DEFAULT_SOURCE_ENCODING encoding. We |
| // return the equivalent character in the m_encoding |
| int retval; |
| std::unordered_map<int, int>::const_iterator it = |
| m_map_of_encodings.find(ch); |
| if( it != m_map_of_encodings.end() ) |
| { |
| retval = it->second; |
| } |
| else |
| { |
| retval = 0; |
| size_t outlength = 0; |
| const char *mapped = __gg__iconverter(DEFAULT_SOURCE_ENCODING, |
| m_encoding, |
| PTRCAST(char, &ch), |
| 1, |
| &outlength); |
| memcpy(&retval, mapped, outlength); |
| m_map_of_encodings[ch] = retval; |
| } |
| return retval; |
| } |
| |
| int decimal_point() |
| { |
| return mapped_character(__gg__decimal_point); |
| } |
| int decimal_separator() |
| { |
| return mapped_character(__gg__decimal_separator); |
| } |
| int quote_character() |
| { |
| return mapped_character(__gg__quote_character); |
| } |
| int low_value_character() |
| { |
| return mapped_character(__gg__low_value_character); |
| } |
| int high_value_character() |
| { |
| return mapped_character(__gg__high_value_character); |
| } |
| |
| int figconst_character(cbl_figconst_t figconst) |
| { |
| int const_char = 0; // Head off a compiler warning |
| switch(figconst) |
| { |
| case normal_value_e : |
| const_char = -1; |
| break; |
| case low_value_e : |
| const_char = low_value_character(); |
| break; |
| case zero_value_e : |
| const_char = mapped_character(ascii_0); |
| break; |
| case space_value_e : |
| const_char = mapped_character(ascii_space); |
| break; |
| case quote_value_e : |
| const_char = quote_character(); |
| break; |
| case high_value_e : |
| const_char = high_value_character(); |
| break; |
| case null_value_e: |
| const_char = '\0'; |
| break; |
| default: |
| abort(); |
| break; |
| } |
| return const_char; |
| } |
| |
| bool |
| is_digit_negative(int digit) |
| { |
| bool retval; |
| switch(m_numeric_sign_type) |
| { |
| case sign_type_ascii: |
| retval = !!(digit & NUMERIC_DISPLAY_SIGN_BIT_ASCII); |
| break; |
| |
| case sign_type_ebcdic: |
| retval = !!((~digit) & NUMERIC_DISPLAY_SIGN_BIT_EBCDIC); |
| break; |
| } |
| return retval; |
| } |
| |
| int |
| set_digit_negative(int digit, bool is_negative) |
| { |
| switch(m_numeric_sign_type) |
| { |
| case sign_type_ascii: |
| if( is_negative ) |
| { |
| digit |= NUMERIC_DISPLAY_SIGN_BIT_ASCII; |
| } |
| else |
| { |
| digit &= ~NUMERIC_DISPLAY_SIGN_BIT_ASCII; |
| } |
| break; |
| |
| case sign_type_ebcdic: |
| if( is_negative ) |
| { |
| digit &= ~NUMERIC_DISPLAY_SIGN_BIT_EBCDIC; |
| } |
| else |
| { |
| digit |= NUMERIC_DISPLAY_SIGN_BIT_EBCDIC; |
| } |
| break; |
| } |
| return digit; |
| } |
| |
| bool |
| is_like_ebcdic() const |
| { |
| return m_numeric_sign_type == sign_type_ebcdic; |
| } |
| |
| }; |
| |
| charmap_t *__gg__get_charmap(cbl_encoding_t encoding); |
| |
| #endif |