| /* |
| * Copyright (c) 2021-2025 Symas Corporation |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of the Symas Corporation nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifndef CHARMAPS_H |
| #define CHARMAPS_H |
| |
| #include <unistd.h> |
| |
| /* There are four distinct codeset domains in the COBOL compiler. |
| * |
| * First is the codeset of the console. Established by looking at what |
| * setlocale() reports, this can be either UTF-8 or some ASCII based code |
| * page. (We assume CP1252). Data coming from the console or the system, |
| * ACCEPT statements; redirected console input, getenv() and other system |
| * calls are in the "console" domain. |
| * |
 * Second is the internal single-byte-coded codeset of the data, in memory,
 * being manipulated by the generated code of the COBOL executable. The actual
 * codeset of "internal" is either EBCDIC (in the form of Code Page 1140) or
 * ASCII (in the form of Code Page 1252).
| * |
 * Third is the C++ source code of the GCOBOL compiler; this comment is
 * in that environment. We neither know, nor care, if this code is encoded
 * in UTF-8 (as is probable, in these enlightened days of 2022) or something
 * like Code Page 1252. We are going to regard it as "ascii" under the
 * assumption that there is no reason for any character in the compiler's
 * source code to have a code point outside of the plain vanilla 0x20 through
 * 0x7F range.
| * |
 * Fourth is the "raw" COBOL source code that is the input to the GCOBOL
 * compiler. This domain can be either UTF-8 or something like Code Page 1252.
 * Which encoding is in use matters: the literal in MOVE "<euro>1234" is seven
 * bytes long in UTF-8, and five bytes long in CP1252. We start with an
 * assumption that it is UTF-8 and switch to CP1252 upon encountering a byte
 * sequence with values above 0x7F that can't be UTF-8. We have provision for
 * forcing it to be one or the other. Codepoints in that domain are referenced
 * as "raw". Codepoints in the "raw" domain don't last long; they are
 * converted to either "ascii" or "internal" early on, as necessary.
| */ |
| |
| |
| /* Notes on character codesets: |
| |
| This library is implemented to handle "native" codesets of either ASCII (in |
| the form of a single-byte-coded codeset like code page 1252) or EBCDIC (in |
| the form of a single-byte-coded codeset like code page 1140). |
| |
| This C/C++ source code, however, is assumed to be an ASCII-based codeset, |
| so that a character constant like a space is assumed to encode as 0x20. |
| |
| Furthermore, we assume that the codeset of the COBOL source code being |
| compiled is also ASCII-based, even if it is actually UTF-8. Said another |
| way, characters encoded between zero and 127 are regarded as ASCII. |
| |
| This means that we are not going to try to compile EBCDIC COBOL source code; |
| any such will have to be externally converted to ASCII before feeding it |
| through this compiler on an ASCII based Linux system. |
| |
This situation is ripe for confusion here in the source code for the
library.
| |
To help reduce that confusion, we are going to eschew character constants
in the C/C++ source code. Instead, we use symbolic versions. In general,
"ascii_space" means 0x20, while "internal_space" will be either 0x20
when using the ASCII-based native codeset, or it will be 0x40 when using
the EBCDIC-based native codeset.
| |
Maintaining one's sanity while learning and working with this C/C++ code
will require a firm grip on context. You'll have to keep track of whether
the character is being used to analyze the ASCII-based COBOL source, or
whether the character in question is part of the native COBOL data
that is being analyzed or generated.
| |
For example, when a PICTURE string has in it an ascii_nine, the generated
result in the variable is based on internal_zero.
| |
| Stay alert! */ |
| |
| |
/* Flag recording whether the internal (run-time data) codeset is EBCDIC
   rather than ASCII.  Only declared here; the definition lives elsewhere.  */
extern bool __gg__ebcdic_codeset_in_use;

/* Readable spelling of the flag above.  */
#define internal_is_ebcdic (__gg__ebcdic_codeset_in_use)

/* Lookup table consulted by the internal_xxx macros of this header: indexing
   it with an ASCII code point yields the corresponding code point of the
   internal codeset.  */
extern const unsigned short *__gg__internal_codeset_map;

/* The NUL character.  */
#define NULLCH                ('\0')

/* NOTE(review): presumably the byte values used for HIGH-VALUE and LOW-VALUE
   when nothing better is available -- confirm against the users of these
   macros.  */
#define DEGENERATE_HIGH_VALUE 0xFF
#define DEGENERATE_LOW_VALUE  0x00
| |
/* Symbolic names for code points in the "ascii" domain (the compiler/library
   source code).  Each macro is the character constant narrowed to uint8_t, so
   it can be used directly as a table index or compared against raw bytes.  */

/* Upper-case letters.  */
#define ascii_A           ((uint8_t)'A')
#define ascii_B           ((uint8_t)'B')
#define ascii_C           ((uint8_t)'C')
#define ascii_D           ((uint8_t)'D')
#define ascii_E           ((uint8_t)'E')
#define ascii_F           ((uint8_t)'F')
#define ascii_G           ((uint8_t)'G')
#define ascii_H           ((uint8_t)'H')
#define ascii_I           ((uint8_t)'I')
#define ascii_J           ((uint8_t)'J')
#define ascii_K           ((uint8_t)'K')
#define ascii_L           ((uint8_t)'L')
#define ascii_M           ((uint8_t)'M')
#define ascii_N           ((uint8_t)'N')
#define ascii_O           ((uint8_t)'O')
#define ascii_P           ((uint8_t)'P')
#define ascii_Q           ((uint8_t)'Q')
#define ascii_R           ((uint8_t)'R')
#define ascii_S           ((uint8_t)'S')
#define ascii_T           ((uint8_t)'T')
#define ascii_U           ((uint8_t)'U')
#define ascii_V           ((uint8_t)'V')
#define ascii_W           ((uint8_t)'W')
#define ascii_X           ((uint8_t)'X')
#define ascii_Y           ((uint8_t)'Y')
#define ascii_Z           ((uint8_t)'Z')

/* Lower-case letters.  */
#define ascii_a           ((uint8_t)'a')
#define ascii_b           ((uint8_t)'b')
#define ascii_c           ((uint8_t)'c')
#define ascii_d           ((uint8_t)'d')
#define ascii_e           ((uint8_t)'e')
#define ascii_f           ((uint8_t)'f')
#define ascii_g           ((uint8_t)'g')
#define ascii_h           ((uint8_t)'h')
#define ascii_i           ((uint8_t)'i')
#define ascii_j           ((uint8_t)'j')
#define ascii_k           ((uint8_t)'k')
#define ascii_l           ((uint8_t)'l')
#define ascii_m           ((uint8_t)'m')
#define ascii_n           ((uint8_t)'n')
#define ascii_o           ((uint8_t)'o')
#define ascii_p           ((uint8_t)'p')
#define ascii_q           ((uint8_t)'q')
#define ascii_r           ((uint8_t)'r')
#define ascii_s           ((uint8_t)'s')
#define ascii_t           ((uint8_t)'t')
#define ascii_u           ((uint8_t)'u')
#define ascii_v           ((uint8_t)'v')
#define ascii_w           ((uint8_t)'w')
#define ascii_x           ((uint8_t)'x')
#define ascii_y           ((uint8_t)'y')
#define ascii_z           ((uint8_t)'z')

/* The blank.  */
#define ascii_space       ((uint8_t)' ')

/* Decimal digits; ascii_zero and ascii_nine are alternate spellings of
   ascii_0 and ascii_9.  */
#define ascii_0           ((uint8_t)'0')
#define ascii_1           ((uint8_t)'1')
#define ascii_2           ((uint8_t)'2')
#define ascii_3           ((uint8_t)'3')
#define ascii_4           ((uint8_t)'4')
#define ascii_5           ((uint8_t)'5')
#define ascii_6           ((uint8_t)'6')
#define ascii_7           ((uint8_t)'7')
#define ascii_8           ((uint8_t)'8')
#define ascii_9           ((uint8_t)'9')
#define ascii_zero        ascii_0
#define ascii_nine        ascii_9

/* Punctuation; ascii_hyphen is an alternate spelling of ascii_minus.  */
#define ascii_period      ((uint8_t)'.')
#define ascii_colon       ((uint8_t)':')
#define ascii_comma       ((uint8_t)',')
#define ascii_dollar_sign ((uint8_t)'$')
#define ascii_dquote      ((uint8_t)'"')
#define ascii_oparen      ((uint8_t)'(')
#define ascii_caret       ((uint8_t)'^')
#define ascii_slash       ((uint8_t)'/')
#define ascii_plus        ((uint8_t)'+')
#define ascii_minus       ((uint8_t)'-')
#define ascii_hyphen      ascii_minus
#define ascii_underscore  ((uint8_t)'_')
#define ascii_asterisk    ((uint8_t)'*')
#define ascii_query       ((uint8_t)'?')

/* Control characters; ascii_return is an alternate spelling of ascii_cr.  */
#define ascii_cr          ((uint8_t)'\r')
#define ascii_ff          ((uint8_t)'\f')
#define ascii_newline     ((uint8_t)'\n')
#define ascii_return      ascii_cr
| |
/* Code points in the "internal" domain (the run-time data codeset).  Each
   macro looks its ASCII counterpart up in __gg__internal_codeset_map at the
   point of use and narrows the table entry to uint8_t, so the value follows
   whichever map that pointer currently refers to.  */

/* Blank, punctuation and control characters.  */
#define internal_space    ((uint8_t)__gg__internal_codeset_map[ascii_space])
#define internal_period   ((uint8_t)__gg__internal_codeset_map[ascii_period])
#define internal_comma    ((uint8_t)__gg__internal_codeset_map[ascii_comma])
#define internal_colon    ((uint8_t)__gg__internal_codeset_map[ascii_colon])
#define internal_query    ((uint8_t)__gg__internal_codeset_map[ascii_query])
#define internal_dquote   ((uint8_t)__gg__internal_codeset_map[ascii_dquote])
#define internal_asterisk ((uint8_t)__gg__internal_codeset_map[ascii_asterisk])
#define internal_plus     ((uint8_t)__gg__internal_codeset_map[ascii_plus])
#define internal_minus    ((uint8_t)__gg__internal_codeset_map[ascii_minus])
#define internal_cr       ((uint8_t)__gg__internal_codeset_map[ascii_cr])
#define internal_ff       ((uint8_t)__gg__internal_codeset_map[ascii_ff])
#define internal_newline  ((uint8_t)__gg__internal_codeset_map[ascii_newline])
#define internal_return   ((uint8_t)__gg__internal_codeset_map[ascii_return])

/* Decimal digits; internal_zero maps the same ASCII code point as
   internal_0.  */
#define internal_zero     ((uint8_t)__gg__internal_codeset_map[ascii_zero])
#define internal_0        ((uint8_t)__gg__internal_codeset_map[ascii_0])
#define internal_1        ((uint8_t)__gg__internal_codeset_map[ascii_1])
#define internal_2        ((uint8_t)__gg__internal_codeset_map[ascii_2])
#define internal_3        ((uint8_t)__gg__internal_codeset_map[ascii_3])
#define internal_4        ((uint8_t)__gg__internal_codeset_map[ascii_4])
#define internal_5        ((uint8_t)__gg__internal_codeset_map[ascii_5])
#define internal_6        ((uint8_t)__gg__internal_codeset_map[ascii_6])
#define internal_7        ((uint8_t)__gg__internal_codeset_map[ascii_7])
#define internal_8        ((uint8_t)__gg__internal_codeset_map[ascii_8])
#define internal_9        ((uint8_t)__gg__internal_codeset_map[ascii_9])

/* Upper-case letters.  */
#define internal_A        ((uint8_t)__gg__internal_codeset_map[ascii_A])
#define internal_B        ((uint8_t)__gg__internal_codeset_map[ascii_B])
#define internal_C        ((uint8_t)__gg__internal_codeset_map[ascii_C])
#define internal_D        ((uint8_t)__gg__internal_codeset_map[ascii_D])
#define internal_E        ((uint8_t)__gg__internal_codeset_map[ascii_E])
#define internal_F        ((uint8_t)__gg__internal_codeset_map[ascii_F])
#define internal_G        ((uint8_t)__gg__internal_codeset_map[ascii_G])
#define internal_H        ((uint8_t)__gg__internal_codeset_map[ascii_H])
#define internal_I        ((uint8_t)__gg__internal_codeset_map[ascii_I])
#define internal_J        ((uint8_t)__gg__internal_codeset_map[ascii_J])
#define internal_K        ((uint8_t)__gg__internal_codeset_map[ascii_K])
#define internal_L        ((uint8_t)__gg__internal_codeset_map[ascii_L])
#define internal_M        ((uint8_t)__gg__internal_codeset_map[ascii_M])
#define internal_N        ((uint8_t)__gg__internal_codeset_map[ascii_N])
#define internal_O        ((uint8_t)__gg__internal_codeset_map[ascii_O])
#define internal_P        ((uint8_t)__gg__internal_codeset_map[ascii_P])
#define internal_Q        ((uint8_t)__gg__internal_codeset_map[ascii_Q])
#define internal_R        ((uint8_t)__gg__internal_codeset_map[ascii_R])
#define internal_S        ((uint8_t)__gg__internal_codeset_map[ascii_S])
#define internal_T        ((uint8_t)__gg__internal_codeset_map[ascii_T])
#define internal_U        ((uint8_t)__gg__internal_codeset_map[ascii_U])
#define internal_V        ((uint8_t)__gg__internal_codeset_map[ascii_V])
#define internal_W        ((uint8_t)__gg__internal_codeset_map[ascii_W])
#define internal_X        ((uint8_t)__gg__internal_codeset_map[ascii_X])
#define internal_Y        ((uint8_t)__gg__internal_codeset_map[ascii_Y])
#define internal_Z        ((uint8_t)__gg__internal_codeset_map[ascii_Z])

/* Lower-case letters.  */
#define internal_a        ((uint8_t)__gg__internal_codeset_map[ascii_a])
#define internal_b        ((uint8_t)__gg__internal_codeset_map[ascii_b])
#define internal_c        ((uint8_t)__gg__internal_codeset_map[ascii_c])
#define internal_d        ((uint8_t)__gg__internal_codeset_map[ascii_d])
#define internal_e        ((uint8_t)__gg__internal_codeset_map[ascii_e])
#define internal_f        ((uint8_t)__gg__internal_codeset_map[ascii_f])
#define internal_g        ((uint8_t)__gg__internal_codeset_map[ascii_g])
#define internal_h        ((uint8_t)__gg__internal_codeset_map[ascii_h])
#define internal_i        ((uint8_t)__gg__internal_codeset_map[ascii_i])
#define internal_j        ((uint8_t)__gg__internal_codeset_map[ascii_j])
#define internal_k        ((uint8_t)__gg__internal_codeset_map[ascii_k])
#define internal_l        ((uint8_t)__gg__internal_codeset_map[ascii_l])
#define internal_m        ((uint8_t)__gg__internal_codeset_map[ascii_m])
#define internal_n        ((uint8_t)__gg__internal_codeset_map[ascii_n])
#define internal_o        ((uint8_t)__gg__internal_codeset_map[ascii_o])
#define internal_p        ((uint8_t)__gg__internal_codeset_map[ascii_p])
#define internal_q        ((uint8_t)__gg__internal_codeset_map[ascii_q])
#define internal_r        ((uint8_t)__gg__internal_codeset_map[ascii_r])
#define internal_s        ((uint8_t)__gg__internal_codeset_map[ascii_s])
#define internal_t        ((uint8_t)__gg__internal_codeset_map[ascii_t])
#define internal_u        ((uint8_t)__gg__internal_codeset_map[ascii_u])
#define internal_v        ((uint8_t)__gg__internal_codeset_map[ascii_v])
#define internal_w        ((uint8_t)__gg__internal_codeset_map[ascii_w])
#define internal_x        ((uint8_t)__gg__internal_codeset_map[ascii_x])
#define internal_y        ((uint8_t)__gg__internal_codeset_map[ascii_y])
#define internal_z        ((uint8_t)__gg__internal_codeset_map[ascii_z])
| |
| |
/* Selects a text channel; this is the type of the `device` argument of
   __gg__text_conversion_override().  The enumerator names refer to the
   COBOL source text and the console.  Values are spelled out but equal the
   implicit numbering.  */
enum text_device_t
{
    td_default_e    = 0,
    td_sourcecode_e = 1,
    td_console_e    = 2,
};
| |
/* Names a codeset; this is the type of the `codeset` argument of
   __gg__text_conversion_override().  Besides the default, the choices are
   UTF-8, Code Page 1252 (ASCII-based) and Code Page 1140 (EBCDIC).  Values
   are spelled out but equal the implicit numbering.  */
enum text_codeset_t
{
    cs_default_e = 0,
    cs_utf8_e    = 1,
    cs_cp1252_e  = 2,
    cs_cp1140_e  = 3
};
| |
| |
/* Small data objects that are defined elsewhere and only declared here.
   NOTE(review): the names suggest they are the storage behind the COBOL
   figurative constants (SPACE, LOW-VALUES, ZEROS, HIGH-VALUES, QUOTES), the
   UPSI-0 switch and RETURN-CODE -- confirm against their definitions.  */
extern unsigned char __gg__data_space[1];
extern unsigned char __gg__data_low_values[1];
extern unsigned char __gg__data_zeros[1];
extern unsigned char __gg__data_high_values[1];
extern unsigned char __gg__data_quotes[1];
extern unsigned char __gg__data_upsi_0[2];
extern short         __gg__data_return_code;
| |
// Hard-coded 256-entry tables, indexed by a single-byte code point, that
// drive the codeset conversions.  All are defined elsewhere.
extern const unsigned short __gg__one_to_one_values[256];
extern const unsigned short __gg__cp1252_to_cp1140_values[256];
extern const unsigned short __gg__cp1140_to_cp1252_values[256];

// The two standard collations, in the same 256-entry form.
extern const unsigned short __gg__cp1252_to_ebcdic_collation[256];
extern const unsigned short __gg__ebcdic_to_cp1252_collation[256];
| |
| // As described above, we have a number of operations we need to accomplish. But |
| // the actual routines are dependent on whether EBCDIC or ASCII is in use. We |
| // implement that by having a function pointer for each function; those pointers |
| // are established when the __gg__ebcdic_codeset_in_use variable is established. |
| |
// Single-character conversion from ASCII.  Two fixed converters are declared
// (ASCII to ASCII and ASCII to EBCDIC), plus the function pointer that callers
// go through; the pointer is set up elsewhere according to the codeset in use.

extern "C" char __gg__ascii_to_ascii_chr(char ch);
extern "C" char __gg__ascii_to_ebcdic_chr(char ch);
extern "C" char (*__gg__ascii_to_internal_chr)(char);

// Convert the ASCII character `a` to the internal codeset.
#define ascii_to_internal(a) (__gg__ascii_to_internal_chr(a))
| |
// Buffer conversion from ASCII: `str` addresses the bytes and `length` is
// their count.  NOTE(review): presumably converted in place, since the buffer
// is writable and nothing is returned -- confirm against the definitions.
extern "C" void __gg__ascii_to_ascii(char *str, size_t length);
extern "C" void __gg__ascii_to_ebcdic(char *str, size_t length);

// Pointer to whichever of the converters matches the internal codeset.
extern "C" void (*__gg__ascii_to_internal_str)(char *str, size_t length);

// Convert the `b`-byte ASCII buffer `a` to the internal codeset.
#define ascii_to_internal_str(a, b) (__gg__ascii_to_internal_str((a), (b)))
| |
// Conversion out of the "raw" (COBOL source text) domain.  The input is the
// `length` bytes at `str`/`in`, which are not modified; the result is returned
// as a char pointer.  NOTE(review): `dest` and `dest_size`/`dest_length`
// presumably describe a caller-held output buffer that the routine may
// resize -- confirm against the definitions.
extern "C" char *__gg__raw_to_ascii(char       **dest,
                                    size_t      *dest_size,
                                    const char  *str,
                                    size_t       length);
extern "C" char *__gg__raw_to_ebcdic(char       **dest,
                                     size_t      *dest_size,
                                     const char  *in,
                                     size_t       length);

// Pointer to whichever of the converters matches the internal codeset.
extern "C" char *(*__gg__raw_to_internal)(char       **dest,
                                          size_t      *dest_length,
                                          const char  *in,
                                          size_t       length);

// Convert raw text (c, d) to the internal codeset using buffer (a, b).
#define raw_to_internal(a, b, c, d) \
  (__gg__raw_to_internal((a), (b), (c), (d)))
| |
// Conversion of data into the console codeset.  The input is the `length`
// bytes at `str`/`in`, which are not modified; the result is returned as a
// char pointer.  NOTE(review): `dest` and `dest_size` presumably describe a
// caller-held output buffer that the routine may resize -- confirm against
// the definitions.
extern "C" char *__gg__ascii_to_console(char              **dest,
                                        size_t             *dest_size,
                                        char const * const  str,
                                        const size_t        length);
extern "C" char *__gg__ebcdic_to_console(char              **dest,
                                         size_t             *dest_size,
                                         char const * const  str,
                                         const size_t        length);

// Pointer to whichever of the converters matches the internal codeset.
extern "C" char *(*__gg__internal_to_console_cm)(char       **dest,
                                                 size_t      *dest_size,
                                                 const char  *in,
                                                 size_t       length);

// Convert internal text (c, d) to the console codeset using buffer (a, b).
#define internal_to_console(a, b, c, d) \
  (__gg__internal_to_console_cm((a), (b), (c), (d)))
| |
// Conversion of console text: `str` addresses the bytes and `length` is
// their count.  NOTE(review): presumably converted in place, since the buffer
// is writable and nothing is returned -- confirm against the definitions.
extern "C" void __gg__console_to_ascii(char * const str, size_t length);
extern "C" void __gg__console_to_ebcdic(char * const str, size_t length);

// Pointer to whichever of the converters matches the internal codeset.
extern "C" void (*__gg__console_to_internal_cm)(char * const str,
                                                size_t       length);

// Convert the `b`-byte console buffer `a` to the internal codeset.
#define console_to_internal(a, b) (__gg__console_to_internal_cm((a), (b)))
| |
// Conversion back to ASCII: `str` addresses the bytes and `length` is their
// count.  NOTE(review): presumably converted in place, since the buffer is
// writable and nothing is returned -- confirm against the definition.
extern "C" void __gg__ebcdic_to_ascii(char *str, const size_t length);

// Pointer to the converter for the internal codeset; only an EBCDIC converter
// is declared in this group.
extern "C" void (*__gg__internal_to_ascii)(char *str, size_t length);

// Convert the `b`-byte internal-codeset buffer `a` to ASCII.
#define internal_to_ascii(a, b) (__gg__internal_to_ascii((a), (b)))
| |
// Selects the internal codeset.  NOTE(review): a nonzero `use_ebcdic`
// presumably selects EBCDIC and zero selects ASCII -- confirm against the
// definition.
extern "C"
void __gg__set_internal_codeset(int use_ebcdic);
| |
| extern "C" |
| void __gg__text_conversion_override(text_device_t device, |
| text_codeset_t codeset); |
| |
| #endif |