| /* |
| * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved. |
| * |
| * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED |
| * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. |
| * |
| * Permission is hereby granted to use or copy this program |
| * for any purpose, provided the above notices are retained on all copies. |
| * Permission to modify the code and to distribute modified code is granted, |
| * provided the above notices are retained, and a notice that the code was |
| * modified is included with the above copyright notice. |
| * |
| * Author: Hans-J. Boehm (boehm@parc.xerox.com) |
| */ |
| /* |
| * These are functions on cords that do not need to understand their |
| * implementation. They serve also serve as example client code for |
| * cord_basics. |
| */ |
| /* Boehm, December 8, 1995 1:53 pm PST */ |
| # include <stdio.h> |
| # include <string.h> |
| # include <stdlib.h> |
| # include <stdarg.h> |
| # include "cord.h" |
| # include "ec.h" |
| # define I_HIDE_POINTERS /* So we get access to allocation lock. */ |
| /* We use this for lazy file reading, */ |
| /* so that we remain independent */ |
| /* of the threads primitives. */ |
| # include "gc.h" |
| |
| /* For now we assume that pointer reads and writes are atomic, */ |
| /* i.e. another thread always sees the state before or after */ |
| /* a write. This might be false on a Motorola M68K with */ |
| /* pointers that are not 32-bit aligned. But there probably */ |
| /* aren't too many threads packages running on those. */ |
| # define ATOMIC_WRITE(x,y) (x) = (y) |
| # define ATOMIC_READ(x) (*(x)) |
| |
| /* The standard says these are in stdio.h, but they aren't always: */ |
| # ifndef SEEK_SET |
| # define SEEK_SET 0 |
| # endif |
| # ifndef SEEK_END |
| # define SEEK_END 2 |
| # endif |
| |
| # define BUFSZ 2048 /* Size of stack allocated buffers when */ |
| /* we want large buffers. */ |
| |
| typedef void (* oom_fn)(void); |
| |
| # define OUT_OF_MEMORY { if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \ |
| ABORT("Out of memory\n"); } |
| # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); } |
| |
| CORD CORD_cat_char(CORD x, char c) |
| { |
| register char * string; |
| |
| if (c == '\0') return(CORD_cat(x, CORD_nul(1))); |
| string = GC_MALLOC_ATOMIC(2); |
| if (string == 0) OUT_OF_MEMORY; |
| string[0] = c; |
| string[1] = '\0'; |
| return(CORD_cat_char_star(x, string, 1)); |
| } |
| |
| CORD CORD_catn(int nargs, ...) |
| { |
| register CORD result = CORD_EMPTY; |
| va_list args; |
| register int i; |
| |
| va_start(args, nargs); |
| for (i = 0; i < nargs; i++) { |
| register CORD next = va_arg(args, CORD); |
| result = CORD_cat(result, next); |
| } |
| va_end(args); |
| return(result); |
| } |
| |
| typedef struct { |
| size_t len; |
| size_t count; |
| char * buf; |
| } CORD_fill_data; |
| |
| int CORD_fill_proc(char c, void * client_data) |
| { |
| register CORD_fill_data * d = (CORD_fill_data *)client_data; |
| register size_t count = d -> count; |
| |
| (d -> buf)[count] = c; |
| d -> count = ++count; |
| if (count >= d -> len) { |
| return(1); |
| } else { |
| return(0); |
| } |
| } |
| |
| int CORD_batched_fill_proc(const char * s, void * client_data) |
| { |
| register CORD_fill_data * d = (CORD_fill_data *)client_data; |
| register size_t count = d -> count; |
| register size_t max = d -> len; |
| register char * buf = d -> buf; |
| register const char * t = s; |
| |
| while((buf[count] = *t++) != '\0') { |
| count++; |
| if (count >= max) { |
| d -> count = count; |
| return(1); |
| } |
| } |
| d -> count = count; |
| return(0); |
| } |
| |
| /* Fill buf with len characters starting at i. */ |
| /* Assumes len characters are available. */ |
| void CORD_fill_buf(CORD x, size_t i, size_t len, char * buf) |
| { |
| CORD_fill_data fd; |
| |
| fd.len = len; |
| fd.buf = buf; |
| fd.count = 0; |
| (void)CORD_iter5(x, i, CORD_fill_proc, CORD_batched_fill_proc, &fd); |
| } |
| |
| int CORD_cmp(CORD x, CORD y) |
| { |
| CORD_pos xpos; |
| CORD_pos ypos; |
| register size_t avail, yavail; |
| |
| if (y == CORD_EMPTY) return(x != CORD_EMPTY); |
| if (x == CORD_EMPTY) return(-1); |
| if (CORD_IS_STRING(y) && CORD_IS_STRING(x)) return(strcmp(x,y)); |
| CORD_set_pos(xpos, x, 0); |
| CORD_set_pos(ypos, y, 0); |
| for(;;) { |
| if (!CORD_pos_valid(xpos)) { |
| if (CORD_pos_valid(ypos)) { |
| return(-1); |
| } else { |
| return(0); |
| } |
| } |
| if (!CORD_pos_valid(ypos)) { |
| return(1); |
| } |
| if ((avail = CORD_pos_chars_left(xpos)) <= 0 |
| || (yavail = CORD_pos_chars_left(ypos)) <= 0) { |
| register char xcurrent = CORD_pos_fetch(xpos); |
| register char ycurrent = CORD_pos_fetch(ypos); |
| if (xcurrent != ycurrent) return(xcurrent - ycurrent); |
| CORD_next(xpos); |
| CORD_next(ypos); |
| } else { |
| /* process as many characters as we can */ |
| register int result; |
| |
| if (avail > yavail) avail = yavail; |
| result = strncmp(CORD_pos_cur_char_addr(xpos), |
| CORD_pos_cur_char_addr(ypos), avail); |
| if (result != 0) return(result); |
| CORD_pos_advance(xpos, avail); |
| CORD_pos_advance(ypos, avail); |
| } |
| } |
| } |
| |
| int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len) |
| { |
| CORD_pos xpos; |
| CORD_pos ypos; |
| register size_t count; |
| register long avail, yavail; |
| |
| CORD_set_pos(xpos, x, x_start); |
| CORD_set_pos(ypos, y, y_start); |
| for(count = 0; count < len;) { |
| if (!CORD_pos_valid(xpos)) { |
| if (CORD_pos_valid(ypos)) { |
| return(-1); |
| } else { |
| return(0); |
| } |
| } |
| if (!CORD_pos_valid(ypos)) { |
| return(1); |
| } |
| if ((avail = CORD_pos_chars_left(xpos)) <= 0 |
| || (yavail = CORD_pos_chars_left(ypos)) <= 0) { |
| register char xcurrent = CORD_pos_fetch(xpos); |
| register char ycurrent = CORD_pos_fetch(ypos); |
| if (xcurrent != ycurrent) return(xcurrent - ycurrent); |
| CORD_next(xpos); |
| CORD_next(ypos); |
| count++; |
| } else { |
| /* process as many characters as we can */ |
| register int result; |
| |
| if (avail > yavail) avail = yavail; |
| count += avail; |
| if (count > len) avail -= (count - len); |
| result = strncmp(CORD_pos_cur_char_addr(xpos), |
| CORD_pos_cur_char_addr(ypos), (size_t)avail); |
| if (result != 0) return(result); |
| CORD_pos_advance(xpos, (size_t)avail); |
| CORD_pos_advance(ypos, (size_t)avail); |
| } |
| } |
| return(0); |
| } |
| |
| char * CORD_to_char_star(CORD x) |
| { |
| register size_t len = CORD_len(x); |
| char * result = GC_MALLOC_ATOMIC(len + 1); |
| |
| if (result == 0) OUT_OF_MEMORY; |
| CORD_fill_buf(x, 0, len, result); |
| result[len] = '\0'; |
| return(result); |
| } |
| |
| CORD CORD_from_char_star(const char *s) |
| { |
| char * result; |
| size_t len = strlen(s); |
| |
| if (0 == len) return(CORD_EMPTY); |
| result = GC_MALLOC_ATOMIC(len + 1); |
| if (result == 0) OUT_OF_MEMORY; |
| memcpy(result, s, len+1); |
| return(result); |
| } |
| |
| const char * CORD_to_const_char_star(CORD x) |
| { |
| if (x == 0) return(""); |
| if (CORD_IS_STRING(x)) return((const char *)x); |
| return(CORD_to_char_star(x)); |
| } |
| |
| char CORD_fetch(CORD x, size_t i) |
| { |
| CORD_pos xpos; |
| |
| CORD_set_pos(xpos, x, i); |
| if (!CORD_pos_valid(xpos)) ABORT("bad index?"); |
| return(CORD_pos_fetch(xpos)); |
| } |
| |
| |
| int CORD_put_proc(char c, void * client_data) |
| { |
| register FILE * f = (FILE *)client_data; |
| |
| return(putc(c, f) == EOF); |
| } |
| |
| int CORD_batched_put_proc(const char * s, void * client_data) |
| { |
| register FILE * f = (FILE *)client_data; |
| |
| return(fputs(s, f) == EOF); |
| } |
| |
| |
| int CORD_put(CORD x, FILE * f) |
| { |
| if (CORD_iter5(x, 0, CORD_put_proc, CORD_batched_put_proc, f)) { |
| return(EOF); |
| } else { |
| return(1); |
| } |
| } |
| |
| typedef struct { |
| size_t pos; /* Current position in the cord */ |
| char target; /* Character we're looking for */ |
| } chr_data; |
| |
| int CORD_chr_proc(char c, void * client_data) |
| { |
| register chr_data * d = (chr_data *)client_data; |
| |
| if (c == d -> target) return(1); |
| (d -> pos) ++; |
| return(0); |
| } |
| |
| int CORD_rchr_proc(char c, void * client_data) |
| { |
| register chr_data * d = (chr_data *)client_data; |
| |
| if (c == d -> target) return(1); |
| (d -> pos) --; |
| return(0); |
| } |
| |
| int CORD_batched_chr_proc(const char *s, void * client_data) |
| { |
| register chr_data * d = (chr_data *)client_data; |
| register char * occ = strchr(s, d -> target); |
| |
| if (occ == 0) { |
| d -> pos += strlen(s); |
| return(0); |
| } else { |
| d -> pos += occ - s; |
| return(1); |
| } |
| } |
| |
| size_t CORD_chr(CORD x, size_t i, int c) |
| { |
| chr_data d; |
| |
| d.pos = i; |
| d.target = c; |
| if (CORD_iter5(x, i, CORD_chr_proc, CORD_batched_chr_proc, &d)) { |
| return(d.pos); |
| } else { |
| return(CORD_NOT_FOUND); |
| } |
| } |
| |
| size_t CORD_rchr(CORD x, size_t i, int c) |
| { |
| chr_data d; |
| |
| d.pos = i; |
| d.target = c; |
| if (CORD_riter4(x, i, CORD_rchr_proc, &d)) { |
| return(d.pos); |
| } else { |
| return(CORD_NOT_FOUND); |
| } |
| } |
| |
| /* Find the first occurrence of s in x at position start or later. */ |
| /* This uses an asymptotically poor algorithm, which should typically */ |
| /* perform acceptably. We compare the first few characters directly, */ |
| /* and call CORD_ncmp whenever there is a partial match. */ |
| /* This has the advantage that we allocate very little, or not at all. */ |
| /* It's very fast if there are few close misses. */ |
| size_t CORD_str(CORD x, size_t start, CORD s) |
| { |
| CORD_pos xpos; |
| size_t xlen = CORD_len(x); |
| size_t slen; |
| register size_t start_len; |
| const char * s_start; |
| unsigned long s_buf = 0; /* The first few characters of s */ |
| unsigned long x_buf = 0; /* Start of candidate substring. */ |
| /* Initialized only to make compilers */ |
| /* happy. */ |
| unsigned long mask = 0; |
| register size_t i; |
| register size_t match_pos; |
| |
| if (s == CORD_EMPTY) return(start); |
| if (CORD_IS_STRING(s)) { |
| s_start = s; |
| slen = strlen(s); |
| } else { |
| s_start = CORD_to_char_star(CORD_substr(s, 0, sizeof(unsigned long))); |
| slen = CORD_len(s); |
| } |
| if (xlen < start || xlen - start < slen) return(CORD_NOT_FOUND); |
| start_len = slen; |
| if (start_len > sizeof(unsigned long)) start_len = sizeof(unsigned long); |
| CORD_set_pos(xpos, x, start); |
| for (i = 0; i < start_len; i++) { |
| mask <<= 8; |
| mask |= 0xff; |
| s_buf <<= 8; |
| s_buf |= s_start[i]; |
| x_buf <<= 8; |
| x_buf |= CORD_pos_fetch(xpos); |
| CORD_next(xpos); |
| } |
| for (match_pos = start; ; match_pos++) { |
| if ((x_buf & mask) == s_buf) { |
| if (slen == start_len || |
| CORD_ncmp(x, match_pos + start_len, |
| s, start_len, slen - start_len) == 0) { |
| return(match_pos); |
| } |
| } |
| if ( match_pos == xlen - slen ) { |
| return(CORD_NOT_FOUND); |
| } |
| x_buf <<= 8; |
| x_buf |= CORD_pos_fetch(xpos); |
| CORD_next(xpos); |
| } |
| } |
| |
| void CORD_ec_flush_buf(CORD_ec x) |
| { |
| register size_t len = x[0].ec_bufptr - x[0].ec_buf; |
| char * s; |
| |
| if (len == 0) return; |
| s = GC_MALLOC_ATOMIC(len+1); |
| memcpy(s, x[0].ec_buf, len); |
| s[len] = '\0'; |
| x[0].ec_cord = CORD_cat_char_star(x[0].ec_cord, s, len); |
| x[0].ec_bufptr = x[0].ec_buf; |
| } |
| |
| void CORD_ec_append_cord(CORD_ec x, CORD s) |
| { |
| CORD_ec_flush_buf(x); |
| x[0].ec_cord = CORD_cat(x[0].ec_cord, s); |
| } |
| |
| /*ARGSUSED*/ |
| char CORD_nul_func(size_t i, void * client_data) |
| { |
| return((char)(unsigned long)client_data); |
| } |
| |
| |
| CORD CORD_chars(char c, size_t i) |
| { |
| return(CORD_from_fn(CORD_nul_func, (void *)(unsigned long)c, i)); |
| } |
| |
| CORD CORD_from_file_eager(FILE * f) |
| { |
| register int c; |
| CORD_ec ecord; |
| |
| CORD_ec_init(ecord); |
| for(;;) { |
| c = getc(f); |
| if (c == 0) { |
| /* Append the right number of NULs */ |
| /* Note that any string of NULs is rpresented in 4 words, */ |
| /* independent of its length. */ |
| register size_t count = 1; |
| |
| CORD_ec_flush_buf(ecord); |
| while ((c = getc(f)) == 0) count++; |
| ecord[0].ec_cord = CORD_cat(ecord[0].ec_cord, CORD_nul(count)); |
| } |
| if (c == EOF) break; |
| CORD_ec_append(ecord, c); |
| } |
| (void) fclose(f); |
| return(CORD_balance(CORD_ec_to_cord(ecord))); |
| } |
| |
| /* The state maintained for a lazily read file consists primarily */ |
| /* of a large direct-mapped cache of previously read values. */ |
| /* We could rely more on stdio buffering. That would have 2 */ |
| /* disadvantages: */ |
| /* 1) Empirically, not all fseek implementations preserve the */ |
| /* buffer whenever they could. */ |
| /* 2) It would fail if 2 different sections of a long cord */ |
| /* were being read alternately. */ |
| /* We do use the stdio buffer for read ahead. */ |
| /* To guarantee thread safety in the presence of atomic pointer */ |
| /* writes, cache lines are always replaced, and never modified in */ |
| /* place. */ |
| |
| # define LOG_CACHE_SZ 14 |
| # define CACHE_SZ (1 << LOG_CACHE_SZ) |
| # define LOG_LINE_SZ 9 |
| # define LINE_SZ (1 << LOG_LINE_SZ) |
| |
| typedef struct { |
| size_t tag; |
| char data[LINE_SZ]; |
| /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ */ |
| } cache_line; |
| |
| typedef struct { |
| FILE * lf_file; |
| size_t lf_current; /* Current file pointer value */ |
| cache_line * volatile lf_cache[CACHE_SZ/LINE_SZ]; |
| } lf_state; |
| |
| # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1)) |
| # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ) |
| # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1)) |
| # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ) |
| # define LINE_START(n) ((n) & ~(LINE_SZ - 1)) |
| |
| typedef struct { |
| lf_state * state; |
| size_t file_pos; /* Position of needed character. */ |
| cache_line * new_cache; |
| } refill_data; |
| |
| /* Executed with allocation lock. */ |
| static char refill_cache(client_data) |
| refill_data * client_data; |
| { |
| register lf_state * state = client_data -> state; |
| register size_t file_pos = client_data -> file_pos; |
| FILE *f = state -> lf_file; |
| size_t line_start = LINE_START(file_pos); |
| size_t line_no = DIV_LINE_SZ(MOD_CACHE_SZ(file_pos)); |
| cache_line * new_cache = client_data -> new_cache; |
| |
| if (line_start != state -> lf_current |
| && fseek(f, line_start, SEEK_SET) != 0) { |
| ABORT("fseek failed"); |
| } |
| if (fread(new_cache -> data, sizeof(char), LINE_SZ, f) |
| <= file_pos - line_start) { |
| ABORT("fread failed"); |
| } |
| new_cache -> tag = DIV_LINE_SZ(file_pos); |
| /* Store barrier goes here. */ |
| ATOMIC_WRITE(state -> lf_cache[line_no], new_cache); |
| state -> lf_current = line_start + LINE_SZ; |
| return(new_cache->data[MOD_LINE_SZ(file_pos)]); |
| } |
| |
| char CORD_lf_func(size_t i, void * client_data) |
| { |
| register lf_state * state = (lf_state *)client_data; |
| register cache_line * volatile * cl_addr = |
| &(state -> lf_cache[DIV_LINE_SZ(MOD_CACHE_SZ(i))]); |
| register cache_line * cl = (cache_line *)ATOMIC_READ(cl_addr); |
| |
| if (cl == 0 || cl -> tag != DIV_LINE_SZ(i)) { |
| /* Cache miss */ |
| refill_data rd; |
| |
| rd.state = state; |
| rd.file_pos = i; |
| rd.new_cache = GC_NEW_ATOMIC(cache_line); |
| if (rd.new_cache == 0) OUT_OF_MEMORY; |
| return((char)(GC_word) |
| GC_call_with_alloc_lock((GC_fn_type) refill_cache, &rd)); |
| } |
| return(cl -> data[MOD_LINE_SZ(i)]); |
| } |
| |
| /*ARGSUSED*/ |
| void CORD_lf_close_proc(void * obj, void * client_data) |
| { |
| if (fclose(((lf_state *)obj) -> lf_file) != 0) { |
| ABORT("CORD_lf_close_proc: fclose failed"); |
| } |
| } |
| |
| CORD CORD_from_file_lazy_inner(FILE * f, size_t len) |
| { |
| register lf_state * state = GC_NEW(lf_state); |
| register int i; |
| |
| if (state == 0) OUT_OF_MEMORY; |
| if (len != 0) { |
| /* Dummy read to force buffer allocation. */ |
| /* This greatly increases the probability */ |
| /* of avoiding deadlock if buffer allocation */ |
| /* is redirected to GC_malloc and the */ |
| /* world is multithreaded. */ |
| char buf[1]; |
| |
| (void) fread(buf, 1, 1, f); |
| rewind(f); |
| } |
| state -> lf_file = f; |
| for (i = 0; i < CACHE_SZ/LINE_SZ; i++) { |
| state -> lf_cache[i] = 0; |
| } |
| state -> lf_current = 0; |
| GC_REGISTER_FINALIZER(state, CORD_lf_close_proc, 0, 0, 0); |
| return(CORD_from_fn(CORD_lf_func, state, len)); |
| } |
| |
| CORD CORD_from_file_lazy(FILE * f) |
| { |
| register long len; |
| |
| if (fseek(f, 0l, SEEK_END) != 0) { |
| ABORT("Bad fd argument - fseek failed"); |
| } |
| if ((len = ftell(f)) < 0) { |
| ABORT("Bad fd argument - ftell failed"); |
| } |
| rewind(f); |
| return(CORD_from_file_lazy_inner(f, (size_t)len)); |
| } |
| |
| # define LAZY_THRESHOLD (128*1024 + 1) |
| |
| CORD CORD_from_file(FILE * f) |
| { |
| register long len; |
| |
| if (fseek(f, 0l, SEEK_END) != 0) { |
| ABORT("Bad fd argument - fseek failed"); |
| } |
| if ((len = ftell(f)) < 0) { |
| ABORT("Bad fd argument - ftell failed"); |
| } |
| rewind(f); |
| if (len < LAZY_THRESHOLD) { |
| return(CORD_from_file_eager(f)); |
| } else { |
| return(CORD_from_file_lazy_inner(f, (size_t)len)); |
| } |
| } |