libcody/buffer.cc - gcc - Git at Google

 // CODYlib		-*- mode:c++ -*-
 // Copyright (C) 2020 Nathan Sidwell, nathan@acm.org
 // License: Apache v2.0

 // Cody
 #include "internal.hh"
 // C++
 #include <algorithm>
 // C
 #include <cstring>
 // OS
 #include <unistd.h>
 #include <cerrno>

 // MessageBuffer code

 // Lines consist of words and end with a NEWLINE (0xa) char
 // Whitespace characters are TAB (0x9) and SPACE (0x20)
 // Words consist of non-whitespace chars separated by whitespace.
 // Multiple lines in one transaction are indicated by ending non-final
 // lines with a SEMICOLON (0x3b) word, immediately before the NEWLINE
 // Continuations with ; preceding it
 // Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting.
 // Quoting with '...'
 // Anything outside of [-+_/%.a-zA-Z0-9] needs quoting
 // Inside quotes, control chars (below SPACE), DEL, \' and \\ need escaping.
 // Escapes are \\, \', \n, \t, \_, everything else as \<hex><hex>?
 // Spaces separate words, UTF8 encoding for non-ascii chars

 namespace Cody {
 namespace Detail {

 // The word that marks a continued line.  BeginLine emits it, preceded
 // by a SPACE, just before the NEWLINE of every non-final line.
 static constexpr char CONTINUE = S2C (u8";");

 // Start a new line.  If the buffer already holds text, that text is
 // first closed off with the continuation sequence SPACE CONTINUE
 // NEWLINE.  lastBol is then set to the end of the buffer, which is
 // where the new line will begin.
 void MessageBuffer::BeginLine ()
 {
   if (!buffer.empty ())
     {
       char const trailer[] = {S2C(u8" "), CONTINUE, S2C(u8"\n")};

       buffer.reserve (buffer.size () + sizeof (trailer));
       buffer.insert (buffer.end (), trailer, trailer + sizeof (trailer));
     }

   lastBol = buffer.size ();
 }

 // Append LEN chars of STR to the buffer.  A LEN of ~size_t (0) means
 // 'use strlen (STR)'.
 //
 // QUOTE means 'maybe quote': the string is wrapped in '...' only if it
 // contains a char outside [-+_/%.a-zA-Z0-9], or if it is empty.  Inside
 // quotes, control chars, DEL, \' and \\ are written as escapes.  When
 // QUOTE is false the chars are copied verbatim, and an empty string
 // appends nothing.
 void MessageBuffer::Append (char const *str, bool quote, size_t len)
 {
   if (len == ~size_t (0))
     len = strlen (str);

   if (!len && !quote)
     return;

   // A non-empty string only keeps its quotes if some char is outside
   // the set that could never be shell-active.  An empty string with
   // QUOTE set stays quoted, giving ''.
   if (quote && len)
     quote = std::any_of (str, str + len, [] (char ch) -> bool
       {
 	unsigned char const u = static_cast<unsigned char> (ch);
 	bool const plain
 	  = ((u >= S2C(u8"a") && u <= S2C(u8"z"))
 	     || (u >= S2C(u8"A") && u <= S2C(u8"Z"))
 	     || (u >= S2C(u8"0") && u <= S2C(u8"9"))
 	     || u == S2C(u8"-") || u == S2C(u8"+") || u == S2C(u8"_")
 	     || u == S2C(u8"/") || u == S2C(u8"%") || u == S2C(u8"."));
 	return !plain;
       });

   // Worst case: every char becomes a three-char escape, plus two quotes
   buffer.reserve (buffer.size () + (quote ? len * 3 : len) + 2);

   if (quote)
     buffer.push_back (S2C(u8"'"));

   char const *const stop = str + len;
   while (str != stop)
     {
       // Find the end of the run that can be copied as-is.  Unquoted
       // text is copied whole; quoted text stops at the next char that
       // needs an escape (a looser test than the needs-quoting one).
       char const *runEnd = stop;
       if (quote)
 	runEnd = std::find_if (str, stop, [] (char ch) -> bool
 	  {
 	    unsigned char const u = static_cast<unsigned char> (ch);
 	    return (u < S2C(u8" ") || u == 0x7f
 		    || u == S2C(u8"\\") || u == S2C(u8"'"));
 	  });

       buffer.insert (buffer.end (), str, runEnd);
       str = runEnd;
       if (str == stop)
 	break;

       // Emit the escape for the char that ended the run
       unsigned char const raw = static_cast<unsigned char> (*str++);
       buffer.push_back (S2C(u8"\\"));
       if (raw == S2C(u8"\t"))
 	buffer.push_back (S2C(u8"t"));
       else if (raw == S2C(u8"\n"))
 	buffer.push_back (S2C(u8"n"));
       else if (raw == S2C(u8"'") || raw == S2C(u8"\\"))
 	buffer.push_back (raw);
       else
 	{
 	  // Full-on escape: two lower-case hex chars, high nibble first
 	  unsigned const nibbles[2] = {unsigned (raw >> 4), unsigned (raw & 0xf)};
 	  for (unsigned nibble : nibbles)
 	    buffer.push_back (nibble < 10
 			      ? char (S2C(u8"0") + nibble)
 			      : char (S2C(u8"a") + (nibble - 10)));
 	}
     }

   if (quote)
     buffer.push_back (S2C(u8"'"));
 }

 // Append the single char C to the buffer exactly as given; no quoting
 // or escaping is applied.
 void MessageBuffer::Append (char c)
 {
   buffer.insert (buffer.end (), c);
 }

 // Append the decimal representation of U as a word.
 void MessageBuffer::AppendInteger (unsigned u)
 {
   // Sigh, even though std::to_string is C++11, we support building on
   // gcc 4.8, which is a C++11 compiler lacking std::to_string, so
   // format by hand.
   //
   // Format into a plain array rather than writing through a
   // const_cast of std::string::data (), which C++11 does not permit.
   // Three decimal digits per byte is a safe upper bound on the length.
   char digits[sizeof (unsigned) * 3 + 1];
   int count = snprintf (digits, sizeof (digits), "%u", u);

   // snprintf reports failure with a negative count; use an empty
   // string then, rather than converting the count to a huge length.
   std::string v (digits, count > 0 ? size_t (count) : 0);

   AppendWord (v);
 }

 // Write the unsent part of the buffer (everything from lastBol on) to
 // FD with a single write call.
 //
 // Returns 0 when everything was written, EAGAIN when only part was
 // written, otherwise the errno from write.  On EAGAIN or EINTR the
 // buffer is kept so the caller can call again; lastBol records how far
 // we got.  On any other result the buffer is emptied and lastBol is
 // reset, ready for the next message.
 int MessageBuffer::Write (int fd) noexcept
 {
   size_t const pending = buffer.size () - lastBol;
   ssize_t const written = write (fd, buffer.data () + lastBol, pending);

   int err;
   if (written < 0)
     err = errno;
   else
     {
       // Skip what has been sent; a short write needs another go
       lastBol += written;
       err = size_t (written) == pending ? 0 : EAGAIN;
     }

   bool const retry = err == EAGAIN || err == EINTR;
   if (!retry)
     {
       // Done, or failed for good.  Reset for next message
       buffer.clear ();
       lastBol = 0;
     }

   return err;
 }

 // Read more data from FD onto the end of the buffer with a single read
 // call, then scan the new data for line ends.
 //
 // Returns:
 //   errno   if read failed
 //   -1      at end of file
 //   EINVAL  if data follows a NEWLINE that is not preceded by
 //           CONTINUE; the buffer is truncated just after that NEWLINE
 //   EAGAIN  if the message is not yet complete (no NEWLINE seen, or
 //           the last NEWLINE is preceded by CONTINUE)
 //   0       if the buffer ends with a NEWLINE not preceded by CONTINUE
 int MessageBuffer::Read (int fd) noexcept
 {
   constexpr size_t blockSize = 200;

   // Grow to the current capacity, adding a block when less than half
   // a block of spare room remains.
   size_t const oldSize = buffer.size ();
   size_t newSize = buffer.capacity ();
   if (newSize - oldSize < blockSize / 2)
     newSize += blockSize;
   buffer.resize (newSize);

   ssize_t const count = read (fd, &buffer[oldSize], newSize - oldSize);

   // Shrink back to cover only what was actually read
   buffer.resize (count > 0 ? oldSize + count : oldSize);

   if (count < 0)
     return errno;

   if (count == 0)
     // End of file
     return -1;

   // With no NEWLINE at all we still need more data
   bool more = true;
   auto const end = buffer.end ();
   for (auto scan = buffer.begin () + oldSize;;)
     {
       auto const newline = std::find (scan, end, S2C(u8"\n"));
       if (newline == end)
 	break;

       more = newline != buffer.begin () && *(newline - 1) == CONTINUE;
       scan = newline + 1;
       if (scan == end)
 	break;

       if (!more)
 	{
 	  // There is no continuation, but there are chars after the
 	  // newline.  Truncate the buffer and return an error
 	  buffer.resize (scan - buffer.begin ());
 	  return EINVAL;
 	}
     }

   return more ? EAGAIN : 0;
 }

 // Lex the line that starts at lastBol into words, which are stored in
 // RESULT (cleared first).
 //
 // Returns:
 //   ENOENT  if IsAtEnd () holds (lastBol is left alone), or if the
 //           line contains no words
 //   EINVAL  if the line is malformed; RESULT then holds a single
 //           string, the raw text of the line without its NEWLINE and
 //           without a trailing " ;"
 //   0       otherwise
 // Except in the IsAtEnd case, lastBol is advanced to just past the
 // line's NEWLINE.
 int MessageBuffer::Lex (std::vector<std::string> &result)
 {
   result.clear ();

   if (IsAtEnd ())
     return ENOENT;

   Assert (buffer.back () == S2C(u8"\n"));

   auto iter = buffer.begin () + lastBol;

   // WORD is the word being accumulated, or nullptr between words
   for (std::string *word = nullptr;;)
     {
       char c = *iter;

       ++iter;
       if (c == S2C(u8" ") || c == S2C(u8"\t"))
 	{
 	  // Whitespace ends the current word
 	  word = nullptr;
 	  continue;
 	}

       if (c == S2C(u8"\n"))
 	break;

       if (c == CONTINUE)
 	{
 	  // Line continuation
 	  // It must be a word of its own, directly before the NEWLINE
 	  if (word || *iter != S2C(u8"\n"))
 	    goto malformed;
 	  ++iter;
 	  break;
 	}

       // Remaining control chars, DEL and above are not allowed
       if (c <= S2C(u8" ") || c >= 0x7f)
 	goto malformed;

       if (!word)
 	{
 	  result.emplace_back ();
 	  word = &result.back ();
 	}

       if (c == S2C(u8"'"))
 	{
 	  // Quoted word
 	  // The decoded chars are added to the current word, which
 	  // continues after the closing quote.
 	  for (;;)
 	    {
 	      c = *iter;

 	      if (c == S2C(u8"\n"))
 		{
 		  // Unterminated quote.  This block is also the common
 		  // error exit, reached by goto from the checks above
 		  // and below.
 		malformed:;
 		  result.clear ();
 		  iter = std::find (iter, buffer.end (), S2C(u8"\n"));
 		  auto back = iter;
 		  if (back[-1] == CONTINUE  && back[-2] == S2C(u8" "))
 		    // Smells like a line continuation
 		    back -= 2;
 		  // Hand back the raw line text as the only result
 		  result.emplace_back (&buffer[lastBol],
 				       back - buffer.begin () - lastBol);
 		  ++iter;
 		  lastBol = iter - buffer.begin ();
 		  return EINVAL;
 		}

 	      if (c < S2C(u8" ") || c >= 0x7f)
 		goto malformed;

 	      ++iter;
 	      if (c == S2C(u8"'"))
 		break;

 	      if (c == S2C(u8"\\"))
 		// escape
 		switch (c = *iter)
 		  {
 		    case S2C(u8"\\"):
 		    case S2C(u8"'"):
 		      ++iter;
 		      break;

 		    case S2C(u8"n"):
 		      c = S2C(u8"\n");
 		      ++iter;
 		      break;

 		    case S2C(u8"_"):
 		      // We used to escape SPACE as \_, so accept that
 		      c = S2C(u8" ");
 		      ++iter;
 		      break;

 		    case S2C(u8"t"):
 		      c = S2C(u8"\t");
 		      ++iter;
 		      break;

 		    default:
 		      {
 			// One or two lower-case hex digits.  The first
 			// non-hex char ends the escape and is left
 			// unconsumed; no digits at all is malformed.
 			unsigned v = 0;
 			for (unsigned nibble = 0; nibble != 2; nibble++)
 			  {
 			    c = *iter;
 			    if (c < S2C(u8"0"))
 			      {
 				if (!nibble)
 				  goto malformed;
 				break;
 			      }
 			    else if (c <= S2C(u8"9"))
 			      c -= S2C(u8"0");
 			    else if (c < S2C(u8"a"))
 			      {
 				if (!nibble)
 				  goto malformed;
 				break;
 			      }
 			    else if (c <= S2C(u8"f"))
 			      c -= S2C(u8"a") - 10;
 			    else
 			      {
 				if (!nibble)
 				  goto malformed;
 				break;
 			      }
 			    ++iter;
 			    v = (v << 4) | c;
 			  }
 			c = v;
 		      }
 		  }
 	      word->push_back (c);
 	    }
 	}
       else
 	// Unquoted character
 	word->push_back (c);
     }
   // ITER is now just past the NEWLINE that ended the line
   lastBol = iter - buffer.begin ();
   if (result.empty ())
     return ENOENT;

   return 0;
 }

 // Append to STR the text of the line that ends just before lastBol,
 // without its NEWLINE and without a trailing " ;" continuation
 // marker.  Does nothing when lastBol is zero.
 void MessageBuffer::LexedLine (std::string &str)
 {
   if (!lastBol)
     return;

   // Lex leaves lastBol one past the line's NEWLINE, so this is the
   // index of that NEWLINE.
   size_t end = lastBol - 1;

   // Walk back to the char after the previous NEWLINE, or to the start
   // of the buffer.
   size_t pos = end;
   while (pos && buffer[pos - 1] != S2C(u8"\n"))
     pos--;

   // Strip line continuation.  Only look when the line is long enough
   // to hold one; otherwise an empty or one-char line at the start of
   // the buffer would be indexed before element zero.
   if (end - pos >= 2
       && buffer[end - 1] == CONTINUE && buffer[end - 2] == S2C(u8" "))
     end -= 2;

   str.append (&buffer[pos], end - pos);
 }
 } // Detail
 } // Cody
	// CODYlib -- mode:c++ --
	// Copyright (C) 2020 Nathan Sidwell, nathan@acm.org
	// License: Apache v2.0

	// Cody
	#include "internal.hh"
	// C++
	#include <algorithm>
	// C
	#include <cstring>
	// OS
	#include <unistd.h>
	#include <cerrno>

	// MessageBuffer code

	// Lines consist of words and end with a NEWLINE (0xa) char
	// Whitespace characters are TAB (0x9) and SPACE (0x20)
	// Words consist of non-whitespace chars separated by whitespace.
	// Multiple lines in one transaction are indicated by ending non-final
	// lines with a SEMICOLON (0x3b) word, immediately before the NEWLINE
	// Continuations with ; preceding it
	// Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting.
	// Quoting with '...'
	// Anything outside of [-+_/%.a-zA-Z0-9] needs quoting
	// Inside quotes, control chars (below SPACE), DEL, \' and \\ need escaping.
	// Escapes are \\, \', \n, \t, \_, everything else as \<hex><hex>?
	// Spaces separate words, UTF8 encoding for non-ascii chars

	namespace Cody {
	namespace Detail {

	// The word that marks a continued line.  BeginLine emits it, preceded
	// by a SPACE, just before the NEWLINE of every non-final line.
	static constexpr char CONTINUE = S2C (u8";");

	// Start a new line.  If the buffer already holds text, that text is
	// first closed off with the continuation sequence SPACE CONTINUE
	// NEWLINE.  lastBol is then set to the end of the buffer, which is
	// where the new line will begin.
	void MessageBuffer::BeginLine ()
	{
	  if (!buffer.empty ())
	    {
	      char const trailer[] = {S2C(u8" "), CONTINUE, S2C(u8"\n")};

	      buffer.reserve (buffer.size () + sizeof (trailer));
	      buffer.insert (buffer.end (), trailer, trailer + sizeof (trailer));
	    }

	  lastBol = buffer.size ();
	}

	// Append LEN chars of STR to the buffer.  A LEN of ~size_t (0) means
	// 'use strlen (STR)'.
	//
	// QUOTE means 'maybe quote': the string is wrapped in '...' only if it
	// contains a char outside [-+_/%.a-zA-Z0-9], or if it is empty.  Inside
	// quotes, control chars, DEL, \' and \\ are written as escapes.  When
	// QUOTE is false the chars are copied verbatim, and an empty string
	// appends nothing.
	void MessageBuffer::Append (char const *str, bool quote, size_t len)
	{
	  if (len == ~size_t (0))
	    len = strlen (str);

	  if (!len && !quote)
	    return;

	  // A non-empty string only keeps its quotes if some char is outside
	  // the set that could never be shell-active.  An empty string with
	  // QUOTE set stays quoted, giving ''.
	  if (quote && len)
	    quote = std::any_of (str, str + len, [] (char ch) -> bool
	      {
	        unsigned char const u = static_cast<unsigned char> (ch);
	        bool const plain
	          = ((u >= S2C(u8"a") && u <= S2C(u8"z"))
	             || (u >= S2C(u8"A") && u <= S2C(u8"Z"))
	             || (u >= S2C(u8"0") && u <= S2C(u8"9"))
	             || u == S2C(u8"-") || u == S2C(u8"+") || u == S2C(u8"_")
	             || u == S2C(u8"/") || u == S2C(u8"%") || u == S2C(u8"."));
	        return !plain;
	      });

	  // Worst case: every char becomes a three-char escape, plus two quotes
	  buffer.reserve (buffer.size () + (quote ? len * 3 : len) + 2);

	  if (quote)
	    buffer.push_back (S2C(u8"'"));

	  char const *const stop = str + len;
	  while (str != stop)
	    {
	      // Find the end of the run that can be copied as-is.  Unquoted
	      // text is copied whole; quoted text stops at the next char that
	      // needs an escape (a looser test than the needs-quoting one).
	      char const *runEnd = stop;
	      if (quote)
	        runEnd = std::find_if (str, stop, [] (char ch) -> bool
	          {
	            unsigned char const u = static_cast<unsigned char> (ch);
	            return (u < S2C(u8" ") || u == 0x7f
	                    || u == S2C(u8"\\") || u == S2C(u8"'"));
	          });

	      buffer.insert (buffer.end (), str, runEnd);
	      str = runEnd;
	      if (str == stop)
	        break;

	      // Emit the escape for the char that ended the run
	      unsigned char const raw = static_cast<unsigned char> (*str++);
	      buffer.push_back (S2C(u8"\\"));
	      if (raw == S2C(u8"\t"))
	        buffer.push_back (S2C(u8"t"));
	      else if (raw == S2C(u8"\n"))
	        buffer.push_back (S2C(u8"n"));
	      else if (raw == S2C(u8"'") || raw == S2C(u8"\\"))
	        buffer.push_back (raw);
	      else
	        {
	          // Full-on escape: two lower-case hex chars, high nibble first
	          unsigned const nibbles[2] = {unsigned (raw >> 4), unsigned (raw & 0xf)};
	          for (unsigned nibble : nibbles)
	            buffer.push_back (nibble < 10
	                              ? char (S2C(u8"0") + nibble)
	                              : char (S2C(u8"a") + (nibble - 10)));
	        }
	    }

	  if (quote)
	    buffer.push_back (S2C(u8"'"));
	}

	// Append the single char C to the buffer exactly as given; no quoting
	// or escaping is applied.
	void MessageBuffer::Append (char c)
	{
	  buffer.insert (buffer.end (), c);
	}

	// Append the decimal representation of U as a word.
	void MessageBuffer::AppendInteger (unsigned u)
	{
	  // Sigh, even though std::to_string is C++11, we support building on
	  // gcc 4.8, which is a C++11 compiler lacking std::to_string, so
	  // format by hand.
	  //
	  // Format into a plain array rather than writing through a
	  // const_cast of std::string::data (), which C++11 does not permit.
	  // Three decimal digits per byte is a safe upper bound on the length.
	  char digits[sizeof (unsigned) * 3 + 1];
	  int count = snprintf (digits, sizeof (digits), "%u", u);

	  // snprintf reports failure with a negative count; use an empty
	  // string then, rather than converting the count to a huge length.
	  std::string v (digits, count > 0 ? size_t (count) : 0);

	  AppendWord (v);
	}

	// Write the unsent part of the buffer (everything from lastBol on) to
	// FD with a single write call.
	//
	// Returns 0 when everything was written, EAGAIN when only part was
	// written, otherwise the errno from write.  On EAGAIN or EINTR the
	// buffer is kept so the caller can call again; lastBol records how far
	// we got.  On any other result the buffer is emptied and lastBol is
	// reset, ready for the next message.
	int MessageBuffer::Write (int fd) noexcept
	{
	  size_t const pending = buffer.size () - lastBol;
	  ssize_t const written = write (fd, buffer.data () + lastBol, pending);

	  int err;
	  if (written < 0)
	    err = errno;
	  else
	    {
	      // Skip what has been sent; a short write needs another go
	      lastBol += written;
	      err = size_t (written) == pending ? 0 : EAGAIN;
	    }

	  bool const retry = err == EAGAIN || err == EINTR;
	  if (!retry)
	    {
	      // Done, or failed for good.  Reset for next message
	      buffer.clear ();
	      lastBol = 0;
	    }

	  return err;
	}

	// Read more data from FD onto the end of the buffer with a single read
	// call, then scan the new data for line ends.
	//
	// Returns:
	//   errno   if read failed
	//   -1      at end of file
	//   EINVAL  if data follows a NEWLINE that is not preceded by
	//           CONTINUE; the buffer is truncated just after that NEWLINE
	//   EAGAIN  if the message is not yet complete (no NEWLINE seen, or
	//           the last NEWLINE is preceded by CONTINUE)
	//   0       if the buffer ends with a NEWLINE not preceded by CONTINUE
	int MessageBuffer::Read (int fd) noexcept
	{
	  constexpr size_t blockSize = 200;

	  // Grow to the current capacity, adding a block when less than half
	  // a block of spare room remains.
	  size_t const oldSize = buffer.size ();
	  size_t newSize = buffer.capacity ();
	  if (newSize - oldSize < blockSize / 2)
	    newSize += blockSize;
	  buffer.resize (newSize);

	  ssize_t const count = read (fd, &buffer[oldSize], newSize - oldSize);

	  // Shrink back to cover only what was actually read
	  buffer.resize (count > 0 ? oldSize + count : oldSize);

	  if (count < 0)
	    return errno;

	  if (count == 0)
	    // End of file
	    return -1;

	  // With no NEWLINE at all we still need more data
	  bool more = true;
	  auto const end = buffer.end ();
	  for (auto scan = buffer.begin () + oldSize;;)
	    {
	      auto const newline = std::find (scan, end, S2C(u8"\n"));
	      if (newline == end)
	        break;

	      more = newline != buffer.begin () && *(newline - 1) == CONTINUE;
	      scan = newline + 1;
	      if (scan == end)
	        break;

	      if (!more)
	        {
	          // There is no continuation, but there are chars after the
	          // newline.  Truncate the buffer and return an error
	          buffer.resize (scan - buffer.begin ());
	          return EINVAL;
	        }
	    }

	  return more ? EAGAIN : 0;
	}

	// Lex the line that starts at lastBol into words, which are stored in
	// RESULT (cleared first).
	//
	// Returns:
	//   ENOENT  if IsAtEnd () holds (lastBol is left alone), or if the
	//           line contains no words
	//   EINVAL  if the line is malformed; RESULT then holds a single
	//           string, the raw text of the line without its NEWLINE and
	//           without a trailing " ;"
	//   0       otherwise
	// Except in the IsAtEnd case, lastBol is advanced to just past the
	// line's NEWLINE.
	int MessageBuffer::Lex (std::vector<std::string> &result)
	{
	  result.clear ();

	  if (IsAtEnd ())
	    return ENOENT;

	  Assert (buffer.back () == S2C(u8"\n"));

	  auto iter = buffer.begin () + lastBol;

	  // WORD is the word being accumulated, or nullptr between words
	  for (std::string *word = nullptr;;)
	    {
	      char c = *iter;

	      ++iter;
	      if (c == S2C(u8" ") || c == S2C(u8"\t"))
	        {
	          // Whitespace ends the current word
	          word = nullptr;
	          continue;
	        }

	      if (c == S2C(u8"\n"))
	        break;

	      if (c == CONTINUE)
	        {
	          // Line continuation.  It must be a word of its own,
	          // directly before the NEWLINE
	          if (word || *iter != S2C(u8"\n"))
	            goto malformed;
	          ++iter;
	          break;
	        }

	      // Remaining control chars, DEL and above are not allowed
	      if (c <= S2C(u8" ") || c >= 0x7f)
	        goto malformed;

	      if (!word)
	        {
	          result.emplace_back ();
	          word = &result.back ();
	        }

	      if (c == S2C(u8"'"))
	        {
	          // Quoted word.  The decoded chars are added to the current
	          // word, which continues after the closing quote.
	          for (;;)
	            {
	              c = *iter;

	              if (c == S2C(u8"\n"))
	                {
	                  // Unterminated quote.  This block is also the
	                  // common error exit, reached by goto from the
	                  // checks above and below.
	                malformed:;
	                  result.clear ();
	                  iter = std::find (iter, buffer.end (), S2C(u8"\n"));
	                  auto back = iter;
	                  if (back[-1] == CONTINUE && back[-2] == S2C(u8" "))
	                    // Smells like a line continuation
	                    back -= 2;
	                  // Hand back the raw line text as the only result
	                  result.emplace_back (&buffer[lastBol],
	                                       back - buffer.begin () - lastBol);
	                  ++iter;
	                  lastBol = iter - buffer.begin ();
	                  return EINVAL;
	                }

	              if (c < S2C(u8" ") || c >= 0x7f)
	                goto malformed;

	              ++iter;
	              if (c == S2C(u8"'"))
	                break;

	              if (c == S2C(u8"\\"))
	                // escape
	                switch (c = *iter)
	                  {
	                    case S2C(u8"\\"):
	                    case S2C(u8"'"):
	                      ++iter;
	                      break;

	                    case S2C(u8"n"):
	                      c = S2C(u8"\n");
	                      ++iter;
	                      break;

	                    case S2C(u8"_"):
	                      // We used to escape SPACE as \_, so accept that
	                      c = S2C(u8" ");
	                      ++iter;
	                      break;

	                    case S2C(u8"t"):
	                      c = S2C(u8"\t");
	                      ++iter;
	                      break;

	                    default:
	                      {
	                        // One or two lower-case hex digits.  The
	                        // first non-hex char ends the escape and is
	                        // left unconsumed; no digits at all is
	                        // malformed.
	                        unsigned v = 0;
	                        for (unsigned nibble = 0; nibble != 2; nibble++)
	                          {
	                            c = *iter;
	                            if (c < S2C(u8"0"))
	                              {
	                                if (!nibble)
	                                  goto malformed;
	                                break;
	                              }
	                            else if (c <= S2C(u8"9"))
	                              c -= S2C(u8"0");
	                            else if (c < S2C(u8"a"))
	                              {
	                                if (!nibble)
	                                  goto malformed;
	                                break;
	                              }
	                            else if (c <= S2C(u8"f"))
	                              c -= S2C(u8"a") - 10;
	                            else
	                              {
	                                if (!nibble)
	                                  goto malformed;
	                                break;
	                              }
	                            ++iter;
	                            v = (v << 4) | c;
	                          }
	                        c = v;
	                      }
	                  }
	              word->push_back (c);
	            }
	        }
	      else
	        // Unquoted character
	        word->push_back (c);
	    }

	  // ITER is now just past the NEWLINE that ended the line
	  lastBol = iter - buffer.begin ();
	  if (result.empty ())
	    return ENOENT;

	  return 0;
	}

	// Append to STR the text of the line that ends just before lastBol,
	// without its NEWLINE and without a trailing " ;" continuation
	// marker.  Does nothing when lastBol is zero.
	void MessageBuffer::LexedLine (std::string &str)
	{
	  if (!lastBol)
	    return;

	  // Lex leaves lastBol one past the line's NEWLINE, so this is the
	  // index of that NEWLINE.
	  size_t end = lastBol - 1;

	  // Walk back to the char after the previous NEWLINE, or to the start
	  // of the buffer.
	  size_t pos = end;
	  while (pos && buffer[pos - 1] != S2C(u8"\n"))
	    pos--;

	  // Strip line continuation.  Only look when the line is long enough
	  // to hold one; otherwise an empty or one-char line at the start of
	  // the buffer would be indexed before element zero.
	  if (end - pos >= 2
	      && buffer[end - 1] == CONTINUE && buffer[end - 2] == S2C(u8" "))
	    end -= 2;

	  str.append (&buffer[pos], end - pos);
	}
	} // Detail
	} // Cody