Index: doc/theses/mike_brooks_MMath/string.tex
===================================================================
--- doc/theses/mike_brooks_MMath/string.tex	(revision f8913b7c11ea07372f32703126c8512a598f1cdb)
+++ doc/theses/mike_brooks_MMath/string.tex	(revision 56ec508d488938933b41b33414874400c539d949)
@@ -3,11 +3,11 @@
 \vspace*{-20pt}
 This chapter presents my work on designing and building a modern string type in \CFA.
-The discussion starts with examples of interesting string problems, followed by examples of how these issues are resolved in my design.
+The discussion starts with an overview of the string API, then a number of interesting string problems, followed by how these issues are resolved in this work.
 
 
 \section{String Operations}
 
-To prepare for the following discussion, comparisons among C, \CC, Java and \CFA strings are presented, beginning in \VRef[Figure]{f:StrApiCompare}.
-It provides a classic ``cheat sheet'' presentation, summarizing the names of the most-common closely-equivalent operations.
+\VRef[Figure]{f:StrApiCompare} shows a general comparison of string APIs for C, \CC, Java and \CFA.
+It provides a classic ``cheat sheet'', summarizing the names of the most-common closely-equivalent operations.
 The over-arching commonality is that operations work on groups of characters for assigning, copying, scanning, and updating.
 
@@ -31,5 +31,5 @@
 \end{tabular}
 \end{cquote}
-\caption{Comparison of languages' strings, API/``cheat-sheet'' perspective.}
+\caption{Language comparison of string API}
 \label{f:StrApiCompare}
 \end{figure}
@@ -42,5 +42,5 @@
 int open( @const char * pathname@, int flags );
 string fname{ "test.cc" );
-open( fname.@c_str()@, O_RDONLY );
+open( fname.@c_str()@, O_RDONLY );		// null terminated value of string
 \end{cfa}
 Here, the \CC @c_str@ function does not create a new null-terminated C string from the \CC string, as that requires passing ownership of the C string to the caller for eventual deletion.\footnote{
@@ -72,7 +72,7 @@
 \begin{cfa}
 #include @<string.hfa>@
-@string@ s, name, digit, alpha, punctuation, ifstmt;
+@string@ s = "abcde", name = "MIKE", digit, alpha, punctuation, ifstmt;
+char cs[] = "abc";
 int i;
-name  = "MIKE";
 digit  = "0123456789";
 punctuation = "().,";
@@ -86,54 +86,27 @@
 The ability to convert from internal (machine) to external (human) format is useful in situations other than I/O.
 Hence, the basic types @char@, @char *@, @int@, @double@, @_Complex@, including any signness and size variations, implicitly convert to type @string@.
-\VRef[Figure]{f:ImplicitConversionsString} shows examples of implicit conversions.
-Conversions can be explicitly specified using a compound literal:
-\begin{cfa}
-s = (string){ "abc" };				$\C{// converts char * to string}$
-s = (string){ 5 };					$\C{// converts int to string}$
-s = (string){ 5.5 };				$\C{// converts double to string}$
-\end{cfa}
-Conversions from @string@ to @char *@, attempt to be safe:
-either by requiring the maximum length of the @char *@ storage (@strncpy@) or allocating the @char *@ storage for the string characters (ownership), meaning the programmer must free the storage.
-As well, a C string is always null terminates, implying a minimum size of 1 character.
 \begin{cquote}
 \setlength{\tabcolsep}{15pt}
-\begin{tabular}{@{}l|l@{}}
-\begin{cfa}
-string s = "abcde";
-char cs[3];
-strncpy( cs, s, sizeof(cs) );
-char * cp = s;
-delete( cp );
-cp = s + ' ' + s;
-delete( cp );
-\end{cfa}
-&
-\begin{cfa}
-"abcde"
-
-"ab\0", in place
-"abcde\0", malloc
-
-"abcde abcde\0", malloc
-
-\end{cfa}
-\end{tabular}
-\end{cquote}
-
-\begin{figure}
-\begin{tabular}{@{}l|l@{}}
-\setlength{\tabcolsep}{15pt}
+\begin{tabular}{@{}l|ll|l@{}}
 \begin{cfa}
 //	string s = 5;
-	string s;
-	// conversion of char and char * to string
 	s = 'x';
 	s = "abc";
-	char cs[] = "abc";
 	s = cs;
-	// conversion of integral, floating-point, and complex to string
 	s = 45hh;
 	s = 45h;
-	s = -(ssize_t)MAX - 1;
+\end{cfa}
+&
+\begin{cfa}
+
+"x"
+"abc"
+"abc"
+"45"
+"45"
+\end{cfa}
+&
+\begin{cfa}
+	s = (ssize_t)MIN;
 	s = (size_t)MAX;
 	s = 5.5;
@@ -144,14 +117,4 @@
 &
 \begin{cfa}
-
-
-
-"x"
-"abc"
-
-"abc"
-
-"45"
-"45"
 "-9223372036854775808"
 "18446744073709551615"
@@ -162,28 +125,58 @@
 \end{cfa}
 \end{tabular}
-\caption{Implicit Conversions to String}
-\label{f:ImplicitConversionsString}
-\end{figure}
-
-
-\subsection{Length}
-
-The @len@ operation returns the length of a string using prefix call.
+\end{cquote}
+Conversions can be explicitly specified using a compound literal.
+\begin{cfa}
+s = (string){ "abc" };				$\C{// converts char * to string}$
+s = (string){ 5 };					$\C{// converts int to string}$
+s = (string){ 5.5 };				$\C{// converts double to string}$
+\end{cfa}
+Conversions from @string@ to @char *@ attempt to be safe:
+either by requiring the maximum length of the @char *@ storage (@strncpy@) or allocating the @char *@ storage for the string characters (ownership), meaning the programmer must free the storage.
+Note, a C string is always null terminated, implying a minimum size of 1 character.
 \begin{cquote}
 \setlength{\tabcolsep}{15pt}
 \begin{tabular}{@{}l|l@{}}
 \begin{cfa}
-const char * cs = "abc";
-i = ""`len;
-i = "abc"`len;
-i = cs`len;
-i = name`len;
-\end{cfa}
-&
-\begin{cfa}
-
+strncpy( cs, s, sizeof(cs) );
+char * cp = s;
+delete( cp );
+cp = s + ' ' + s;
+delete( cp );
+\end{cfa}
+&
+\begin{cfa}
+"abc\0", in place
+"abcde\0", malloc
+ownership
+"abcde abcde\0", malloc
+ownership
+\end{cfa}
+\end{tabular}
+\end{cquote}
+
+
+\subsection{Length}
+
+The @len@ operation (short for @strlen@) returns the length of a C or \CFA string.
+For consistency, @strlen@ also works with \CFA strings.
+\begin{cquote}
+\setlength{\tabcolsep}{15pt}
+\begin{tabular}{@{}l|l@{}}
+\begin{cfa}
+i = len( "" );
+i = len( "abc" );
+i = len( cs );
+i = strlen( cs );
+i = len( name );
+i = strlen( name );
+\end{cfa}
+&
+\begin{cfa}
 0
 3
 3
+3
+4
 4
 \end{cfa}
@@ -195,44 +188,107 @@
 
 The binary relational, @<@, @<=@, @>@, @>=@, and equality, @==@, @!=@, operators compare strings using lexicographical ordering, where longer strings are greater than shorter strings.
-C strings use function @strcmp@, as the relational/equality operators compare C string pointers not their values, which does not match normal programmer expectation.
+C strings use function @strcmp@, as the relational/equality operators compare C string pointers not their values, which does not match programmer expectation.
 
 
 \subsection{Concatenation}
 
-The binary operators @+@ and @+=@ concatenate two strings, creating the sum of the strings.
+The binary operators @+@ and @+=@ concatenate characters, C strings and \CFA strings, creating the sum of the characters.
+\begin{cquote}
+\begin{tabular}{@{}l|l@{\hspace{25pt}}l|l@{\hspace{25pt}}l|l@{}}
+\begin{cfa}
+s = "";
+s = 'a' + 'b';
+s = 'a' + "b";
+s = "a" + 'b';
+s = "a" + "b";
+\end{cfa}
+&
+\begin{cfa}
+
+"ab"
+"ab"
+"ab"
+"ab"
+\end{cfa}
+&
+\begin{cfa}
+s = "";
+s = 'a' + 'b' + s;
+s = 'a' + 'b' + s;
+s = 'a' + "b" + s;
+s = "a" + 'b' + s;
+\end{cfa}
+&
+\begin{cfa}
+
+"ab"
+"abab"
+"ababab"
+"abababab"
+\end{cfa}
+&
+\begin{cfa}
+s = "";
+s = s + 'a' + 'b';
+s = s + 'a' + "b";
+s = s + "a" + 'b';
+s = s + "a" + "b";
+\end{cfa}
+&
+\begin{cfa}
+
+"ab"
+"abab"
+"ababab"
+"abababab"
+\end{cfa}
+\end{tabular}
+\end{cquote}
+For these operations to meet programmer expectations, \CFA introduces two backward incompatibilities with C.
+Note, subtracting pointers or characters has a low-level use case.
+\begin{cfa}
+ch - '0'    $\C[2in]{// find character offset}$
+cp1 - cp2;  $\C{// find pointer offset}\CRT$
+\end{cfa}
+However, there is no obvious use case for addition.
+\begin{cfa}
+ch + 'b'    $\C[2in]{// add character values}$
+cp1 + 'a';  $\C{// move pointer cp1['a']}\CRT$
+\end{cfa}
+Adding character values or advancing a pointer with a character are unusual operations, and hence, unlikely to exist in C programs.
+Stealing these two cases for use with strings allows all combinations of concatenation among @char@, @char *@, and @string@.
+Note, stealing only occurs if a program includes @string.hfa@, resulting in ambiguities in existing C code where there is no way to disambiguate.
+\begin{cfa}
+ch = 'a' + 'b'; $\C[2in]{// LHS disambiguate, add character values}$
+s = 'a' + 'b'; $\C{// LHS disambiguate, concatenate characters}$
+sout | 'a' + 'b'; $\C{// ambiguous with string.hfa, add or concatenate?}$
+sout | (char)'a' + 'b'; $\C{// disambiguate}$
+sout | "a" + "b"; $\C{// disambiguate}\CRT$
+\end{cfa}
+Again, this scenario is extremely rare, as adding characters is meaningless.
+
+\CC cannot support this generality because it does not use the left-hand side of assignment in expression resolution.
+While it can special case some combinations:
+\begin{c++}
+s = 'a' + s; $\C[2in]{// compiles in C++}$
+s = "a" + s;
+\end{c++}
+it cannot generalize to any number of steps:
+\begin{c++}
+s = 'a' + 'b' + s; $\C{// does not compile in C++}\CRT$
+s = "a" + "b" + s;
+\end{c++}
+
+
+\subsection{Repetition}
+
+The binary operators @*@ and @*=@ repeat a string $N$ times.
+If $N = 0$, a zero length string, @""@, is returned.
+Like concatenation, multiplication is stolen for @char@;
+multiplication for pointers does not exist in C.
 \begin{cquote}
 \setlength{\tabcolsep}{15pt}
 \begin{tabular}{@{}l|l@{}}
 \begin{cfa}
-s = name + ' ' + digit;
-s += name;
-s = s + 'a' + 'b';
-s = s + "a" + "abc";
-s = 'a' + 'b' + s;
-s = "a" + "abc" + s;
-\end{cfa}
-&
-\begin{cfa}
-"MIKE 0123456789"
-"MIKE 0123456789MIKE"
-
-
-$\CC$ unsupported
-$\CC$ unsupported
-\end{cfa}
-\end{tabular}
-\end{cquote}
-The \CFA type-system allows full  commutativity with character and C strings;
-\CC does not.
-
-
-\subsection{Repetition}
-
-The binary operators @*@ and @*=@ repeat a string $N$ times.
-If $N = 0$, a zero length string, @""@ is returned.
-\begin{cquote}
-\setlength{\tabcolsep}{15pt}
-\begin{tabular}{@{}l|l@{}}
-\begin{cfa}
 s = 'x' * 3;
 s = "abc" * 3;
@@ -241,7 +297,7 @@
 &
 \begin{cfa}
-xxx
-abcabcabc
-MIKE MIKE MIKE 
+"xxx"
+"abcabcabc"
+"MIKE MIKE MIKE "
 \end{cfa}
 \end{tabular}
@@ -250,8 +306,8 @@
 
 \subsection{Substring}
-The substring operation returns a subset of the string starting at a position in the string and traversing a length.
+The substring operation returns a subset of a string starting at a position in the string and traversing a length or matching a pattern string.
 \begin{cquote}
-\setlength{\tabcolsep}{15pt}
-\begin{tabular}{@{}l|l@{}}
+\setlength{\tabcolsep}{10pt}
+\begin{tabular}{@{}l|ll|l@{}}
 \begin{cfa}
 s = name( 2, 2 );
@@ -265,9 +321,27 @@
 \begin{cfa}
 "KE"
-"IK", length is opposite direction
-"KE", length is clipped to 2
-"", beyond string so clipped to null
-"K", start $and$ length are negative
+"IK"
+"KE", clipped length to 2
+"", beyond string clipped to null
+"K"
 "IKE", to end of string
+\end{cfa}
+&
+\begin{cfa}
+s = name( "IK" );
+s = name( "WW" );
+
+
+
+
+\end{cfa}
+&
+\begin{cfa}
+"IK"
+""
+
+
+
+
 \end{cfa}
 \end{tabular}
@@ -277,4 +351,6 @@
 If the substring request extends beyond the beginning or end of the string, it is clipped (shortened) to the bounds of the string.
 If the substring request is completely outside of the original string, a null string is returned.
+The pattern form returns either the pattern string if the pattern matches or a null string if the pattern does not match.
+This mechanism is discussed next.
 
 The substring operation can also appear on the left side of an assignment and replaced by the string value on the right side.
@@ -284,65 +360,37 @@
 \setlength{\tabcolsep}{15pt}
 \begin{tabular}{@{}l|l@{}}
-\begin{cfa}
+\begin{cfa}[escapechar={}]
 digit( 3, 3 ) = "";
 digit( 4, 3 ) = "xyz";
 digit( 7, 0 ) = "***";
-digit(-4, 3 ) = "$\tt\$\$\$$";
-\end{cfa}
-&
-\begin{cfa}
-0126789
-0126xyz
-0126xyz
-012$\$\$\$$z
+digit(-4, 3 ) = "$$$";
+digit( 5 ) = "LLL";
+\end{cfa}
+&
+\begin{cfa}[escapechar={}]
+"0126789"
+"0126xyz"
+"0126xyz"
+"012$$$z"
+"012$$LLL"
 \end{cfa}
 \end{tabular}
 \end{cquote}
-A substring is treated as a pointer into the base (substringed) string rather than creating a copy of the subtext.
-Hence, if the referenced item is changed, then the pointer sees the change.
-Pointers to the result value of a substring operation are defined to always start at the same location in their base string as long as that starting location exists, independent of changes to themselves or the base string.
-However, if the base string value changes, this may affect the values of one or more of the substrings to that base string.
-If the base string value shortens so that its end is before the starting location of a substring, resulting in the substring starting location disappearing, the substring becomes a null string located at the end of the base string.
-
-The following example illustrates passing the results of substring operations by reference and by value to a subprogram.
-Notice the side-effects to other reference parameters as one is modified.
-\begin{cfa}
-main() {
-	string x = "xxxxxxxxxxxxx";
-	test( x, x(1,3), x(3,3), x(5,5), x(9,5), x(9,5) );
-}
-
-// x, a, b, c, & d are substring results passed by reference
-// e is a substring result passed by value
-void test(string &x, string &a, string &b, string &c, string &d, string e) {
-									$\C{//   x			  	  a	 	  b	 	  c		  d		  e}$
-	a( 1, 2 ) = "aaa";				$\C{// aaaxxxxxxxxxxx	aaax	axx		xxxxx	xxxxx	xxxxx}$
-	b( 2, 12 ) = "bbb";				$\C{// aaabbbxxxxxxxxx	aaab	abbb	bbxxx	xxxxx	xxxxx}$
-	c( 4, 5 ) = "ccc";				$\C{// aaabbbxcccxxxxxx	aaab	abbb	bbxccc	ccxxx	xxxxx}$
-	c = "yyy";						$\C{// aaabyyyxxxxxx	aaab	abyy	yyy		xxxxx	xxxxx}$
-	d( 1, 3 ) = "ddd";				$\C{// aaabyyyxdddxx	aaab	abyy	yyy		dddxx	xxxxx}$
-	e( 1, 3 ) = "eee";				$\C{// aaabyyyxdddxx	aaab	abyy	yyy		dddxx	eeexx}$
-	x = e;							$\C{// eeexx			eeex	exx		x				eeexx}$
-}
-\end{cfa}
-
-There is an assignment form of substring in which only the starting position is specified and the length is assumed to be the remainder of the string.
-\begin{cfa}
-string operator () (int start);
-\end{cfa}
-For example:
-\begin{cfa}
-s = name( 2 );						$\C{// s is assigned "ETER"}$
-name( 2 ) = "IPER";					$\C{// name is assigned "PIPER"}$
-\end{cfa}
-It is also possible to substring using a string as the index for selecting the substring portion of the string.
-\begin{cfa}
-string operator () (const string &index);
-\end{cfa}
-For example:
-\begin{cfa}[mathescape=false]
-digit( "xyz$\$\$\$$" ) = "678";	 	$\C{// digit is assigned "0156789"}$
-digit( "234") = "***";				$\C{// digit is assigned "0156789***"}$
-\end{cfa}
+Pattern matching is useful on the left-hand side of the assignment.
+\begin{cquote}
+\setlength{\tabcolsep}{15pt}
+\begin{tabular}{@{}l|l@{}}
+\begin{cfa}[escapechar={}]
+digit( "$$" ) = "345";
+digit( "LLL") = "6789";
+\end{cfa}
+&
+\begin{cfa}
+"012345LLL"
+"0123456789"
+\end{cfa}
+\end{tabular}
+\end{cquote}
+Extending the pattern to a regular expression is a possible extension.
 
 
@@ -481,4 +529,5 @@
 When an instance of the @from@ string is found and changed to the @to@ string, it is NOT examined again for further replacement.
 
+
 \subsection{Returning N+1 on Failure}
 
@@ -514,5 +563,5 @@
 
 To ease conversion from C to \CFA, there are companion @string@ routines for C strings.
-\VRef[Table]{t:CompanionStringRoutines} shows the C routines on the left that also work with @string@ and the rough equivalent @string@ opeation of the right.
+\VRef[Table]{t:CompanionStringRoutines} shows the C routines on the left that also work with @string@ and the rough equivalent @string@ operation on the right.
 Hence, it is possible to directly convert a block of C string operations into @string@ just by changing the 
 
@@ -552,4 +601,35 @@
 
 
+\subsection{Parameter Passing}
+
+A substring is treated as a pointer into the base (substringed) string rather than creating a copy of the subtext.
+Hence, if the referenced item is changed, then the pointer sees the change.
+Pointers to the result value of a substring operation are defined to always start at the same location in their base string as long as that starting location exists, independent of changes to themselves or the base string.
+However, if the base string value changes, this may affect the values of one or more of the substrings to that base string.
+If the base string value shortens so that its end is before the starting location of a substring, resulting in the substring starting location disappearing, the substring becomes a null string located at the end of the base string.
+
+The following example illustrates passing the results of substring operations by reference and by value to a subprogram.
+Notice the side-effects to other reference parameters as one is modified.
+\begin{cfa}
+main() {
+	string x = "xxxxxxxxxxxxx";
+	test( x, x(1,3), x(3,3), x(5,5), x(9,5), x(9,5) );
+}
+
+// x, a, b, c, & d are substring results passed by reference
+// e is a substring result passed by value
+void test(string &x, string &a, string &b, string &c, string &d, string e) {
+									$\C{//   x			  	  a	 	  b	 	  c		  d		  e}$
+	a( 1, 2 ) = "aaa";				$\C{// aaaxxxxxxxxxxx	aaax	axx		xxxxx	xxxxx	xxxxx}$
+	b( 2, 12 ) = "bbb";				$\C{// aaabbbxxxxxxxxx	aaab	abbb	bbxxx	xxxxx	xxxxx}$
+	c( 4, 5 ) = "ccc";				$\C{// aaabbbxcccxxxxxx	aaab	abbb	bbxccc	ccxxx	xxxxx}$
+	c = "yyy";						$\C{// aaabyyyxxxxxx	aaab	abyy	yyy		xxxxx	xxxxx}$
+	d( 1, 3 ) = "ddd";				$\C{// aaabyyyxdddxx	aaab	abyy	yyy		dddxx	xxxxx}$
+	e( 1, 3 ) = "eee";				$\C{// aaabyyyxdddxx	aaab	abyy	yyy		dddxx	eeexx}$
+	x = e;							$\C{// eeexx			eeex	exx		x				eeexx}$
+}
+\end{cfa}
+
+
 \subsection{Input/Output Operators}
 
@@ -559,5 +639,5 @@
 
 
-\section{Implementation Details}
+\section{Implementation}
 
 While \VRef[Figure]{f:StrApiCompare} emphasizes cross-language similarities, it elides many specific operational differences.
