Ignore:
Timestamp:
Apr 6, 2025, 10:41:07 PM (9 months ago)
Author:
Peter A. Buhr <pabuhr@…>
Branches:
master
Children:
ed5023d1
Parents:
96a11655
Message:

more string API updates

File:
1 edited

Legend:

Unmodified
Added
Removed
  • doc/theses/mike_brooks_MMath/string.tex

    r96a11655 r56ec508  
    33\vspace*{-20pt}
    44This chapter presents my work on designing and building a modern string type in \CFA.
    5 The discussion starts with examples of interesting string problems, followed by examples of how these issues are resolved in my design.
     5The discussion starts with an overview of string API, then a number of interesting string problems, followed by how these issues are resolved in this work.
    66
    77
    88\section{String Operations}
    99
    10 To prepare for the following discussion, comparisons among C, \CC, Java and \CFA strings are presented, beginning in \VRef[Figure]{f:StrApiCompare}.
    11 It provides a classic ``cheat sheet'' presentation, summarizing the names of the most-common closely-equivalent operations.
     10\VRef[Figure]{f:StrApiCompare} shows a general comparison of string APIs for C, \CC, Java and \CFA.
     11It provides a classic ``cheat sheet'', summarizing the names of the most-common closely-equivalent operations.
    1212The over-arching commonality is that operations work on groups of characters for assigning, copying, scanning, and updating.
    1313
     
    3131\end{tabular}
    3232\end{cquote}
    33 \caption{Comparison of languages' strings, API/``cheat-sheet'' perspective.}
     33\caption{Language comparison of string API}
    3434\label{f:StrApiCompare}
    3535\end{figure}
     
    4242int open( @const char * pathname@, int flags );
    4343string fname{ "test.cc" };
    44 open( fname.@c_str()@, O_RDONLY );
     44open( fname.@c_str()@, O_RDONLY );              // null terminated value of string
    4545\end{cfa}
    4646Here, the \CC @c_str@ function does not create a new null-terminated C string from the \CC string, as that requires passing ownership of the C string to the caller for eventual deletion.\footnote{
     
    7272\begin{cfa}
    7373#include @<string.hfa>@
    74 @string@ s, name, digit, alpha, punctuation, ifstmt;
     74@string@ s = "abcde", name = "MIKE", digit, alpha, punctuation, ifstmt;
     75const char cs[] = "abc";
    7576int i;
    76 name  = "MIKE";
    7777digit  = "0123456789";
    7878punctuation = "().,";
     
    8686The ability to convert from internal (machine) to external (human) format is useful in situations other than I/O.
    8787Hence, the basic types @char@, @char *@, @int@, @double@, @_Complex@, including any signedness and size variations, implicitly convert to type @string@.
    88 \VRef[Figure]{f:ImplicitConversionsString} shows examples of implicit conversions.
    89 Conversions can be explicitly specified using a compound literal:
    90 \begin{cfa}
    91 s = (string){ "abc" };                          $\C{// converts char * to string}$
    92 s = (string){ 5 };                                      $\C{// converts int to string}$
    93 s = (string){ 5.5 };                            $\C{// converts double to string}$
    94 \end{cfa}
    95 Conversions from @string@ to @char *@, attempt to be safe:
    96 either by requiring the maximum length of the @char *@ storage (@strncpy@) or allocating the @char *@ storage for the string characters (ownership), meaning the programmer must free the storage.
    97 As well, a C string is always null terminates, implying a minimum size of 1 character.
    9888\begin{cquote}
    9989\setlength{\tabcolsep}{15pt}
    100 \begin{tabular}{@{}l|l@{}}
    101 \begin{cfa}
    102 string s = "abcde";
    103 char cs[3];
    104 strncpy( cs, s, sizeof(cs) );
    105 char * cp = s;
    106 delete( cp );
    107 cp = s + ' ' + s;
    108 delete( cp );
    109 \end{cfa}
    110 &
    111 \begin{cfa}
    112 "abcde"
    113 
    114 "ab\0", in place
    115 "abcde\0", malloc
    116 
    117 "abcde abcde\0", malloc
    118 
    119 \end{cfa}
    120 \end{tabular}
    121 \end{cquote}
    122 
    123 \begin{figure}
    124 \begin{tabular}{@{}l|l@{}}
    125 \setlength{\tabcolsep}{15pt}
     90\begin{tabular}{@{}l|ll|l@{}}
    12691\begin{cfa}
    12792//      string s = 5;
    128         string s;
    129         // conversion of char and char * to string
    13093        s = 'x';
    13194        s = "abc";
    132         char cs[] = "abc";
    13395        s = cs;
    134         // conversion of integral, floating-point, and complex to string
    13596        s = 45hh;
    13697        s = 45h;
    137         s = -(ssize_t)MAX - 1;
     98\end{cfa}
     99&
     100\begin{cfa}
     101
     102"x"
     103"abc"
     104"abc"
     105"45"
     106"45"
     107\end{cfa}
     108&
     109\begin{cfa}
     110        s = (ssize_t)MIN;
    138111        s = (size_t)MAX;
    139112        s = 5.5;
     
    144117&
    145118\begin{cfa}
    146 
    147 
    148 
    149 "x"
    150 "abc"
    151 
    152 "abc"
    153 
    154 "45"
    155 "45"
    156119"-9223372036854775808"
    157120"18446744073709551615"
     
    162125\end{cfa}
    163126\end{tabular}
    164 \caption{Implicit Conversions to String}
    165 \label{f:ImplicitConversionsString}
    166 \end{figure}
    167 
    168 
    169 \subsection{Length}
    170 
    171 The @len@ operation returns the length of a string using prefix call.
     127\end{cquote}
     128Conversions can be explicitly specified using a compound literal.
     129\begin{cfa}
     130s = (string){ "abc" };                          $\C{// converts char * to string}$
     131s = (string){ 5 };                                      $\C{// converts int to string}$
     132s = (string){ 5.5 };                            $\C{// converts double to string}$
     133\end{cfa}
     134Conversions from @string@ to @char *@ attempt to be safe:
     135either by requiring the maximum length of the @char *@ storage (@strncpy@) or allocating the @char *@ storage for the string characters (ownership), meaning the programmer must free the storage.
     136Note, a C string is always null terminated, implying a minimum size of 1 character.
    172137\begin{cquote}
    173138\setlength{\tabcolsep}{15pt}
    174139\begin{tabular}{@{}l|l@{}}
    175140\begin{cfa}
    176 const char * cs = "abc";
    177 i = ""`len;
    178 i = "abc"`len;
    179 i = cs`len;
    180 i = name`len;
    181 \end{cfa}
    182 &
    183 \begin{cfa}
    184 
     141strncpy( cs, s, sizeof(cs) );
     142char * cp = s;
     143delete( cp );
     144cp = s + ' ' + s;
     145delete( cp );
     146\end{cfa}
     147&
     148\begin{cfa}
     149"abc\0", in place
     150"abcde\0", malloc
     151ownership
     152"abcde abcde\0", malloc
     153ownership
     154\end{cfa}
     155\end{tabular}
     156\end{cquote}
     157
     158
     159\subsection{Length}
     160
     161The @len@ operation (short for @strlen@) returns the length of a C or \CFA string.
     162For consistency, @strlen@ also works with \CFA strings.
     163\begin{cquote}
     164\setlength{\tabcolsep}{15pt}
     165\begin{tabular}{@{}l|l@{}}
     166\begin{cfa}
     167i = len( "" );
     168i = len( "abc" );
     169i = len( cs );
     170i = strlen( cs );
     171i = len( name );
     172i = strlen( name );
     173\end{cfa}
     174&
     175\begin{cfa}
    1851760
    1861773
    1871783
     1793
     1804
    1881814
    189182\end{cfa}
     
    195188
    196189The binary relational, @<@, @<=@, @>@, @>=@, and equality, @==@, @!=@, operators compare strings using lexicographical ordering, where longer strings are greater than shorter strings.
    197 C strings use function @strcmp@, as the relational/equality operators compare C string pointers not their values, which does not match normal programmer expectation.
     190C strings use function @strcmp@, as the relational/equality operators compare C string pointers not their values, which does not match programmer expectation.
    198191
    199192
    200193\subsection{Concatenation}
    201194
    202 The binary operators @+@ and @+=@ concatenate two strings, creating the sum of the strings.
     195The binary operators @+@ and @+=@ concatenate characters, C strings and \CFA strings, creating the sum of the characters.
     196\begin{cquote}
     197\begin{tabular}{@{}l|l@{\hspace{25pt}}l|l@{\hspace{25pt}}l|l@{}}
     198\begin{cfa}
     199s = "";
     200s = 'a' + 'b';
     201s = 'a' + "b";
     202s = "a" + 'b';
     203s = "a" + "b";
     204\end{cfa}
     205&
     206\begin{cfa}
     207
     208"ab"
     209"ab"
     210"ab"
     211"ab"
     212\end{cfa}
     213&
     214\begin{cfa}
     215s = "";
     216s = 'a' + 'b' + s;
     217s = 'a' + 'b' + s;
     218s = 'a' + "b" + s;
     219s = "a" + 'b' + s;
     220\end{cfa}
     221&
     222\begin{cfa}
     223
     224"ab"
     225"abab"
     226"ababab"
     227"abababab"
     228\end{cfa}
     229&
     230\begin{cfa}
     231s = "";
     232s = s + 'a' + 'b';
     233s = s + 'a' + "b";
     234s = s + "a" + 'b';
     235s = s + "a" + "b";
     236\end{cfa}
     237&
     238\begin{cfa}
     239
     240"ab"
     241"abab"
     242"ababab"
     243"abababab"
     244\end{cfa}
     245\end{tabular}
     246\end{cquote}
     247For these operations to meet programmer expectations, \CFA introduces two backward incompatibilities with C.
     248Note, subtracting pointers or characters has a low-level use case.
     249\begin{cfa}
     250ch - '0'    $\C[2in]{// find character offset}$
     251cp1 - cp2;  $\C{// find pointer offset}\CRT$
     252\end{cfa}
     253However, there is no obvious use case for addition.
     254\begin{cfa}
     255ch + 'b'    $\C[2in]{// add character values}$
     256cp1 + 'a';  $\C{// move pointer cp1['a']}\CRT$
     257\end{cfa}
     258Adding character values or advancing a pointer with a character are unusual operations, and hence, unlikely to exist in C programs.
     259Stealing these two cases for use with strings, allows all combinations of concatenation among @char@, @char *@, and @string@.
     260Note, stealing only occurs if a program includes @string.hfa@, resulting in ambiguities in existing C code where there is no way to disambiguate.
     261\begin{cfa}
     262ch = 'a' + 'b'; $\C[2in]{// LHS disambiguate, add character values}$
     263s = 'a' + 'b'; $\C{// LHS disambiguate, concatenate characters}$
     264sout | 'a' + 'b'; $\C{// ambiguous with string.hfa, add or concatenate?}$
     265sout | (char)'a' + 'b'; $\C{// disambiguate}$
     266sout | "a" + "b"; $\C{// disambiguate}\CRT$
     267\end{cfa}
     268Again, this scenario is extremely rare, as adding characters is meaningless.
     269
     270\CC cannot support this generality because it does not use the left-hand side of assignment in expression resolution.
     271While it can special case some combinations:
     272\begin{c++}
     273s = 'a' + s; $\C[2in]{// compiles in C++}$
     274s = "a" + s;
     275\end{c++}
     276it cannot generalize to any number of steps:
     277\begin{c++}
     278s = 'a' + 'b' + s; $\C{// does not compile in C++}\CRT$
     279s = "a" + "b" + s;
     280\end{c++}
     281
     282
     283\subsection{Repetition}
     284
     285The binary operators @*@ and @*=@ repeat a string $N$ times.
     286If $N = 0$, a zero length string, @""@, is returned.
     287Like concatenation, multiplication is stolen for @char@;
     288multiplication for pointers does not exist in C.
    203289\begin{cquote}
    204290\setlength{\tabcolsep}{15pt}
    205291\begin{tabular}{@{}l|l@{}}
    206292\begin{cfa}
    207 s = name + ' ' + digit;
    208 s += name;
    209 s = s + 'a' + 'b';
    210 s = s + "a" + "abc";
    211 s = 'a' + 'b' + s;
    212 s = "a" + "abc" + s;
    213 \end{cfa}
    214 &
    215 \begin{cfa}
    216 "MIKE 0123456789"
    217 "MIKE 0123456789MIKE"
    218 
    219 
    220 $\CC$ unsupported
    221 $\CC$ unsupported
    222 \end{cfa}
    223 \end{tabular}
    224 \end{cquote}
    225 The \CFA type-system allows full  commutativity with character and C strings;
    226 \CC does not.
    227 
    228 
    229 \subsection{Repetition}
    230 
    231 The binary operators @*@ and @*=@ repeat a string $N$ times.
    232 If $N = 0$, a zero length string, @""@ is returned.
    233 \begin{cquote}
    234 \setlength{\tabcolsep}{15pt}
    235 \begin{tabular}{@{}l|l@{}}
    236 \begin{cfa}
    237293s = 'x' * 3;
    238294s = "abc" * 3;
     
    241297&
    242298\begin{cfa}
    243 xxx
    244 abcabcabc
    245 MIKE MIKE MIKE
     299"xxx"
     300"abcabcabc"
     301"MIKE MIKE MIKE "
    246302\end{cfa}
    247303\end{tabular}
     
    250306
    251307\subsection{Substring}
    252 The substring operation returns a subset of the string starting at a position in the string and traversing a length.
     308The substring operation returns a subset of a string starting at a position in the string and traversing a length or matching a pattern string.
    253309\begin{cquote}
    254 \setlength{\tabcolsep}{15pt}
    255 \begin{tabular}{@{}l|l@{}}
     310\setlength{\tabcolsep}{10pt}
     311\begin{tabular}{@{}l|ll|l@{}}
    256312\begin{cfa}
    257313s = name( 2, 2 );
     
    265321\begin{cfa}
    266322"KE"
    267 "IK", length is opposite direction
    268 "KE", length is clipped to 2
    269 "", beyond string so clipped to null
    270 "K", start $and$ length are negative
     323"IK"
     324"KE", clipped length to 2
     325"", beyond string clipped to null
     326"K"
    271327"IKE", to end of string
     328\end{cfa}
     329&
     330\begin{cfa}
     331s = name( "IK" );
     332s = name( "WW" );
     333
     334
     335
     336
     337\end{cfa}
     338&
     339\begin{cfa}
     340"IK"
     341""
     342
     343
     344
     345
    272346\end{cfa}
    273347\end{tabular}
     
    277351If the substring request extends beyond the beginning or end of the string, it is clipped (shortened) to the bounds of the string.
    278352If the substring request is completely outside of the original string, a null string is returned.
     353The pattern form returns either the pattern string if the pattern matches or a null string if the pattern does not match.
     354This mechanism is discussed next.
    279355
    280356The substring operation can also appear on the left side of an assignment and is replaced by the string value on the right side.
     
    284360\setlength{\tabcolsep}{15pt}
    285361\begin{tabular}{@{}l|l@{}}
    286 \begin{cfa}
     362\begin{cfa}[escapechar={}]
    287363digit( 3, 3 ) = "";
    288364digit( 4, 3 ) = "xyz";
    289365digit( 7, 0 ) = "***";
    290 digit(-4, 3 ) = "$\tt\$\$\$$";
    291 \end{cfa}
    292 &
    293 \begin{cfa}
    294 0126789
    295 0126xyz
    296 0126xyz
    297 012$\$\$\$$z
     366digit(-4, 3 ) = "$$$";
     367digit( 5 ) = "LLL";
     368\end{cfa}
     369&
     370\begin{cfa}[escapechar={}]
     371"0126789"
     372"0126xyz"
     373"0126xyz"
     374"012$$$z"
     375"012$$LLL"
    298376\end{cfa}
    299377\end{tabular}
    300378\end{cquote}
    301 A substring is treated as a pointer into the base (substringed) string rather than creating a copy of the subtext.
    302 Hence, if the referenced item is changed, then the pointer sees the change.
    303 Pointers to the result value of a substring operation are defined to always start at the same location in their base string as long as that starting location exists, independent of changes to themselves or the base string.
    304 However, if the base string value changes, this may affect the values of one or more of the substrings to that base string.
    305 If the base string value shortens so that its end is before the starting location of a substring, resulting in the substring starting location disappearing, the substring becomes a null string located at the end of the base string.
    306 
    307 The following example illustrates passing the results of substring operations by reference and by value to a subprogram.
    308 Notice the side-effects to other reference parameters as one is modified.
    309 \begin{cfa}
    310 main() {
    311         string x = "xxxxxxxxxxxxx";
    312         test( x, x(1,3), x(3,3), x(5,5), x(9,5), x(9,5) );
    313 }
    314 
    315 // x, a, b, c, & d are substring results passed by reference
    316 // e is a substring result passed by value
    317 void test(string &x, string &a, string &b, string &c, string &d, string e) {
    318                                                                         $\C{//   x                                a               b               c               d               e}$
    319         a( 1, 2 ) = "aaa";                              $\C{// aaaxxxxxxxxxxx   aaax    axx             xxxxx   xxxxx   xxxxx}$
    320         b( 2, 12 ) = "bbb";                             $\C{// aaabbbxxxxxxxxx  aaab    abbb    bbxxx   xxxxx   xxxxx}$
    321         c( 4, 5 ) = "ccc";                              $\C{// aaabbbxcccxxxxxx aaab    abbb    bbxccc  ccxxx   xxxxx}$
    322         c = "yyy";                                              $\C{// aaabyyyxxxxxx    aaab    abyy    yyy             xxxxx   xxxxx}$
    323         d( 1, 3 ) = "ddd";                              $\C{// aaabyyyxdddxx    aaab    abyy    yyy             dddxx   xxxxx}$
    324         e( 1, 3 ) = "eee";                              $\C{// aaabyyyxdddxx    aaab    abyy    yyy             dddxx   eeexx}$
    325         x = e;                                                  $\C{// eeexx                    eeex    exx             x                               eeexx}$
    326 }
    327 \end{cfa}
    328 
    329 There is an assignment form of substring in which only the starting position is specified and the length is assumed to be the remainder of the string.
    330 \begin{cfa}
    331 string operator () (int start);
    332 \end{cfa}
    333 For example:
    334 \begin{cfa}
    335 s = name( 2 );                                          $\C{// s is assigned "ETER"}$
    336 name( 2 ) = "IPER";                                     $\C{// name is assigned "PIPER"}$
    337 \end{cfa}
    338 It is also possible to substring using a string as the index for selecting the substring portion of the string.
    339 \begin{cfa}
    340 string operator () (const string &index);
    341 \end{cfa}
    342 For example:
    343 \begin{cfa}[mathescape=false]
    344 digit( "xyz$\$\$\$$" ) = "678";         $\C{// digit is assigned "0156789"}$
    345 digit( "234") = "***";                          $\C{// digit is assigned "0156789***"}$
    346 \end{cfa}
     379Pattern matching is useful on the left-hand side of the assignment.
     380\begin{cquote}
     381\setlength{\tabcolsep}{15pt}
     382\begin{tabular}{@{}l|l@{}}
     383\begin{cfa}[escapechar={}]
     384digit( "$$" ) = "345";
     385digit( "LLL") = "6789";
     386\end{cfa}
     387&
     388\begin{cfa}
     389"012345LLL"
     390"0123456789"
     391\end{cfa}
     392\end{tabular}
     393\end{cquote}
     394Extending the pattern to a regular expression is a possible extension.
    347395
    348396
     
    481529When an instance of the @from@ string is found and changed to the @to@ string, it is NOT examined again for further replacement.
    482530
     531
    483532\subsection{Returning N+1 on Failure}
    484533
     
    514563
    515564To ease conversion from C to \CFA, there are companion @string@ routines for C strings.
    516 \VRef[Table]{t:CompanionStringRoutines} shows the C routines on the left that also work with @string@ and the rough equivalent @string@ opeation of the right.
     565\VRef[Table]{t:CompanionStringRoutines} shows the C routines on the left that also work with @string@ and the rough equivalent @string@ operation on the right.
    517566Hence, it is possible to directly convert a block of C string operations into @string@ just by changing the
    518567
     
    552601
    553602
     603\subsection{Parameter Passing}
     604
     605A substring is treated as a pointer into the base (substringed) string rather than creating a copy of the subtext.
     606Hence, if the referenced item is changed, then the pointer sees the change.
     607Pointers to the result value of a substring operation are defined to always start at the same location in their base string as long as that starting location exists, independent of changes to themselves or the base string.
     608However, if the base string value changes, this may affect the values of one or more of the substrings to that base string.
     609If the base string value shortens so that its end is before the starting location of a substring, resulting in the substring starting location disappearing, the substring becomes a null string located at the end of the base string.
     610
     611The following example illustrates passing the results of substring operations by reference and by value to a subprogram.
     612Notice the side-effects to other reference parameters as one is modified.
     613\begin{cfa}
     614main() {
     615        string x = "xxxxxxxxxxxxx";
     616        test( x, x(1,3), x(3,3), x(5,5), x(9,5), x(9,5) );
     617}
     618
     619// x, a, b, c, & d are substring results passed by reference
     620// e is a substring result passed by value
     621void test(string &x, string &a, string &b, string &c, string &d, string e) {
     622                                                                        $\C{//   x                                a               b               c               d               e}$
     623        a( 1, 2 ) = "aaa";                              $\C{// aaaxxxxxxxxxxx   aaax    axx             xxxxx   xxxxx   xxxxx}$
     624        b( 2, 12 ) = "bbb";                             $\C{// aaabbbxxxxxxxxx  aaab    abbb    bbxxx   xxxxx   xxxxx}$
     625        c( 4, 5 ) = "ccc";                              $\C{// aaabbbxcccxxxxxx aaab    abbb    bbxccc  ccxxx   xxxxx}$
     626        c = "yyy";                                              $\C{// aaabyyyxxxxxx    aaab    abyy    yyy             xxxxx   xxxxx}$
     627        d( 1, 3 ) = "ddd";                              $\C{// aaabyyyxdddxx    aaab    abyy    yyy             dddxx   xxxxx}$
     628        e( 1, 3 ) = "eee";                              $\C{// aaabyyyxdddxx    aaab    abyy    yyy             dddxx   eeexx}$
     629        x = e;                                                  $\C{// eeexx                    eeex    exx             x                               eeexx}$
     630}
     631\end{cfa}
     632
     633
    554634\subsection{Input/Output Operators}
    555635
     
    559639
    560640
    561 \section{Implementation Details}
     641\section{Implementation}
    562642
    563643While \VRef[Figure]{f:StrApiCompare} emphasizes cross-language similarities, it elides many specific operational differences.
Note: See TracChangeset for help on using the changeset viewer.