Changeset 5e0b6657 for doc/theses


Ignore:
Timestamp:
Dec 8, 2025, 11:29:33 AM (3 months ago)
Author:
Michael Brooks <mlbrooks@…>
Branches:
master, stuck-waitfor-destruct
Children:
79ba50c
Parents:
8f448e0 (diff), 79ec8c3 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge remote-tracking branch 'refs/remotes/origin/master'

Location:
doc/theses
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • doc/theses/fangren_yu_MMath/test.adb

    r8f448e0 r5e0b6657  
    66        Function Random return Float is begin return 3.5; end;
    77        Function Random return Unbounded_String is begin return To_Unbounded_String( "abc" ); end;
    8        
     8
    99        Procedure Print( V : Integer ) is begin Put_Line( Integer'Image(V) ); end;
    1010        Procedure Print( V : Float ) is begin Put_Line( Float'Image(V) ); end;
    1111        Procedure Print( V : Unbounded_String ) is begin Put_Line( Ada.Strings.Unbounded.To_String(V) ); end;
    12        
     12
    1313        Function Func( V : Integer ) return Integer is begin return V; end;
    1414        Function Func( V : Float ) return Float is begin return V; end;
    1515        Function Func( V : Unbounded_String ) return Unbounded_String is begin return V; end;
    1616        Function Func( V1 : Integer; V2 : Float ) return Float is begin return Float(V1) + V2; end;
    17        
    18         subtype Int      is Integer;
     17
     18        subtype Int              is Integer;
    1919        J : Int;
    2020        Function "-"( L, R : Int ) return Int is begin Put_Line( "X" ); return Integer(L) + (-Integer(R)); end; --  prevent infinite recusion
    21        
    22 --      duplicate body for "-" declared at line 20
    23 --      subtype SInt is Integer range -10 .. 10;
    24 --      Function "-"( L, R : SInt ) return SInt is begin Put_Line( "X" ); return Integer(L) + (-Integer(R)); end; --  prevent infinite recusion
    25        
     21
     22        --      duplicate body for "-" declared at line 20
     23        --      subtype SInt is Integer range -10 .. 10;
     24        --      Function "-"( L, R : SInt ) return SInt is begin Put_Line( "X" ); return Integer(L) + (-Integer(R)); end; --  prevent infinite recusion
     25
    2626        i : Integer;
    2727        f : Float;
    2828        s : Unbounded_String;
    29        
     29
    3030        Type Complex is
    31         record
    32             Re, Im : Float;
    33         end record;
     31                record
     32                Re, Im : Float;
     33                end record;
    3434        c : Complex := (Re => 1.0, Im => 1.0);
    35     Procedure Grind (X : Complex) is begin Put_Line( "Grind1" ); end;
    36     Procedure Grind (X : Unbounded_String) is begin Put_Line( "Grind2" ); end;
    37        
     35        Procedure Grind (X : Complex) is begin Put_Line( "Grind1" ); end;
     36        Procedure Grind (X : Unbounded_String) is begin Put_Line( "Grind2" ); end;
     37
    3838        generic
    39            type T is private;
    40            with function "+"( X, Y: T ) return T;
     39                type T is private;
     40                with function "+"( X, Y: T ) return T;
    4141        function twice( X : T ) return T;
    42        
     42
    4343        function twice( X: T ) return T is
    4444        begin
    45            Put_Line( "XXX" ); return X + X;   -- The formal operator "*".
     45                Put_Line( "XXX" ); return X + X;        -- The formal operator "*".
    4646        end twice;
    4747
    4848        function Int_Twice is new Twice( Integer, "+" => "+" );
    4949        function float_Twice is new Twice( float, "+" => "+" );
    50        
    51 --      generic units cannot be overloaded
    52 --      generic
    53 --        type T is private;
    54 --        with function "+"( X, Y: T ) return T;
    55 --      function twice( X : T; Y : T ) return T;
    56        
    57 --      function twice( X: T; Y : T ) return T is
    58 --      begin
    59 --        Put_Line( "XXX" ); return X + X;   -- The formal operator "*".
    60 --      end twice;
     50
     51        --      generic units cannot be overloaded
     52        --      generic
     53        --              type T is private;
     54        --              with function "+"( X, Y: T ) return T;
     55        --      function twice( X : T; Y : T ) return T;
     56
     57        --      function twice( X: T; Y : T ) return T is
     58        --      begin
     59        --              Put_Line( "XXX" ); return X + X;        -- The formal operator "*".
     60        --      end twice;
    6161begin
    62    I := 3;
    63    I := 7 - 3;
    64         Print( i );
    65    F := 3.8;
    66    I := 7 - 2;
    67         Print( i );
    68    J := 7 - 3;
    69         Print( j );
    70    
     62        I := 3;
     63        I := 7 - 3;
     64        Print( i );
     65        F := 3.8;
     66        I := 7 - 2;
     67        Print( i );
     68        J := 7 - 3;
     69        Print( j );
     70
    7171        i := Random;
    72         Print( i );
     72        Print( i );
    7373        f := Random;
    74         Print( f );
     74        Print( f );
    7575        s := Random;
    76         Print( s );
    77        
    78         Print( Func( V => 7 ) );
    79         Print( Func( 7.5 ) );
    80         Print( Func( To_Unbounded_String( "abc" ) ) );
    81         Print( Func( 3, 3.5 ) );
    82 --      Print( Func( 3, 3 ) );
    83        
     76        Print( s );
     77
     78        Print( Func( V => 7 ) );
     79        Print( Func( 7.5 ) );
     80        Print( Func( To_Unbounded_String( "abc" ) ) );
     81        Print( Func( 3, 3.5 ) );
     82        --              Print( Func( 3, 3 ) );
     83
    8484        Grind( X => (Re => 1.0, Im => 1.0) );
    8585        Grind( c );
    8686        Grind( To_Unbounded_String( "abc" ) );
    87        
     87
    8888        i := Int_Twice( 2 );
    8989        Put_Line( Integer'Image(i) );
    9090        f := float_Twice( 2.5 );
    91         Print( f );
     91        Print( f );
    9292end test;
    9393
  • doc/theses/mike_brooks_MMath/intro.tex

    r8f448e0 r5e0b6657  
    22
    33All modern programming languages provide three high-level containers (collections): array, linked-list, and string.
    4 Often array is part of the programming language, while linked-list is built from (recursive) pointer types, and string from a combination of array and linked-list.
     4Often array is part of the programming language, while linked-lists are built from (recursive) pointer types, and strings from a combination of array and linked-list.
    55For all three types, languages and/or their libraries supply varying degrees of high-level mechanisms for manipulating these objects at the bulk and component level, such as copying, slicing, extracting, and iterating among elements.
    66
    77Unfortunately, these three aspects of C cause a significant number of memory errors~\cite{Oorschot23}.
     8Estimates suggest 50\%~\cite{Mendio24} of total reported open-source vulnerabilities occurring in C result from errors using these facilities (memory errors).
    89For operating system and browser vendors, who heavily use systems languages, 60\%--70\% of reported software vulnerabilities involved memory errors~\cite{Kehrer23}.
    910For Microsoft, 70\% of vulnerabilities addressed via security updates between 2006--2018 are memory safety issues~\cite[slide 10]{Miller19}.
    1011In a study of software exploits in the U.S. National Vulnerability Database over 2013--2017, the top reported vulnerability is (memory) buffer errors, among 19 vulnerability categories~\cite{Cifuentes19}.
    11 Therefore, hardening these three C types goes a long way to make the majority of C programs safer.
     12Therefore, hardening these three C types goes a long way to make the majority of C programs safer and eliminating major hacker attack-vectors.
    1213
    1314This work looks at extending these three foundational container types in the programming language \CFA, which is a new dialect of the C programming language.
     
    2627The array size can be static, dynamic but fixed after creation, or dynamic and variable after creation.
    2728For static and dynamic-fixed, an array can be stack allocated, while dynamic-variable requires the heap.
    28 Because array layout has contiguous components, subscripting is a computation (some form of pointer arithmetic).
     29Because array layout has contiguous components, subscripting is a computation, \ie some form of pointer arithmetic.
    2930
    3031C provides a simple array type as a language feature.
    31 However, it adopts the controversial language position of treating pointer and array as duals, leading to multiple problems.
     32However, it adopts the controversial position of treating pointer and array as duals, leading to multiple problems.
    3233
    3334
     
    3637A linked-list provides a homogeneous container often with $O(log N)$ or $O(N)$ access to elements using successor and predecessor operations that normally involve pointer chasing.
    3738Subscripting by value (rather than position or location as for array) is sometimes available, \eg hash table.
    38 Linked types are normally dynamically sized by adding and removing nodes using link fields internal or external to the elements (nodes).
    39 If a programming language allows pointers to stack storage, linked-list nodes can be allocated on the stack and connected with stack addresses (pointers);
     39Linked types are dynamically sized by adding and removing nodes using link fields internal or external to the list elements (nodes).
     40If a programming language allows pointers to stack storage, linked-list nodes can be allocated on the stack and connected with stack addresses;
    4041otherwise, elements are heap allocated with internal or external link fields.
    4142
     
    4748hash search table consisting of a key (string) with associated data (@<search.h>@)
    4849\end{itemize}
    49 Because these libraries are simple, awkward to use, and unsafe, C programmers commonly build bespoke linked data-structures.
     50Because these container libraries can be restrictive, awkward to use, and unsafe, C programmers often build bespoke linked data-structures, which further increases the potential for memory errors.
    5051
    5152
     
    5455A string provides a dynamic array of homogeneous elements, where the elements are (often) some form of human-readable characters.
    5556What differentiates a string from other types in that many of its operations work on groups of elements for scanning and changing, \eg @index@ and @substr@.
    56 While subscripting individual elements is usually available, working at the individual character level is considered poor practise, \ie underutilizing the powerful string operations.
     57While subscripting individual elements is usually available, working at the character level is considered poor practise, \ie underutilizing the powerful string operations.
    5758Therefore, the cost of a string operation is usually less important than the power of the operation to accomplish complex text manipulation, \eg search, analysing, composing, and decomposing string text.
    5859The dynamic nature of a string means storage is normally heap allocated but often implicitly managed, even in unmanaged languages.
    59 In some cases, string management is separate from heap management, \ie strings roll their own heap.
     60In many cases, string management is separate from heap management, \ie strings roll their own heap.
    6061
    6162The C string type is just a character array and requires user storage-management to handle varying size.
    62 The character array uses the convention of marking its (variable) array length by placing the 0-valued control character at the end (null-terminated).
     63C adopts a terminating sentinel character, @'\0'@, to mark the end of a variable-length character-array versus a preceding length field.
     64Hence, the sentinel character is excluded from a string and determining the string length is an $O(N)$ operation.
    6365The C standard library includes a number of high-level operations for working with this representation.
    64 Most of these operations are awkward and error prone.
     66Most of these operations are awkward to use and error prone.
    6567
    6668
     
    8082\begin{enumerate}[leftmargin=*]
    8183\item
    82 These three aspects of C are difficult to understand, teach, and get right because they are correspondingly low level.
     84The three primary container types in C are difficult to understand, teach, and get right because they are too low level.
    8385Providing higher-level, feature-rich versions of these containers in \CFA is a major component of the primary goal.
     86The result is a simplify programming experience, which increases productivity and maintainability.
    8487\item
    85 These three aspects of C cause the greatest safety issues because there are few or no safe guards when a programmer misunderstands or misuses these features~\cite{Elliott18, Blache19, Ruef19, Oorschot23}.
    86 Estimates suggest 50\%~\cite{Mendio24} of total reported open-source vulnerabilities occurring in C result from errors using these facilities (memory errors), providing the major hacker attack-vectors.
     88The new container types must be as correct and safe to use as those in other modern programming languages, which has been shown to a primary concern of industry, government, and military~\cite{ONCD}.
     89Prior approaches focus on out-of-bound array accesses using a model-based approach (ASCET) in embedded systems (\eg cars)~\cite{Blache19}, and general and null-terminated string arrays using a \CC template syntax (Checked C)~\cite{Elliott18,Ruef19}.
     90Both White House~\cite{WhiteHouse24} and DARPA~\cite{DARPA24} recently released a recommendation to \emph{move away} from C and \CC, because of cybersecurity threats exploiting vulnerabilities in these languages.
     91Fixing these vulnerabilities negates this need, allowing C and its ecosystem to continue into the future.
    8792\end{enumerate}
    88 Both White House~\cite{WhiteHouse24} and DARPA~\cite{DARPA24} recently released a recommendation to move away from C and \CC, because of cybersecurity threats exploiting vulnerabilities in these older languages.
    89 Hardening these three types goes a long way to make the majority of C programs safer.
    9093
    91 
    92 While multiple new languages purport to be systems languages replacing C, the reality is that rewriting massive C code-bases is impractical and a non-starter if the new runtime uses garage collection.
     94While new languages, \eg Go, Rust, Swift, purport to be systems languages replacing C, the reality is that rewriting massive C code-bases is impractical and a non-starter if the runtime uses garbage collection.
     95Even assuming automated conversion of C programs to other safe languages, who will maintain this massive new code-base?
    9396Furthermore, new languages must still interact with the underlying C operating system through fragile, type-unsafe, interlanguage-communication.
    94 Switching to \CC is equally impractical as its complex and interdependent type-system (\eg objects, overloading, inheritance, templates) means idiomatic \CC code is difficult to use from C, and C programmers must expend significant effort learning new \CC.
    95 Hence, rewriting and retraining costs for these languages can be prohibitive for companies with a large C software-base (Google, Apple, Microsoft, Amazon, AMD, Nvidia).
     97Switching to \CC is equally impractical as its complex and interdependent type-system (\eg objects, overloading, inheritance, templates) means idiomatic \CC code is difficult to use from C, and C programmers must expend significant effort learning new \CC features and idioms.
     98Finally, projects claiming to be modern C replacements (Cyclone~\cite{Grossman06}, Polymorphic C~\cite{Smith98}, GObject~\cite{GObject}) do not retain C backwards compatibility in syntax, programing model, or semantic compatibility;
     99these languages are equivalent to switching to a different programming language.
     100Hence, rewriting and retraining costs for other languages are prohibitive when companies have a large C software-base (Google, Apple, Microsoft, Amazon, AMD, Nvidia).
    96101
    97102
     
    100105Like many established programming languages, C has a standards committee and multiple ANSI/\-ISO language manuals~\cite{C99,C11,C18,C23}.
    101106However, most programming languages are only partially explained by their standard's manual(s).
    102 When it comes to explaining how C works, the definitive source is the @gcc@ compiler, which is mimicked by other C compilers, such as Clang~\cite{clang}.
    103 Often other C compilers \emph{must} mimic @gcc@ because a large part of the C library (runtime) system (@glibc@ on Linux) contains @gcc@ features.
     107When it comes to explaining how C works, the definitive source is the @gcc@ compiler, which is mimicked by other C compilers, such as Clang~\cite{clang}, because a large part of the C library (runtime) system (@glibc@ on Linux) contains @gcc@ features.
    104108Some key aspects of C need to be explained and understood by quoting from the language reference manual.
    105109However, to illustrate actual program semantics, this thesis constructs multiple small programs whose behaviour exercises a particular point and then confirms the behaviour by running the program using multiple @gcc@ compilers.
    106110These example programs show
    107 \begin{itemize}
     111\begin{itemize}[itemsep=0pt]
    108112        \item if the compiler accepts or rejects certain syntax,
    109113        \item prints output to buttress a behavioural claim,
     
    111115\end{itemize}
    112116These programs are tested across @gcc@ versions 8--14 and clang versions 10--14 running on ARM, AMD, and Intel architectures.
    113 Any discovered anomalies among compilers, versions, or architectures is discussed.
     117Any discovered anomalies among compilers, versions, or architectures are discussed.
    114118In general, it is never clear whether the \emph{truth} lies in the C standard or the compiler(s), which may be true for other programming languages.
    115119
     
    118122
    119123Overall, this work has produced significant syntactic and semantic improvements to C's arrays, linked-lists and string types.
    120 As well, a strong plan for general iteration has been sketched out.
     124% As well, a strong plan for general iteration has been sketched out.
    121125The following are the detailed contributions, where performance and safety were always the motivating factors.
    122126
    123127\subsection{Array}
    124128
    125 This work's array improvements are:
     129The improvements to C arrays are:
    126130\begin{enumerate}[leftmargin=*]
    127131\item
    128 Introduce a small number of subtle changes to the typing rules for the C array, while still achieving significant backwards compatibility
     132Introduce a small number of subtle changes to the typing rules for the C array, while still achieving significant backwards compatibility.
    129133\item
    130134Create a new polymorphic mechanism in the \CFA @forall@ clause to specify array dimension values, similar to a fixed-typed parameter in a \CC \lstinline[language=C++]{template}.
  • doc/theses/mike_brooks_MMath/uw-ethesis-frontpgs.tex

    r8f448e0 r5e0b6657  
    129129\begin{center}\textbf{Abstract}\end{center}
    130130
     131\CFA strives to fix mistakes in C, chief among them, safety.
     132This thesis presents a significant step forward in \CFA's goal to remove unsafe pointer operations.
     133The thesis presents improvements to the \CFA language design, both syntax and semantics, to support advanced container features.
     134These features are implemented across the \CFA compiler, libraries, and runtime system.
     135The results maintain another \CFA goal of remaining 99\% backwards compatible with C.
     136This thesis leverages preexisting work within the compiler's type and runtime systems generated by prior students working on the \CFA project.
     137
    131138All modern programming languages provide three high-level containers (collections): array, linked-list, and string.
    132 Often array is part of the programming language, while linked-list is built from (recursive) pointer types, and string from a combination of array and linked-list.
     139Often array is part of the programming language, while linked-lists are built from (recursive) pointer types, and strings from a combination of array and linked-list.
    133140For all three types, languages and/or their libraries supply varying degrees of high-level mechanisms for manipulating these objects at the bulk and component level, such as copying, slicing, extracting, and iterating among elements.
    134 Unfortunately, these three aspects of C cause 60\%--70\% of the reported software vulnerabilities involved memory errors, and 70\%--80\% of hacker attack-vectors target these types.
     141Unfortunately, these three aspects of C cause 60\%--70\% of the reported software vulnerabilities involving memory errors, and 70\%--80\% of hacker attack-vectors target these types.
    135142Therefore, hardening these three C types goes a long way to make the majority of C programs safer.
    136143
    137 This work looks at extending these three foundational container types in the programming language \CFA, which is a new dialect of the C programming language.
    138 The thesis describes improvements made to the \CFA language design, both syntax and semantics, to support the container features, and the source code created within the \CFA compiler, libraries, and runtime system to implement these features.
    139 This work leverages preexisting work within the compiler's type and runtime systems generated by prior students working on the \CFA project.
    140 
    141 Overall, this work has produced significant syntactic and semantic improvements to C's container types.
    142 \begin{enumerate}[leftmargin=*]
    143 \item
    144 Introduce a small number of subtle changes to the typing rules for the C array, while still achieving significant backwards compatibility.
    145 \item
    146 Create a new polymorphic mechanism in the \CFA @forall@ clause to specify array dimension values, similar to a fixed-typed parameter in a \CC \lstinline[language=C++]{template}.
    147 The new array type, enabled by prior features, defines an array with guaranteed runtime bound checks (often optimizer-removable) and implicit (guaranteed accurate) inter-function length communication.
    148 \item
    149 Create a new polymorphic list type and its runtime library following the established design pattern of intrusive link-fields for performance reasons, especially in concurrent programs.
    150 \item
    151 Create a new string type and runtime library comparable to the \CC @string@ type, including analogous coexistence with raw-character pointers, enabling programs to work with strings by value, without incurring excessive copying.
    152 Substrings are supported, including the ability for overlapping ranges to share edits transparently.
    153 \end{enumerate}
    154 The thesis includes a performance evaluation that shows the new \CFA containers perform comparably with their C counterparts in many programming cases.
     144Specifically, an array utility is provided that tracks length internally, relieving the user of managing explicit length parameters and stopping buffer-overrun errors.
     145This feature requires augmenting the \CFA type system, making array length available at compile and runtime.
     146A linked-list utility is provided, which obviates many explicit recursive pointers by catering directly to system-programming uses (intrusive lists) for which a library solution is often dismissed.
     147Finally, a string utility is provided with implicit memory management of text in a specialized heap, relieving error-prone buffer management, including overrun, and providing a copy-on-write speed boost.
     148For all three utilities, performance is argued to be on-par with, and occasionally surpassing relevant comparators.
     149With the array, this case is made by showing complete erasure down to a naked C array, modulo runtime bound checks, which are removable more often than with Java-style length management.
     150With the linked list and string, empirical measures are compared with relevant libraries.
     151These utilities offer a system programmer workable alternatives to hand-rolling several common causes of system vulnerabilities, thereby improving \CFA's position as a safety-forward system-programming alternative.
    155152
    156153\cleardoublepage
Note: See TracChangeset for help on using the changeset viewer.