Context Navigation

array.hfa@ 80e83b6c

Visit:

Last change on this file since 80e83b6c was eb0d9b7, checked in by Michael Brooks <mlbrooks@…>, 5 weeks ago

Improve libcfa-array's bound-check removal and write that thesis section.

The libcfa change adds a more performant alternative for a subset of multidimensional indexing cases that were already functionally correct.
That the new alternative is more performant is not shown in the test suite.
There is an associated new high-performance option for passing an array-or-slice to a function.
The added test cases cover those options.

The added in-thesis demos rely on the new more-performant alternative for multidimensional indexing.

Property mode set to 100644

File size: 13.5 KB

Rev	Line
[a5e26821]	1	#pragma once
	2
[c7625e0]	3
	4
[6e50a6b]	5	forall( __CFA_tysys_id_only_X & ) struct tag {};
		// tag(T): an empty struct parameterized by a type (or by a dimension, as in tag(N) below).
		// It has no fields, so a tag value only carries its parameter at the type level.
		// ttag(T): an expression that builds a tag(T) value from an empty compound literal.
[c7625e0]	6	#define ttag(T) ((tag(T)){})
		// ztag(n): defined as a plain alias for ttag(n).
[6e50a6b]	7	#define ztag(n) ttag(n)
[c7625e0]	8
[fee4436]	9	#ifdef __CFA_DEBUG__
		// Debug builds: bound-check a subscript and abort with a diagnostic when it is outside [0, len).
		//   subcheck  - for signed subscripts; also rejects negative values.
		//   subchecku - for unsigned subscripts; only the upper bound can be violated.
		// arr is used only in the diagnostic.  sub and len may be evaluated more than once,
		// so pass expressions without side effects.
		// Each check is wrapped in do { ... } while (0) so that it expands to exactly one statement,
		// like the release-build versions below; a bare `if` could capture a following `else`.
[8ee211d]	10	#define subcheck( arr, sub, len ) do { \
	11	if ( (sub) < 0 || (sub) >= (len) ) \
[1f6623c]	12	abort( "Subscript %ld exceeds dimension range [0,%zu) for array %p.\n", \
[8ee211d]	13	(sub), (len), (arr) ); \
		} while (0)
		// The unsigned subscript is printed with %lu: the callers in this file pass an unsigned long int.
	14	#define subchecku( arr, sub, len ) do { \
	15	if ( (sub) >= (len) ) \
[1f6623c]	16	abort( "Subscript %lu exceeds dimension range [0,%zu) for array %p.\n", \
[8ee211d]	17	(sub), (len), (arr) ); \
		} while (0)
[fee4436]	18	#else
		// Release builds: both checks expand to an empty statement and evaluate none of their arguments.
[8ee211d]	19	#define subcheck( arr, sub, len ) do {} while (0)
	20	#define subchecku( arr, sub, len ) do {} while (0)
[fee4436]	21	#endif
[c7625e0]	22
[ad24245]	23	//
	24	// The `array` macro is the public interface.
	25	// It computes the type of a dense (trivially strided) array.
	26	// All user-declared objects are dense arrays.
[c7625e0]	27	//
[ad24245]	28	// The `arpk` (ARray with PacKing info explicit) type is, generally, a slice with _any_ striding.
	29	// This type is meant for internal use.
	30	// CFA programmers should not instantiate it directly, nor access its field.
	31	// CFA programmers should call ?[?] on it.
	32	// Yet user-given `array(stuff)` expands to `arpk(stuff')`.
	33	// The comments here explain the resulting internals.
	34	//
	35	// Just as a plain-C "multidimensional" array is really array-of-array-of-...,
	36	// so does arpk generally show up as arpk-of-arpk-of...
	37	//
	38	// In the example of `array(float, 3, 4, 5) a;`,
	39	// `typeof(a)` is an `arpk` instantiation.
	40	// These comments explain _its_ arguments, i.e. those of the topmost `arpk` level.
	41	//
	42	// [N] : the number of elements in `a`; 3 in the example
	43	// S : carries the stride size (distance in bytes between &myA[0] and &myA[1]), in sizeof(S);
	44	// same as Timmed when striding is trivial, same as Timmed in the example
	45	// Timmed : (T-immediate) the inner type; conceptually, `typeof(a)` is "arpk of Timmed";
	46	// array(float, 4, 5) in the example
	47	// Tbase : (T-base) the deepest element type that is not arpk; float in the example
[c7625e0]	48	//
[63f42a8]	49	forall( [N], S & \| sized(S), Timmed &, Tbase & ) {
[1bb0170]	50	//
[b8e047a]	51	// Single-dim array struct (with explicit packing and atom)
[1bb0170]	52	//
	53	struct arpk {
		// N slots of type S; the subscript operators below return a slot viewed as Timmed.
	54	S strides[N];
	55	};
	56
	57	// About the choice of integral types offered as subscript overloads:
	58	// Intent is to cover these use cases:
	59	// a[0] // i : zero_t
	60	// a[1] // i : one_t
	61	// a[2] // i : int
	62	// float foo( ptrdiff_t i ) { return a[i]; } // i : ptrdiff_t
	63	// float foo( size_t i ) { return a[i]; } // i : size_t
	64	// forall( [N] ) ... for( i; N ) { total += a[i]; } // i : typeof( sizeof(42) )
	65	// for( i; 5 ) { total += a[i]; } // i : int
	66	//
	67	// It gets complicated by:
	68	// - CFA does overloading on concrete types, like int and unsigned int, not on typedefed
	69	// types like size_t. So trying to overload on ptrdiff_t vs int works in 64-bit mode
	70	// but not in 32-bit mode.
	71	//
[0210a543]	72	// cfa -m32 (and gcc) cfa -m64 (and gcc)
	73	// ptrdiff_t int long int
	74	// size_t unsigned int unsigned long int
	75	// typeof( sizeof(42) ) unsigned int unsigned long int
	76	// int int int
[1bb0170]	77	//
	78	// So the solution must support types {zero_t, one_t, int, unsigned int, long int, unsigned long int}
	79	//
	80	// The solution cannot rely on implicit conversions (e.g. just have one overload for ptrdiff_t)
	81	// because assertion satisfaction requires types to match exactly. Both higher-dimensional
	82	// subscripting and operations on slices use asserted subscript operators. The test case
[0210a543]	83	// array-collections/array-sbscr-types covers the combinations. Mike believes that commenting out
[1bb0170]	84	// any of the current overloads leads to one of those cases failing, either on 64- or 32-bit.
	85	// Mike is open to being shown a smaller set of overloads that still passes the test.
	86
[0210a543]	87
		// Subscript by the literal 0 (zero_t): checks 0 against N in debug builds, returns element 0.
[b8e047a]	88	static inline Timmed & ?[?]( arpk( N, S, Timmed, Tbase ) & a, zero_t ) {
[8ee211d]	89	subcheck( a, 0L, N );
[b8e047a]	90	return (Timmed &)a.strides[0];
[1bb0170]	91	}
	92
		// Subscript by the literal 1 (one_t): checks 1 against N in debug builds, returns element 1.
[b8e047a]	93	static inline Timmed & ?[?]( arpk( N, S, Timmed, Tbase ) & a, one_t ) {
[8ee211d]	94	subcheck( a, 1L, N );
[b8e047a]	95	return (Timmed &)a.strides[1];
[1bb0170]	96	}
	97
		// Subscript by int.  The index is widened to long int for the signed bound check.
[b8e047a]	98	static inline Timmed & ?[?]( arpk( N, S, Timmed, Tbase ) & a, int i ) {
[8ee211d]	99	subcheck( a, (long int)i, N );
[b8e047a]	100	return (Timmed &)a.strides[i];
[1bb0170]	101	}
	102
		// const variant of the int subscript.  The cast names Timmed &, and the declared
		// return type (const Timmed &) is what the caller sees; the same holds for every
		// const overload below.
		// NOTE(review): no const variants are declared here for zero_t and one_t - confirm intended.
[b8e047a]	103	static inline const Timmed & ?[?]( const arpk( N, S, Timmed, Tbase ) & a, int i ) {
[8ee211d]	104	subcheck( a, (long int)i, N );
[b8e047a]	105	return (Timmed &)a.strides[i];
[1bb0170]	106	}
	107
		// Subscript by unsigned int.  The index is widened to unsigned long int and only
		// the upper bound is checked (subchecku).
[b8e047a]	108	static inline Timmed & ?[?]( arpk( N, S, Timmed, Tbase ) & a, unsigned int i ) {
[8ee211d]	109	subchecku( a, (unsigned long int)i, N );
[b8e047a]	110	return (Timmed &)a.strides[i];
[1bb0170]	111	}
	112
		// const variant of the unsigned int subscript.
[b8e047a]	113	static inline const Timmed & ?[?]( const arpk( N, S, Timmed, Tbase ) & a, unsigned int i ) {
[8ee211d]	114	subchecku( a, (unsigned long int)i, N );
[b8e047a]	115	return (Timmed &)a.strides[i];
[1bb0170]	116	}
	117
		// Subscript by long int; signed bound check, no widening needed.
[b8e047a]	118	static inline Timmed & ?[?]( arpk( N, S, Timmed, Tbase ) & a, long int i ) {
[8ee211d]	119	subcheck( a, i, N );
[b8e047a]	120	return (Timmed &)a.strides[i];
[1bb0170]	121	}
	122
		// const variant of the long int subscript.
[b8e047a]	123	static inline const Timmed & ?[?]( const arpk( N, S, Timmed, Tbase ) & a, long int i ) {
[8ee211d]	124	subcheck( a, i, N );
[b8e047a]	125	return (Timmed &)a.strides[i];
[1bb0170]	126	}
	127
		// Subscript by unsigned long int; upper-bound check only.
[b8e047a]	128	static inline Timmed & ?[?]( arpk( N, S, Timmed, Tbase ) & a, unsigned long int i ) {
[8ee211d]	129	subchecku( a, i, N );
[b8e047a]	130	return (Timmed &)a.strides[i];
[1bb0170]	131	}
	132
		// const variant of the unsigned long int subscript.
[b8e047a]	133	static inline const Timmed & ?[?]( const arpk( N, S, Timmed, Tbase ) & a, unsigned long int i ) {
[8ee211d]	134	subchecku( a, i, N );
[b8e047a]	135	return (Timmed &)a.strides[i];
[1bb0170]	136	}
	137
		// Number of elements in this (outermost) dimension: the type-level N.
		// The array argument is unnamed and unused; it only selects the instantiation.
[ee70ff5]	138	static inline size_t len( arpk( N, S, Timmed, Tbase ) & ) {
[1bb0170]	139	return N;
	140	}
	141
		// Empty function whose signature pairs this arpk type with its dimension N via tags.
		// The `ar` macro at the end of this file lists a matching __taglen among its assertions.
[b8e047a]	142	static inline void __taglen( tag(arpk( N, S, Timmed, Tbase )), tag(N) ) {}
[cfbc56ec]	143	}
[a5e26821]	144
[cfbc56ec]	145	// RAII pattern has workarounds for
	146	// - Trac 226: Simplest handling would be, require immediate element to be otype, let autogen
[1bb0170]	147	// raii happen. Performance on even a couple dimensions is unacceptable because of exponential
	148	// thunk creation: ?{}() needs all four otype funcs from next level, so does ^?{}(), so do the
	149	// other two. This solution offers ?{}() that needs only ?{}(), and similar for ^?{}.
[cfbc56ec]	150
[1665ee5]	151	// skip initializing elements
	152	// array(float, 5) x = { delay_init };
[cdf7d43]	153	enum () delay_init_t { delay_init };
		// Constructor selected by passing delay_init: it runs no element constructor.
		// It declares a local, empty constructor for the S[N] storage and calls that one
		// on `strides`, so nothing is done to the elements.
[1665ee5]	154	forall( [N], S & \| sized(S), Timmed &, Tbase & )
	155	static inline void ?{}( arpk( N, S, Timmed, Tbase ) & this, delay_init_t ) {
[1bb0170]	156	void ?{}( S (&)[N] ) {}
	157	?{}(this.strides);
[1665ee5]	158	}
[cfbc56ec]	159
[1665ee5]	160	// call default ctor on elements
	161	// array(float, 5) x;
	162	forall( [N], S & \| sized(S), Timmed &, Tbase & \| { void ?{}( Timmed & ); } )
	163	static inline void ?{}( arpk( N, S, Timmed, Tbase ) & this ) {
	164	?{}( this, delay_init );
[b8e047a]	165	for (i; N) ?{}( (Timmed &)this.strides[i] );
[cfbc56ec]	166	}
	167
	168	forall( [N], S & \| sized(S), Timmed &, Tbase & \| { void ^?{}( Timmed & ); } )
		// Destructor.  Requires only a destructor for the immediate element type Timmed.
		// A local, empty destructor for the S[N] storage is declared and called on `strides`;
		// the elements themselves are then destroyed explicitly by the loop.
[b8e047a]	169	static inline void ^?{}( arpk( N, S, Timmed, Tbase ) & this ) {
[1bb0170]	170	void ^?{}( S (&)[N] ) {}
	171	^?{}(this.strides);
[cfbc56ec]	172
		// Index N-i-1 walks the slots from last to first, i.e. the reverse of the
		// order used by the default constructor.
[1bb0170]	173	for (i; N ) {
[b8e047a]	174	^?{}( (Timmed &)this.strides[N-i-1] );
[1bb0170]	175	}
[c7625e0]	176	}
	177
[1665ee5]	178
[c7625e0]	179	//
	180	// Sugar for declaring array structure instances
	181	//
	182
[cfbc56ec]	183	forall( Te * )
		// mkar_ base case: given only the element tag, the result type is the element type Te.
		// The body is empty; in this file mkar_ appears only inside typeof (see `array`, `zarray`).
[9fa538c]	184	static inline Te mkar_( tag(Te) ) {}
[c7625e0]	185
		// mkar_ recursive case: consumes the leading dimension tag(N).
		// Trslt is the type that the asserted mkar_ yields for the remaining tags (ZTags);
		// the result is an arpk of N elements whose stride type and immediate element type
		// are both Trslt, with Tatom as the base type.
[b9dae14c]	186	forall( [N], ZTags ... , Trslt &, Tatom & \| { Trslt mkar_( tag(Tatom), ZTags ); } )
[b8e047a]	187	static inline arpk( N, Trslt, Trslt, Tatom) mkar_( tag(Tatom), tag(N), ZTags ) {}
[c7625e0]	188
	189	// based on https://stackoverflow.com/questions/1872220/is-it-possible-to-iterate-over-arguments-in-variadic-macros
	190
[1bb0170]	191	// Make a FOREACH macro
	192	#define FE_0(WHAT)
		// FE_k(WHAT, x1, ..., xk) expands to WHAT(x1) WHAT(x2) ... WHAT(xk):
		// each level applies WHAT to its first argument and hands the rest to FE_(k-1).
	193	#define FE_1(WHAT, X) WHAT(X)
	194	#define FE_2(WHAT, X, ...) WHAT(X)FE_1(WHAT, __VA_ARGS__)
	195	#define FE_3(WHAT, X, ...) WHAT(X)FE_2(WHAT, __VA_ARGS__)
	196	#define FE_4(WHAT, X, ...) WHAT(X)FE_3(WHAT, __VA_ARGS__)
	197	#define FE_5(WHAT, X, ...) WHAT(X)FE_4(WHAT, __VA_ARGS__)
	198	//... repeat as needed
[c7625e0]	199
		// GET_MACRO yields its 7th argument.  FOR_EACH puts the caller's arguments ahead of the
		// FE_* names, so with k arguments (k = 1..5) the 7th position holds FE_k.
		// Only FE_0..FE_5 exist, so lists longer than 5 items are not handled.
[1bb0170]	200	#define GET_MACRO(_0,_1,_2,_3,_4,_5,NAME,...) NAME
	201	#define FOR_EACH(action,...) \
	202	GET_MACRO(_0,__VA_ARGS__,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0)(action,__VA_ARGS__)
[c7625e0]	203
		// array( TE, d1, ..., dk ): the type of a mkar_ call given ttag(TE) followed by
		// ", ttag(di)" for every dimension argument.
	204	#define COMMA_ttag(X) , ttag(X)
	205	#define array( TE, ...) typeof( mkar_( ttag(TE) FOR_EACH( COMMA_ttag, __VA_ARGS__ ) ) )
	206
		// zarray: same construction as array, but each dimension is wrapped with ztag.
	207	#define COMMA_ztag(X) , ztag(X)
	208	#define zarray( TE, ...) typeof( mkar_( ttag(TE) FOR_EACH( COMMA_ztag, __VA_ARGS__ ) ) )
	209
	210	//
	211	// Sugar for multidimensional indexing
	212	//
	213
	214	// Core -[[-,-,-]] operator
	215
[63a4b92]	216	#ifdef TRY_BROKEN_DESIRED_MD_SUBSCRIPT
	217
[c7625e0]	218	// Desired form. One definition with recursion on IxBC (worked until Jan 2021, see trac #__TODO__)
	219
[63a4b92]	220	forall( TA &, TB &, TC &, IxAB, IxBC ... \| { TB & ?[?]( TA &, IxAB ); TC & ?[?]( TB &, IxBC ); } )
		// Compiled only when TRY_BROKEN_DESIRED_MD_SUBSCRIPT is defined.
		// Applies TA's asserted subscript to the first index, then TB's asserted subscript
		// to all remaining indices (IxBC is a parameter pack).
[9fa538c]	221	static inline TC & ?[?]( TA & this, IxAB ab, IxBC bc ) {
[1bb0170]	222	return this[ab][bc];
[c7625e0]	223	}
	224
[d1abc63c]	225	#else
[c7625e0]	226
[63a4b92]	227	// Workaround form. Listing all possibilities up to 4 dims.
[c7625e0]	228
[63a4b92]	229	forall( TA &, TB &, TC &, IxAB_0, IxBC \| { TB & ?[?]( TA &, IxAB_0 ); TC & ?[?]( TB &, IxBC ); } )
		// Two indices: TA's asserted one-index subscript, then TB's asserted one-index subscript.
[9fa538c]	230	static inline TC & ?[?]( TA & this, IxAB_0 ab, IxBC bc ) {
[1bb0170]	231	return this[ab][bc];
[c7625e0]	232	}
	233
		// Three indices: the first two go to TA's asserted two-index subscript,
		// the last one to TB's asserted one-index subscript.
[63a4b92]	234	forall( TA &, TB &, TC &, IxAB_0, IxAB_1, IxBC \| { TB & ?[?]( TA &, IxAB_0, IxAB_1 ); TC & ?[?]( TB &, IxBC ); } )
[9fa538c]	235	static inline TC & ?[?]( TA & this, IxAB_0 ab0, IxAB_1 ab1, IxBC bc ) {
[1bb0170]	236	return this[[ab0,ab1]][bc];
[63a4b92]	237	}
	238
		// Four indices: the first three go to TA's asserted three-index subscript,
		// the last one to TB's asserted one-index subscript.
	239	forall( TA &, TB &, TC &, IxAB_0, IxAB_1, IxAB_2, IxBC \| { TB & ?[?]( TA &, IxAB_0, IxAB_1, IxAB_2 ); TC & ?[?]( TB &, IxBC ); } )
[9fa538c]	240	static inline TC & ?[?]( TA & this, IxAB_0 ab0, IxAB_1 ab1, IxAB_2 ab2, IxBC bc ) {
[1bb0170]	241	return this[[ab0,ab1,ab2]][bc];
[63a4b92]	242	}
	243
[eb0d9b7]	244	// Further form of -[-,-,-] that avoids using the trait system.
	245	// Above overloads work for any type with (recursively valid) subscript operator,
	246	// provided said subscript is passed as an assertion.
	247	// Below works only on arpk variations but never passes its subscript through an assertion.
	248	//
	249	// When arpk implements the trait used above,
	250	// the critical assertion is backed by a nontrivial thunk.
	251	// There is no "thunk problem" (lifetime) issue, when used as shown in the test suite.
	252	// But the optimizer has shown difficulty removing these thunks in cases where "it should,"
	253	// i.e. when all user code is in one compilation unit.
	254	// Not that every attempt at removing such a thunk fails; cases have been found going both ways.
	255	// Cases have been found with unnecessary bound-checks removed successfully,
	256	// on user code written against the overloads below,
	257	// but where these bound checks (which occur within `call`ed thunks) are not removed,
	258	// on user code written against the overloads above.
	259	//
	260	// The overloads below provide specializations of the above
	261	// that are a little harder to use than the ones above,
	262	// but where array API erasure has been seen to be more effective.
	263	// Note that the style below does not appeal to a case where thunk inlining is more effective;
	264	// rather, it simply does not rely on thunks in the first place.
	265	//
	266	// Both usage styles are shown in test array-md-sbscr-cases#numSubscrTypeCompatibility,
	267	// with the more general one above being "high abstraction,"
	268	// and the more performant one below being "mid abstraction" and "low abstraction."
	269	//
	270	// A breadth of index types is not given here (providing -[size_t,size_t,...] only)
	271	// because these declarations are not feeding a trait, so safe implicit arithmetic conversion kicks in.
	272	// Even so, there may still be an un-met need for accepting
	273	// either ptrdiff_t or size_t (signed or unsigned)
	274	// because Mike has seen the optimizer resist removing bound checks when sign-conversion is in play.
	275	// "Only size_t" is meeting today's need
	276	// and no solution is known that avoids 2^D overloads for D dimensions
	277	// while offering multiple subscript types and staying assertion-free.
	278	//
	279	// This approach, of avoiding traits entirely, is likely incompatible with the original desire
	280	// to have one recursive multidimensional subscript operator (TRY_BROKEN_DESIRED_MD_SUBSCRIPT).
	281	// To make a single declaration work,
	282	// we would probably have to get better at coaxing the optimizer into inlining thunks.
	283
	284	forall( [N2], S2, [N1], S1, Timmed1, Tbase )
		// Two-level arpk, indices given as size_t: chains two single-index subscripts.
		// No assertions are declared, so no subscript is passed in as an assertion argument.
	285	static inline Timmed1 & ?[?]( arpk( N2, S2, arpk( N1, S1, Timmed1, Tbase ), Tbase ) & this, size_t ix2, size_t ix1 ) {
	286	return this[ix2][ix1];
	287	}
	288
		// Three-level arpk: chains three single-index subscripts, outermost index first.
		// NOTE(review): S1 is declared as `S1*` here but as plain `S1` in the two- and
		// four-level overloads of this group - confirm the difference is intended.
	289	forall( [N3], S3, [N2], S2, [N1], S1*, Timmed1, Tbase )
	290	static inline Timmed1 & ?[?]( arpk( N3, S3, arpk( N2, S2, arpk( N1, S1, Timmed1, Tbase ), Tbase ), Tbase ) & this, size_t ix3, size_t ix2, size_t ix1 ) {
	291	return this[ix3][ix2][ix1];
	292	}
	293
		// Four-level arpk: chains four single-index subscripts, outermost index first.
	294	forall( [N4], S4, [N3], S3, [N2], S2, [N1], S1, Timmed1, Tbase )
	295	static inline Timmed1 & ?[?]( arpk( N4, S4, arpk( N3, S3, arpk( N2, S2, arpk( N1, S1, Timmed1, Tbase ), Tbase ), Tbase ), Tbase ) & this, size_t ix4, size_t ix3, size_t ix2, size_t ix1 ) {
	296	return this[ix4][ix3][ix2][ix1];
	297	}
	298
	299
	300
[63a4b92]	301	#endif
	302
[997324c]	303	// Available for users to work around Trac #265
	304	// If `a[...0...]` isn't working, try `a[...ix0...]` instead.
[a5e26821]	305
[997324c]	306	#define ix0 ((ptrdiff_t)0) // a zero index typed as ptrdiff_t rather than the literal 0
[a5e26821]	307
	308
	309
[c7625e0]	310	//
	311	// Rotation
	312	//
	313
	314	// Base
[63f42a8]	315	forall( [Nq], Sq & \| sized(Sq), Tbase & )
		// Base case: the fourth tag names Tbase itself.
		// The result is the tag of a one-level arpk with Nq elements and stride type Sq,
		// whose immediate element type and base type are both Tbase.
		// tag is an empty struct, so the returned value carries no data.
[b8e047a]	316	static inline tag(arpk( Nq, Sq, Tbase, Tbase )) enq_( tag(Tbase ), tag(Nq), tag(Sq), tag(Tbase ) ) {
	317	tag(arpk( Nq, Sq, Tbase, Tbase )) ret;
[1bb0170]	318	return ret;
[6448f7d]	319	}
[c7625e0]	320
	321	// Rec
[b8e047a]	322	forall( [Nq], Sq & \| sized(Sq), [N], S & \| sized(S), recq &, recr &, Tbase & \| { tag(recr) enq_( tag(Tbase), tag(Nq), tag(Sq), tag(recq) ); } )
		// Recursive case: the fourth tag names arpk( N, S, recq, Tbase ).
		// The result keeps this level's N and S and replaces the inner type recq with recr,
		// where recr is whatever the asserted enq_ yields for recq with the same Nq and Sq.
	323	static inline tag(arpk( N, S, recr, Tbase )) enq_( tag(Tbase ), tag(Nq), tag(Sq), tag(arpk( N, S, recq, Tbase )) ) {
	324	tag(arpk( N, S, recr, Tbase )) ret;
[1bb0170]	325	return ret;
[6448f7d]	326	}
[c7625e0]	327
	328	// Wrapper
[058ece2]	329	extern struct all_t {} all;
		// a[all]: returns `this` viewed as type `result`, where `result` is the type that the
		// asserted enq_ yields from this array's N, S and immediate element type Te.
		// The body is only a reference cast; it reads and copies no element.
[b8e047a]	330	forall( [N], S & \| sized(S), Te &, result &, Tbase & \| { tag(result) enq_( tag(Tbase), tag(N), tag(S), tag(Te) ); } )
	331	static inline result & ?[?]( arpk( N, S, Te, Tbase ) & this, all_t ) {
[1bb0170]	332	return (result&) this;
[c7625e0]	333	}
	334
	335	//
	336	// Trait of array or slice
	337	//
	338
[a5e26821]	339	// desired:
[ee70ff5]	340	// forall( A &, Tv &, [N] )
[7882c58]	341	// trait ar {
[ee70ff5]	342	// Tv& ?[?]( A &, zero_t );
	343	// Tv& ?[?]( A &, one_t );
	344	// Tv& ?[?]( A &, int );
[1bb0170]	345	// ...
[ee70ff5]	346	// size_t len( A & );
[1bb0170]	347	// void __taglen( tag(A), tag(N) );
[a5e26821]	348	// };
	349
	350	// working around N's not being accepted as arguments to traits
		// ar( A, Tv, N ) expands to a braced list of assertions:
		// - subscript on A by zero_t, one_t, int, unsigned int, long int and unsigned long int,
		//   each returning Tv &;
		// - len( A & ) returning size_t;
		// - __taglen pairing tag(A) with tag(N).
		// Only non-const subscripts are listed.
	351
[1bb0170]	352	#define ar( A, Tv, N ) { \
[ee70ff5]	353	Tv& ?[?]( A &, zero_t ); \
	354	Tv& ?[?]( A &, one_t ); \
	355	Tv& ?[?]( A &, int ); \
	356	Tv& ?[?]( A &, unsigned int ); \
	357	Tv& ?[?]( A &, long int ); \
	358	Tv& ?[?]( A &, unsigned long int ); \
	359	size_t len( A & ); \
[1bb0170]	360	void __taglen( tag(A), tag(N) ); \
[a5e26821]	361	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format