Richard Guenther - [PATCH][PING][RFC] Add support for calling external library for vectori (original) (raw)
This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
[PATCH][PING][RFC] Add support for calling external library for vectorizing (on x86_64)
- From: Richard Guenther
- To: gcc-patches at gcc dot gnu dot org
- Date: Thu, 1 Feb 2007 16:26:01 +0100 (CET)
- Subject: [PATCH][PING][RFC] Add support for calling external library for vectorizing (on x86_64)
This adds support for vectorization of intrinsics through an external library (similar to how gfortran got -fblas). To quote the mail from Dec.
I expect the set of libraries supported to grow, but as only ACML provides the "simple" two-value wrappers for sin, cos, etc. it is a natural start. The patch doesn't add automagic linking - just the interface is specified (so OSS implementations are possible, of course the primary target would be ACML itself or an off-gcc libgcc-math).
Use of more of the routines in these kinds of libraries requires changes to the vectorizer infrastructure (support for v4df and v8sf mode interfaces, support for whole-array functions which also Intel MKL supports (sin(n, double*, double*)-style), more idiom recognition to dispatch to the blas/lapack routines, etc.)
I made this a target option - we can move it to common.opt if ppc folks or others want to use it, too, but -march and -mtune are also target-specific, so I just followed that example. A mechanism for automatic linking to the libraries via some configury can be added later (or omitted), likewise selecting a (configurable) default.
How do people feel about such interfacing? I suppose the Cell folks are going to interface in a similar way (but supposedly have a standard library they can use).
Thanks, Richard.
2006-12-10 Richard Guenther rguenther@suse.de
* doc/invoke.texi (-mveclib): Document new target option.
* config/i386/i386.opt (-mveclib): New target option.
* config/i386/i386.c (ix86_veclib_handler): Handler for
vectorization library support.
(override_options): Handle the -mveclib option, initialize
the vectorization library handler.
(ix86_builtin_vectorized_function): As fallback call the
vectorization library handler, if set.
(ix86_veclib_acml): New static function for ACML style
vectorization support.
Index: doc/invoke.texi
*** doc/invoke.texi (revision 119706)
--- doc/invoke.texi (working copy)
*************** Objective-C and Objective-C++ Dialects}.
*** 537,543 ****
-mthreads -mno-align-stringops -minline-all-stringops @gol
-mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol
-m96bit-long-double -mregparm=@var{num} -mx87regparm @gol
! -msseregparm @gol -mstackrealign @gol
-momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol
-mcmodel=@var{code-model} @gol
-m32 -m64 -mlarge-data-threshold=@var{num}}
--- 537,543 ----
-mthreads -mno-align-stringops -minline-all-stringops @gol
-mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol
-m96bit-long-double -mregparm=@var{num} -mx87regparm @gol
! -msseregparm -mveclib=@var{type} -mstackrealign @gol
-momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol
-mcmodel=@var{code-model} @gol
-m32 -m64 -mlarge-data-threshold=@var{num}}
*************** supported architecture, using the approp
*** 9693,9698 ****
--- 9693,9709 ----
the file containing the CPU detection code should be compiled without
these options.
+ @item -mveclib=@var{type}
+ @opindex mveclib
+ Specifies the ABI type to use for vectorizing intrinsics using an
+ external library. Supported types are @code{acml} for the AMD
+ math core library style of interfacing. GCC will currently emit
+ calls to @code{__vrd2_sin}, @code{__vrd2_cos}, @code{__vrd2_exp},
+ @code{__vrd2_log}, @code{__vrd2_log2}, @code{__vrd2_log10},
+ @code{__vrs4_sinf}, @code{__vrs4_cosf}, @code{__vrs4_expf},
+ @code{__vrs4_logf}, @code{__vrs4_log2f}, @code{__vrs4_log10f}
+ and @code{__vrs4_powf} when using this type.
+
@item -mpush-args
@itemx -mno-push-args
@opindex mpush-args
Index: config/i386/i386.opt
*** config/i386/i386.opt (revision 119706)
--- config/i386/i386.opt (working copy)
*************** mtune=
*** 241,245 ****
--- 241,249 ----
Target RejectNegative Joined Var(ix86_tune_string)
Schedule code for given CPU
+ mveclib=
+ Target RejectNegative Joined Var(ix86_veclib_string)
+ Vector library interface to use
+
;; Support Athlon 3Dnow builtins
Mask(3DNOW_A)
Index: config/i386/i386.c
*** config/i386/i386.c (revision 119706) --- config/i386/i386.c (working copy) *************** static void x86_64_elf_unique_section (t *** 1403,1408 **** --- 1403,1412 ---- static section *x86_64_elf_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align) ATTRIBUTE_UNUSED;
+
- /* Vectorization library interface and handlers. */
- tree (*ix86_veclib_handler)(enum built_in_function, tree) = NULL;
- static tree ix86_veclib_acml (enum built_in_function, tree); /* Initialize the GCC target structure. */ #undef TARGET_ATTRIBUTE_TABLE
*************** override_options (void) *** 2199,2204 **** --- 2203,2218 ---- if (!TARGET_80387) target_flags &= ~MASK_FLOAT_RETURNS;
- /* Use external vectorized library in vectorizing intrinsics. */
- if (ix86_veclib_string)
{
if (strcmp (ix86_veclib_string, "acml") == 0)
ix86_veclib_handler = ix86_veclib_acml;
else
error ("unknown vectorization library type (%s) for -mveclib= switch",
ix86_veclib_string);
}
- if ((x86_accumulate_outgoing_args & TUNEMASK) && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) && !optimize_size)
*************** ix86_builtin_vectorized_function (enum b *** 17625,17644 **** case BUILT_IN_SQRT: if (el_mode == DFmode && n == 2) return ix86_builtins[IX86_BUILTIN_SQRTPD]; ! return NULL_TREE;
case BUILT_IN_SQRTF:
if (el_mode == SFmode && n == 4)
return ix86_builtins[IX86_BUILTIN_SQRTPS];
! return NULL_TREE;
default:
;
}
return NULL_TREE;
}
/* Store OPERAND to the memory after reload is completed. This means that we can't easily use assign_stack_local. */ rtx --- 17639,17741 ---- case BUILT_IN_SQRT: if (el_mode == DFmode && n == 2) return ix86_builtins[IX86_BUILTIN_SQRTPD]; ! break;
case BUILT_IN_SQRTF:
if (el_mode == SFmode && n == 4)
return ix86_builtins[IX86_BUILTIN_SQRTPS];
! break;
default:
;
}
/* Dispatch to a handler for a vectorization library. */
if (ix86_veclib_handler)
return (*ix86_veclib_handler)(fn, type);
return NULL_TREE; }
/* Handler for an ACML-style interface to a library with vectorized
   intrinsics.

   FN is the scalar built-in function to be vectorized and TYPE is the
   vector type the vectorized function operates on.

   Returns a freshly built external FUNCTION_DECL whose name has the
   form __vrd2_<name> (two-element DFmode vectors) or __vrs4_<name>
   (four-element SFmode vectors), or NULL_TREE when the target, the
   math flags, FN, or the shape of TYPE is not supported.  */

static tree
ix86_veclib_acml (enum built_in_function fn, tree type)
{
  /* Name template.  The two dots (offsets 4 and 5) are replaced by the
     element kind ('d' or 's') and the number of vector elements ('2'
     or '4'); the scalar function name is appended at offset 7, right
     after the trailing underscore.  The template therefore has to be
     exactly seven characters long, otherwise the patched characters
     overwrite the terminator and the appended name is lost.  */
  char name[20] = "__vr.._";
  tree fntype, new_fndecl, args;
  unsigned arity;
  const char *bname;
  enum machine_mode el_mode;
  int n;

  /* The ACML is 64bits only and suitable for unsafe math only as
     it does not correctly support parts of IEEE with the required
     precision such as denormals.  */
  if (!TARGET_64BIT
      || !flag_unsafe_math_optimizations)
    return NULL_TREE;

  el_mode = TYPE_MODE (TREE_TYPE (type));
  n = TYPE_VECTOR_SUBPARTS (type);

  switch (fn)
    {
    /* Double precision routines: only for vectors of two doubles.  */
    case BUILT_IN_SIN:
    case BUILT_IN_COS:
    case BUILT_IN_EXP:
    case BUILT_IN_LOG:
    case BUILT_IN_LOG2:
    case BUILT_IN_LOG10:
      name[4] = 'd';
      name[5] = '2';
      if (el_mode != DFmode
	  || n != 2)
	return NULL_TREE;
      break;

    /* Single precision routines: only for vectors of four floats.  */
    case BUILT_IN_SINF:
    case BUILT_IN_COSF:
    case BUILT_IN_EXPF:
    case BUILT_IN_POWF:
    case BUILT_IN_LOGF:
    case BUILT_IN_LOG2F:
    case BUILT_IN_LOG10F:
      name[4] = 's';
      name[5] = '4';
      if (el_mode != SFmode
	  || n != 4)
	return NULL_TREE;
      break;

    default:
      return NULL_TREE;
    }

  /* Append the scalar function name.  BNAME + 10 presumably skips a
     leading "__builtin_" prefix (10 characters) of the built-in decl
     name -- TODO confirm against the builtin tables.  Under that
     assumption the longest name accepted by the switch above is
     "log10f", so 7 + 6 + 1 bytes fit comfortably into NAME.  */
  bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));
  sprintf (name + 7, "%s", bname+10);

  /* Count the arguments of the scalar built-in to pick a matching
     signature: exactly one argument gives TYPE (TYPE), anything else
     gives TYPE (TYPE, TYPE).
     NOTE(review): this relies on DECL_ARGUMENTS being populated for
     implicit built-in decls -- confirm.  */
  arity = 0;
  for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
       args = TREE_CHAIN (args))
    arity++;

  if (arity == 1)
    fntype = build_function_type_list (type, type, NULL);
  else
    fntype = build_function_type_list (type, type, type, NULL);

  /* Build a function declaration for the vectorized function.  */
  new_fndecl = build_decl (FUNCTION_DECL, get_identifier (name), fntype);
  TREE_PUBLIC (new_fndecl) = 1;
  DECL_EXTERNAL (new_fndecl) = 1;
  DECL_IS_NOVOPS (new_fndecl) = 1;
  TREE_READONLY (new_fndecl) = 1;

  return new_fndecl;
}
/* Store OPERAND to the memory after reload is completed. This means that we can't easily use assign_stack_local. */ rtx
- Follow-Ups:
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |