- Notifications
You must be signed in to change notification settings - Fork 15.3k
Labels
clang:headers — Headers provided by Clang, e.g. for intrinsics
Description
Here is the current Clang implementation of vec_pack_to_short_fp32:
llvm-project/clang/lib/Headers/altivec.h
Lines 7518 to 7527 in 5f5cf60
static __inline__ vector unsigned short __ATTRS_o_ai
vec_pack_to_short_fp32(vector float __a, vector float __b) {
  vector float __resa = __builtin_vsx_xvcvsphp(__a);
  vector float __resb = __builtin_vsx_xvcvsphp(__b);
#ifdef __LITTLE_ENDIAN__
  return (vector unsigned short)vec_mergee(__resa, __resb);
#else
  return (vector unsigned short)vec_mergeo(__resa, __resb);
#endif
}
Here is the corrected implementation of vec_pack_to_short_fp32:
static __inline__ vector unsigned short __ATTRS_o_ai
vec_pack_to_short_fp32(vector float __a, vector float __b) {
  vector unsigned int __resa = (vector unsigned int)__builtin_vsx_xvcvsphp(__a);
  vector unsigned int __resb = (vector unsigned int)__builtin_vsx_xvcvsphp(__b);
  return vec_pack(__resa, __resb);
}

Here is a test program (which needs to be compiled with the -std=c11 -mcpu=power9 options) that can be used to check the results of the vec_pack_to_short_fp32 operation:
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdalign.h>

#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")
#undef vector
#undef pixel
#undef bool
#include <altivec.h>
#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

inline __attribute__((__always_inline__, __artificial__)) double Float16BitsToDouble(uint16_t f16_bits) {
  double dbl_result;
  __vector unsigned short f16_vect = vec_splats(f16_bits);
  __asm__("xscvhpdp %x0,%x1" : "=wa" (dbl_result) : "wa" (f16_vect));
  return dbl_result;
}

int main(int argc, char** argv) {
  alignas(16) float input_vals[8];
  alignas(16) uint16_t result_vals[8];

  for(int i = 1; i < argc; ) {
    input_vals[0] = strtof(argv[i++], NULL);
    input_vals[1] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[2] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[3] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[4] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[5] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[6] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;
    input_vals[7] = (i < argc) ? strtof(argv[i++], NULL) : 0.0f;

    __vector float src_vect_a = *((const __vector float*)input_vals);
    __vector float src_vect_b = *((const __vector float*)(input_vals + 4));
    __vector unsigned short result_vect = vec_pack_to_short_fp32(src_vect_a, src_vect_b);
    *((__vector unsigned short*)result_vals) = result_vect;

    for(int j = 0; j < 8; j++)
      printf("Float32ToFloat16(%g) = %g\n", input_vals[j], Float16BitsToDouble(result_vals[j]));
  }

  return 0;
}

Here are the results of running the above test program when compiled with GCC 12:
$ ./vsx_vec_pack_to_short_fp32_test_021623_gcc 1.518 2.4447 3.3932 6.4842 -1.4912 -3.3938 -7.532 6.6662
Float32ToFloat16(1.518) = 1.51758
Float32ToFloat16(2.4447) = 2.44531
Float32ToFloat16(3.3932) = 3.39258
Float32ToFloat16(6.4842) = 6.48438
Float32ToFloat16(-1.4912) = -1.49121
Float32ToFloat16(-3.3938) = -3.39453
Float32ToFloat16(-7.532) = -7.53125
Float32ToFloat16(6.6662) = 6.66797

Here are the results of running the above test program when compiled with Clang 15 or Clang 17:
$ ./vsx_vec_pack_to_short_fp32_test_021623_clang 1.518 2.4447 3.3932 6.4842 -1.4912 -3.3938 -7.532 6.6662
Float32ToFloat16(1.518) = 1.51758
Float32ToFloat16(2.4447) = 0
Float32ToFloat16(3.3932) = -1.49121
Float32ToFloat16(6.4842) = 0
Float32ToFloat16(-1.4912) = 3.39258
Float32ToFloat16(-3.3938) = 0
Float32ToFloat16(-7.532) = -7.53125
Float32ToFloat16(6.6662) = 0
Metadata
Assignees
Labels
clang:headers — Headers provided by Clang, e.g. for intrinsics