Skip to content
Snippets Groups Projects

Resolve "--optimize compiler option does not work for ARM-based CPUs"

1 file
+ 19
17
Compare changes
  • Side-by-side
  • Inline
+ 19
17
@@ -22,6 +22,7 @@ from __future__ import print_function
@@ -22,6 +22,7 @@ from __future__ import print_function
import argparse
import argparse
import ctypes
import ctypes
import os
import os
 
import platform
import subprocess
import subprocess
import sys
import sys
@@ -211,21 +212,23 @@ def compile_cpp_library(args, cflags, float_flags, libs, cpp_files):
@@ -211,21 +212,23 @@ def compile_cpp_library(args, cflags, float_flags, libs, cpp_files):
# Format the output list
# Format the output list
stdout = ret.stdout.replace('#define ', '').replace(
stdout = ret.stdout.replace('#define ', '').replace(
'__ 1', '').replace('__', '').split('\n')
'__ 1', '').replace('__', '').split('\n')
# Add the appropriate vectorization flag (AVX-512 is deliberately not used)
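# The following options exist only on x86 processors.
# NOTE(review): 64-bit ARM Linux reports platform.machine() == 'aarch64',
# which an 'arm' substring test does not match — confirm this case is covered.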
if 'AVX2' in stdout:
if 'arm' not in platform.machine():
cflags += ['-mavx2']
# Add the appropriate vectorization flag (AVX-512 is deliberately not used)
elif 'AVX' in stdout:
if 'AVX2' in stdout:
cflags += ['-mavx']
cflags += ['-mavx2']
elif 'SSE4_2' in stdout or 'SSE4_1' in stdout:
elif 'AVX' in stdout:
cflags += ['-msse4']
cflags += ['-mavx']
elif 'SSE3' in stdout:
elif 'SSE4_2' in stdout or 'SSE4_1' in stdout:
cflags += ['-msse3']
cflags += ['-msse4']
else:
elif 'SSE3' in stdout:
cflags += ['-msse']
cflags += ['-msse3']
else:
# Add FMA if supported
cflags += ['-msse']
if 'FMA' in stdout:
cflags += ['-mfma']
# Add FMA if supported
 
if 'FMA' in stdout:
 
cflags += ['-mfma']
root, ext = os.path.splitext(args['libname'])
root, ext = os.path.splitext(args['libname'])
if not ext:
if not ext:
@@ -312,10 +315,9 @@ def compile_cuda_library(args, nvccflags, float_flags, cuda_files, nvcc):
@@ -312,10 +315,9 @@ def compile_cuda_library(args, nvccflags, float_flags, cuda_files, nvcc):
# Compile the GPU library
# Compile the GPU library
# print('\n' + ''.join(['='] * 80))
# print('\n' + ''.join(['='] * 80))
print('\nCompiling the CUDA library')
print('\nCompiling the CUDA library')
import cupy as cp
if args['gpu'] == 'discover':
if args['gpu'] == 'discover':
print('Discovering the device compute capability..')
print('Discovering the device compute capability..')
 
import cupy as cp
dev = cp.cuda.Device(0)
dev = cp.cuda.Device(0)
dev_name = cp.cuda.runtime.getDeviceProperties(dev)['name']
dev_name = cp.cuda.runtime.getDeviceProperties(dev)['name']
Loading