From b86bd6947c7e556753de4b937d256a2cbf91076b Mon Sep 17 00:00:00 2001
From: Markus Schwarz <markus.schwarz@kit.edu>
Date: Fri, 14 Feb 2025 16:29:22 +0100
Subject: [PATCH] fix for --optimize compile option and ARM chips

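The --optimize compile option added x86-specific instruction-set flags (-mavx2, -mavx, -msse4, -msse3, -msse, -mfma) to the compiler flags. These lead to errors when an ARM-based CPU is used. This fix checks whether platform.machine() reports an ARM CPU and, if so, skips the x86-specific flags. In addition, cupy is now imported only when the GPU compute capability has to be discovered.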
---
 blond/compile.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/blond/compile.py b/blond/compile.py
index 63162b08..822f011a 100644
--- a/blond/compile.py
+++ b/blond/compile.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 import argparse
 import ctypes
 import os
+import platform
 import subprocess
 import sys
 
@@ -211,21 +212,23 @@ def compile_cpp_library(args, cflags, float_flags, libs, cpp_files):
                 # Format the output list
                 stdout = ret.stdout.replace('#define ', '').replace(
                     '__ 1', '').replace('__', '').split('\n')
-                # Add the appropriate vectorization flag (not use avx512)
-                if 'AVX2' in stdout:
-                    cflags += ['-mavx2']
-                elif 'AVX' in stdout:
-                    cflags += ['-mavx']
-                elif 'SSE4_2' in stdout or 'SSE4_1' in stdout:
-                    cflags += ['-msse4']
-                elif 'SSE3' in stdout:
-                    cflags += ['-msse3']
-                else:
-                    cflags += ['-msse']
-
-                # Add FMA if supported
-                if 'FMA' in stdout:
-                    cflags += ['-mfma']
+                # following options exist only on x86 processors
+                if 'arm' not in platform.machine():
+                    # Add the appropriate vectorization flag (not use avx512)
+                    if 'AVX2' in stdout:
+                        cflags += ['-mavx2']
+                    elif 'AVX' in stdout:
+                        cflags += ['-mavx']
+                    elif 'SSE4_2' in stdout or 'SSE4_1' in stdout:
+                        cflags += ['-msse4']
+                    elif 'SSE3' in stdout:
+                        cflags += ['-msse3']
+                    else:
+                        cflags += ['-msse']
+    
+                    # Add FMA if supported
+                    if 'FMA' in stdout:
+                        cflags += ['-mfma']
 
         root, ext = os.path.splitext(args['libname'])
         if not ext:
@@ -312,10 +315,9 @@ def compile_cuda_library(args, nvccflags, float_flags, cuda_files, nvcc):
     # Compile the GPU library
     # print('\n' + ''.join(['='] * 80))
     print('\nCompiling the CUDA library')
-    import cupy as cp
-
     if args['gpu'] == 'discover':
         print('Discovering the device compute capability..')
+        import cupy as cp
 
         dev = cp.cuda.Device(0)
         dev_name = cp.cuda.runtime.getDeviceProperties(dev)['name']
-- 
GitLab