diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 8ffe5b0..ac2fb57 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -114,7 +114,9 @@ jobs: path: ./wheelhouse/*.whl name: wheels-${{ matrix.os }} - # disabling QBLAS optimization for windows due to incompatibility with MSVC + # QBLAS is auto-disabled on Windows by meson.build (it uses GCC/POSIX-only + # constructs that MSVC does not support); the wheel falls back to the + # naive matmul kernel. build_wheels_windows: name: Build wheels on Windows runs-on: windows-latest @@ -153,9 +155,6 @@ jobs: CIBW_BUILD_VERBOSITY: "3" DISTUTILS_USE_SDK: "1" MSSdk: "1" - CIBW_ENVIRONMENT: > - CFLAGS="/DDISABLE_QUADBLAS $CFLAGS" - CXXFLAGS="/DDISABLE_QUADBLAS $CXXFLAGS" CIBW_REPAIR_WHEEL_COMMAND: 'delvewheel repair -w {dest_dir} {wheel} --add-path C:\sleef\bin' CIBW_TEST_COMMAND_WINDOWS: pip install numpy && pip install --no-deps {wheel} && pip install pytest pytest-run-parallel && pytest -s {project}/tests CIBW_TEST_EXTRAS: test diff --git a/.github/workflows/test_old_cpu.yml b/.github/workflows/test_old_cpu.yml index 476a876..d292a69 100644 --- a/.github/workflows/test_old_cpu.yml +++ b/.github/workflows/test_old_cpu.yml @@ -1,8 +1,4 @@ -name: Test on Older CPUs (x86_64-v2) - -# This workflow tests numpy-quaddtype on older x86 CPUs using Intel SDE. -# It ensures compatibility with x86_64-v2 baseline CPUs (e.g., Sandy Bridge) -# that don't have AVX2/FMA support. +name: Test Older CPUs + Build Options on: push: @@ -13,14 +9,21 @@ on: jobs: test_old_cpu: - name: Test on ${{ matrix.cpu[1] }} + name: Test ${{ matrix.config.name }} runs-on: ubuntu-24.04 strategy: fail-fast: false matrix: - cpu: - - ['snb', 'Sandy Bridge (x86_64-v2)'] - - ['hsw', 'Haswell (x86_64-v3)'] + config: + - name: "Sandy Bridge (x86_64-v2)" + sde_cpu: "snb" + meson_args: "-Csetup-args=-Ddisable_fma=true" + - name: "Haswell (x86_64-v3)" + sde_cpu: "hsw" + meson_args: "" + - name: "Haswell, no QBLAS" + sde_cpu: "hsw" + meson_args: "-Csetup-args=-Ddisable_quadblas=true" steps: - uses: actions/checkout@v6 with: @@ -51,27 +54,21 @@ jobs: env: LDFLAGS: "-fopenmp" run: | - # For Sandy Bridge (x86-64-v2), we need to disable FMA code paths - # since FMA instructions are not available on that microarchitecture - if [ "${{ matrix.cpu[0] }}" = "snb" ]; then - pip install .[test] --no-build-isolation -v -Csetup-args=-Ddisable_fma=true - else - pip install .[test] --no-build-isolation -v - fi + pip install .[test] --no-build-isolation -v ${{ matrix.config.meson_args }} - - name: Test import on ${{ matrix.cpu[1] }} + - name: Test import on ${{ matrix.config.name }} run: | - echo "Testing basic import on ${{ matrix.cpu[1] }}..." - sde -${{ matrix.cpu[0] }} -- python -c " + echo "Testing basic import on ${{ matrix.config.name }}..." + sde -${{ matrix.config.sde_cpu }} -- python -c " import numpy as np print('NumPy version:', np.__version__) from numpy_quaddtype import QuadPrecDType print('QuadPrecDType imported successfully!') arr = np.zeros(3, dtype=QuadPrecDType()) print('Created quad array:', arr) - print('SUCCESS: Works on ${{ matrix.cpu[1] }}!') + print('SUCCESS: Works on ${{ matrix.config.name }}!') " - - name: Run tests on ${{ matrix.cpu[1] }} + - name: Run tests on ${{ matrix.config.name }} run: | - sde -${{ matrix.cpu[0] }} -- python -m pytest tests/ -v --tb=short -v -s + sde -${{ matrix.config.sde_cpu }} -- python -m pytest tests/ -v --tb=short -v -s diff --git a/.gitignore b/.gitignore index 6a64dd3..f5cec6c 100644 --- a/.gitignore +++ b/.gitignore @@ -146,3 +146,4 @@ compile_commands.json # docs /docs/_build/ +build_log.txt diff --git a/meson.build b/meson.build index d0aa2d8..5bc11f4 100644 --- a/meson.build +++ b/meson.build @@ -15,10 +15,7 @@ if is_windows add_project_arguments('-DWIN32', '-D_WINDOWS', language : ['c', 'cpp']) endif -qblas_dep = dependency('qblas', fallback: ['qblas', 'qblas_dep']) -# Try to find SLEEF system-wide first, fall back to subproject if not found -# Required SLEEF version (must match sleef.wrap revision) required_sleef_version = '3.9.0' # Don't use fallback here - we need to call subproject() explicitly later with disable_fma option sleef_dep = dependency('sleef', version: '>=' + required_sleef_version, required: false) @@ -30,7 +27,7 @@ if sleef_dep.found() and sleef_dep.version().startswith(required_sleef_version) # SLEEF found system-wide - verify quad-precision support cpp = meson.get_compiler('cpp') sleefquad_lib = cpp.find_library('sleefquad', required: false) - + if sleefquad_lib.found() sleefquad_test_code = ''' #include @@ -48,7 +45,7 @@ if sleef_dep.found() and sleef_dep.version().startswith(required_sleef_version) dependencies: [sleef_dep, sleefquad_lib], name: 'SLEEF quad-precision support' ) - + if quad_works sleefquad_dep = declare_dependency( dependencies: [sleef_dep, sleefquad_lib] @@ -80,6 +77,23 @@ else message('Proceeding with vendored SLEEF subproject instead') endif +# QBLAS does not build under MSVC (GCC-only flags, POSIX-only APIs, GCC +# built-ins for CPUID); force-disable it on Windows. Users on other +# platforms can opt out via -Ddisable_quadblas=true to fall back to the +# naive matmul kernel. +disable_quadblas = is_windows or get_option('disable_quadblas') +if disable_quadblas + if is_windows + message('QBLAS disabled (Windows / MSVC) - using naive matmul kernel') + else + message('QBLAS disabled by user option - using naive matmul kernel') + endif + add_project_arguments('-DDISABLE_QUADBLAS', language: ['c', 'cpp']) + qblas_dep = declare_dependency() +else + qblas_dep = dependency('qblas', fallback: ['qblas', 'qblas_dep']) +endif + incdir_numpy = run_command(py, ['-c', 'import numpy; print(numpy.get_include())'], check : true @@ -101,12 +115,6 @@ npymath_lib = c.find_library('npymath', dirs: npymath_path) dependencies = [py_dep, qblas_dep, sleef_dep, sleefquad_dep, npymath_lib] -# Add OpenMP dependency (optional, for threading) -openmp_dep = dependency('openmp', required: false, static: false) -if openmp_dep.found() - dependencies += openmp_dep -endif - # compiler flags for QBLAS compatibility if not is_windows # QBLAS requires extended numeric literals for Q suffix support diff --git a/meson.options b/meson.options index d871c14..abdd203 100644 --- a/meson.options +++ b/meson.options @@ -1,3 +1,9 @@ option('disable_fma', type: 'boolean', value: false, description: 'Disable FMA (Fused Multiply-Add) code paths' + - 'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.') \ No newline at end of file + 'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.') + +option('disable_quadblas', type: 'boolean', value: false, + description: 'Skip the QBLAS subproject and fall back to naive ' + + 'matmul kernels. Auto-enabled on Windows because ' + + 'QBLAS uses GCC/POSIX-only constructs that do not ' + + 'build under MSVC.') diff --git a/src/csrc/quadblas_interface.cpp b/src/csrc/quadblas_interface.cpp index 9d8c762..f2d27f2 100644 --- a/src/csrc/quadblas_interface.cpp +++ b/src/csrc/quadblas_interface.cpp @@ -1,116 +1,74 @@ +// numpy-quaddtype shim around QBLAS. + #include "quadblas_interface.h" #include #include #ifndef DISABLE_QUADBLAS -#include "quadblas/quadblas.hpp" -#endif // DISABLE_QUADBLAS +#include +#endif extern "C" { +#ifndef DISABLE_QUADBLAS -#ifndef DISABLE_QUADBLAS +static inline QBLAS_LAYOUT to_layout(char c) { + return (c == 'C' || c == 'c') ? QblasColMajor : QblasRowMajor; +} +static inline QBLAS_TRANSPOSE to_trans(char c) { + if (c == 'T' || c == 't') return QblasTrans; + if (c == 'C' || c == 'c') return QblasConjTrans; + return QblasNoTrans; +} int -qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result) +qblas_dot(size_t n, Sleef_quad *x, size_t incx, + Sleef_quad *y, size_t incy, Sleef_quad *result) { if (!x || !y || !result || n == 0) { return -1; } - - try { - *result = QuadBLAS::dot(n, x, incx, y, incy); - return 0; - } - catch (...) { - return -1; - } + *result = cblas_qdot((int)n, x, (int)incx, y, (int)incy); + return 0; } int -qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A, - size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy) +qblas_gemv(char layout, char trans, size_t m, size_t n, + Sleef_quad *alpha, Sleef_quad *A, size_t lda, + Sleef_quad *x, size_t incx, + Sleef_quad *beta, Sleef_quad *y, size_t incy) { if (!alpha || !A || !x || !beta || !y || m == 0 || n == 0) { return -1; } - - try { - // Convert layout - QuadBLAS::Layout qblas_layout; - if (layout == 'R' || layout == 'r') { - qblas_layout = QuadBLAS::Layout::RowMajor; - } - else if (layout == 'C' || layout == 'c') { - qblas_layout = QuadBLAS::Layout::ColMajor; - } - else { - return -1; // Invalid layout - } - - // Handle transpose (swap dimensions for transpose) - size_t actual_m = m, actual_n = n; - if (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c') { - std::swap(actual_m, actual_n); - // For transpose, we need to adjust the layout - if (qblas_layout == QuadBLAS::Layout::RowMajor) { - qblas_layout = QuadBLAS::Layout::ColMajor; - } - else { - qblas_layout = QuadBLAS::Layout::RowMajor; - } - } - - // Call QBLAS GEMV - QuadBLAS::gemv(qblas_layout, actual_m, actual_n, *alpha, A, lda, x, incx, *beta, y, incy); - - return 0; - } - catch (...) { - return -1; - } + cblas_qgemv(to_layout(layout), to_trans(trans), + (int)m, (int)n, + *alpha, A, (int)lda, + x, (int)incx, + *beta, y, (int)incy); + return 0; } int -qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha, - Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C, - size_t ldc) +qblas_gemm(char layout, char transa, char transb, + size_t m, size_t n, size_t k, + Sleef_quad *alpha, Sleef_quad *A, size_t lda, + Sleef_quad *B, size_t ldb, + Sleef_quad *beta, Sleef_quad *C, size_t ldc) { if (!alpha || !A || !B || !beta || !C || m == 0 || n == 0 || k == 0) { return -1; } - - try { - QuadBLAS::Layout qblas_layout; - if (layout == 'R' || layout == 'r') { - qblas_layout = QuadBLAS::Layout::RowMajor; - } - else if (layout == 'C' || layout == 'c') { - qblas_layout = QuadBLAS::Layout::ColMajor; - } - else { - return -1; // Invalid layout - } - - // For now, we only support no transpose - // TODO: Implement transpose support if needed - if ((transa != 'N' && transa != 'n') || (transb != 'N' && transb != 'n')) { - return -1; // Transpose not implemented yet - } - - QuadBLAS::gemm(qblas_layout, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc); - - return 0; - } - catch (...) { - return -1; - } + cblas_qgemm(to_layout(layout), to_trans(transa), to_trans(transb), + (int)m, (int)n, (int)k, + *alpha, A, (int)lda, B, (int)ldb, + *beta, C, (int)ldc); + return 0; } int qblas_supports_backend(QuadBackendType backend) { - // QBLAS only supports SLEEF backend return (backend == BACKEND_SLEEF) ? 1 : 0; } @@ -121,113 +79,94 @@ py_quadblas_set_num_threads(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "i", &num_threads)) { return NULL; } - if (num_threads <= 0) { PyErr_SetString(PyExc_ValueError, "Number of threads must be positive"); return NULL; } - - QuadBLAS::set_num_threads(num_threads); + qblas_set_num_threads(num_threads); Py_RETURN_NONE; } PyObject * py_quadblas_get_num_threads(PyObject *self, PyObject *args) { - int num_threads = QuadBLAS::get_num_threads(); - return PyLong_FromLong(num_threads); + return PyLong_FromLong(qblas_get_num_threads()); } PyObject * py_quadblas_get_version(PyObject *self, PyObject *args) { - return PyUnicode_FromString("QuadBLAS 1.0.0 - High Performance Quad Precision BLAS"); + /* qblas_get_version() returns "QBLAS X.Y.Z"; pair it with the + * runtime-detected SIMD tier so callers can confirm what's active. */ + const char *ver = qblas_get_version(); + const char *tier = qblas_get_dispatch_tier(); + char buf[256]; + PyOS_snprintf(buf, sizeof buf, "%s (dispatch: %s)", ver, tier); + return PyUnicode_FromString(buf); } int _quadblas_set_num_threads(int num_threads) { - QuadBLAS::set_num_threads(num_threads); + qblas_set_num_threads(num_threads); return 0; } int _quadblas_get_num_threads(void) { - int num_threads = QuadBLAS::get_num_threads(); - return num_threads; + return qblas_get_num_threads(); } -#else // DISABLE_QUADBLAS - +#else /* DISABLE_QUADBLAS */ int qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result) -{ - return -1; // QBLAS is disabled, dot product not available -} +{ return -1; } int qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A, size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy) -{ - return -1; // QBLAS is disabled, GEMV not available -} +{ return -1; } int qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha, Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C, size_t ldc) -{ - return -1; // QBLAS is disabled, GEMM not available -} +{ return -1; } -int -qblas_supports_backend(QuadBackendType backend) -{ - return -1; // QBLAS is disabled, backend support not available -} +int qblas_supports_backend(QuadBackendType backend) { return -1; } PyObject * py_quadblas_set_num_threads(PyObject *self, PyObject *args) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return NULL; } - PyObject * py_quadblas_get_num_threads(PyObject *self, PyObject *args) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return NULL; } - PyObject * py_quadblas_get_version(PyObject *self, PyObject *args) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return NULL; } -int -_quadblas_set_num_threads(int num_threads) +int _quadblas_set_num_threads(int num_threads) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return -1; } - -int -_quadblas_get_num_threads(void) +int _quadblas_get_num_threads(void) { - // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); return -1; } -#endif // DISABLE_QUADBLAS +#endif /* DISABLE_QUADBLAS */ -} // extern "C" \ No newline at end of file +} /* extern "C" */ diff --git a/subprojects/packagefiles/qblas/meson.build b/subprojects/packagefiles/qblas/meson.build deleted file mode 100644 index b7a972f..0000000 --- a/subprojects/packagefiles/qblas/meson.build +++ /dev/null @@ -1,8 +0,0 @@ -project('qblas', meson_version: '>=1.1') - -qblas_inc = include_directories('include') - -qblas_dep = declare_dependency( - include_directories: qblas_inc, - version: meson.project_version() -) diff --git a/subprojects/qblas.wrap b/subprojects/qblas.wrap index d8c2a89..019cb0f 100644 --- a/subprojects/qblas.wrap +++ b/subprojects/qblas.wrap @@ -1,8 +1,7 @@ [wrap-git] -directory=qblas -url=https://github.com/SwayamInSync/QBLAS.git -revision=42126fd78cbc04e9b031475fe39f4f46eaa51e01 -patch_directory = qblas +directory = qblas +url = https://github.com/SwayamInSync/QBLAS.git +revision = 8deb36b67ae4d2c81dfca2ceac8957deb8f23c9b [provide] qblas = qblas_dep