diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 8ffe5b0..ac2fb57 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -114,7 +114,9 @@ jobs:
           path: ./wheelhouse/*.whl
           name: wheels-${{ matrix.os }}
 
-  # disabling QBLAS optimization for windows due to incompatibility with MSVC
+  # QBLAS is auto-disabled on Windows by meson.build (it uses GCC/POSIX-only
+  # constructs that MSVC does not support); the wheel falls back to the
+  # naive matmul kernel.
   build_wheels_windows:
     name: Build wheels on Windows
     runs-on: windows-latest
@@ -153,9 +155,6 @@ jobs:
           CIBW_BUILD_VERBOSITY: "3"
           DISTUTILS_USE_SDK: "1"
           MSSdk: "1"
-          CIBW_ENVIRONMENT: >
-            CFLAGS="/DDISABLE_QUADBLAS $CFLAGS"
-            CXXFLAGS="/DDISABLE_QUADBLAS $CXXFLAGS"
           CIBW_REPAIR_WHEEL_COMMAND: 'delvewheel repair -w {dest_dir} {wheel} --add-path C:\sleef\bin'
           CIBW_TEST_COMMAND_WINDOWS: pip install numpy && pip install --no-deps {wheel} && pip install pytest pytest-run-parallel && pytest -s {project}/tests
           CIBW_TEST_EXTRAS: test
diff --git a/.github/workflows/test_old_cpu.yml b/.github/workflows/test_old_cpu.yml
index 476a876..d292a69 100644
--- a/.github/workflows/test_old_cpu.yml
+++ b/.github/workflows/test_old_cpu.yml
@@ -1,8 +1,4 @@
-name: Test on Older CPUs (x86_64-v2)
-
-# This workflow tests numpy-quaddtype on older x86 CPUs using Intel SDE.
-# It ensures compatibility with x86_64-v2 baseline CPUs (e.g., Sandy Bridge)
-# that don't have AVX2/FMA support.
+name: Test Older CPUs + Build Options
 
 on:
   push:
@@ -13,14 +9,21 @@ on:
 
 jobs:
   test_old_cpu:
-    name: Test on ${{ matrix.cpu[1] }}
+    name: Test ${{ matrix.config.name }}
     runs-on: ubuntu-24.04
     strategy:
       fail-fast: false
       matrix:
-        cpu:
-          - ['snb', 'Sandy Bridge (x86_64-v2)']
-          - ['hsw', 'Haswell (x86_64-v3)']
+        config:
+          - name: "Sandy Bridge (x86_64-v2)"
+            sde_cpu: "snb"
+            meson_args: "-Csetup-args=-Ddisable_fma=true"
+          - name: "Haswell (x86_64-v3)"
+            sde_cpu: "hsw"
+            meson_args: ""
+          - name: "Haswell, no QBLAS"
+            sde_cpu: "hsw"
+            meson_args: "-Csetup-args=-Ddisable_quadblas=true"
     steps:
       - uses: actions/checkout@v6
         with:
@@ -51,27 +54,21 @@ jobs:
         env:
           LDFLAGS: "-fopenmp"
         run: |
-          # For Sandy Bridge (x86-64-v2), we need to disable FMA code paths
-          # since FMA instructions are not available on that microarchitecture
-          if [ "${{ matrix.cpu[0] }}" = "snb" ]; then
-            pip install .[test] --no-build-isolation -v -Csetup-args=-Ddisable_fma=true
-          else
-            pip install .[test] --no-build-isolation -v
-          fi
+          pip install .[test] --no-build-isolation -v ${{ matrix.config.meson_args }}
 
-      - name: Test import on ${{ matrix.cpu[1] }}
+      - name: Test import on ${{ matrix.config.name }}
         run: |
-          echo "Testing basic import on ${{ matrix.cpu[1] }}..."
-          sde -${{ matrix.cpu[0] }} -- python -c "
+          echo "Testing basic import on ${{ matrix.config.name }}..."
+          sde -${{ matrix.config.sde_cpu }} -- python -c "
           import numpy as np
           print('NumPy version:', np.__version__)
           from numpy_quaddtype import QuadPrecDType
           print('QuadPrecDType imported successfully!')
           arr = np.zeros(3, dtype=QuadPrecDType())
           print('Created quad array:', arr)
-          print('SUCCESS: Works on ${{ matrix.cpu[1] }}!')
+          print('SUCCESS: Works on ${{ matrix.config.name }}!')
           "
 
-      - name: Run tests on ${{ matrix.cpu[1] }}
+      - name: Run tests on ${{ matrix.config.name }}
         run: |
-          sde -${{ matrix.cpu[0] }} -- python -m pytest tests/ -v --tb=short -v -s
+          sde -${{ matrix.config.sde_cpu }} -- python -m pytest tests/ -v --tb=short -v -s
diff --git a/.gitignore b/.gitignore
index 6a64dd3..f5cec6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -146,3 +146,4 @@ compile_commands.json
 
 # docs
 /docs/_build/
+build_log.txt
diff --git a/meson.build b/meson.build
index d0aa2d8..5bc11f4 100644
--- a/meson.build
+++ b/meson.build
@@ -15,10 +15,7 @@ if is_windows
   add_project_arguments('-DWIN32', '-D_WINDOWS', language : ['c', 'cpp'])
 endif
 
-qblas_dep = dependency('qblas', fallback: ['qblas', 'qblas_dep'])
 
-# Try to find SLEEF system-wide first, fall back to subproject if not found
-# Required SLEEF version (must match sleef.wrap revision)
 required_sleef_version = '3.9.0'
 # Don't use fallback here - we need to call subproject() explicitly later with disable_fma option
 sleef_dep = dependency('sleef', version: '>=' + required_sleef_version, required: false)
@@ -30,7 +27,7 @@ if sleef_dep.found() and sleef_dep.version().startswith(required_sleef_version)
   # SLEEF found system-wide - verify quad-precision support
   cpp = meson.get_compiler('cpp')
   sleefquad_lib = cpp.find_library('sleefquad', required: false)
-  
+
   if sleefquad_lib.found()
     sleefquad_test_code = '''
     #include <sleefquad.h>
@@ -48,7 +45,7 @@ if sleef_dep.found() and sleef_dep.version().startswith(required_sleef_version)
       dependencies: [sleef_dep, sleefquad_lib],
       name: 'SLEEF quad-precision support'
     )
-    
+
     if quad_works
       sleefquad_dep = declare_dependency(
         dependencies: [sleef_dep, sleefquad_lib]
@@ -80,6 +77,23 @@ else
   message('Proceeding with vendored SLEEF subproject instead')
 endif
 
+# QBLAS does not build under MSVC (GCC-only flags, POSIX-only APIs, GCC
+# built-ins for CPUID); force-disable it on Windows. Users on other
+# platforms can opt out via -Ddisable_quadblas=true to fall back to the
+# naive matmul kernel.
+disable_quadblas = is_windows or get_option('disable_quadblas')
+if disable_quadblas
+  if is_windows
+    message('QBLAS disabled (Windows / MSVC) - using naive matmul kernel')
+  else
+    message('QBLAS disabled by user option - using naive matmul kernel')
+  endif
+  add_project_arguments('-DDISABLE_QUADBLAS', language: ['c', 'cpp'])
+  qblas_dep = declare_dependency()
+else
+  qblas_dep = dependency('qblas', fallback: ['qblas', 'qblas_dep'])
+endif
+
 incdir_numpy = run_command(py,
   ['-c', 'import numpy; print(numpy.get_include())'],
   check : true
@@ -101,12 +115,6 @@ npymath_lib = c.find_library('npymath', dirs: npymath_path)
 
 dependencies = [py_dep, qblas_dep, sleef_dep, sleefquad_dep, npymath_lib]
 
-# Add OpenMP dependency (optional, for threading)
-openmp_dep = dependency('openmp', required: false, static: false)
-if openmp_dep.found()
-    dependencies += openmp_dep
-endif
-
 # compiler flags for QBLAS compatibility
 if not is_windows
     # QBLAS requires extended numeric literals for Q suffix support
diff --git a/meson.options b/meson.options
index d871c14..abdd203 100644
--- a/meson.options
+++ b/meson.options
@@ -1,3 +1,9 @@
 option('disable_fma', type: 'boolean', value: false,
-       description: 'Disable FMA (Fused Multiply-Add) code paths' +
-                    'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.')
\ No newline at end of file
+       description: 'Disable FMA (Fused Multiply-Add) code paths. ' +
+                    'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.')
+
+option('disable_quadblas', type: 'boolean', value: false,
+       description: 'Skip the QBLAS subproject and fall back to naive ' +
+                    'matmul kernels. Auto-enabled on Windows because ' +
+                    'QBLAS uses GCC/POSIX-only constructs that do not ' +
+                    'build under MSVC.')
diff --git a/src/csrc/quadblas_interface.cpp b/src/csrc/quadblas_interface.cpp
index 9d8c762..f2d27f2 100644
--- a/src/csrc/quadblas_interface.cpp
+++ b/src/csrc/quadblas_interface.cpp
@@ -1,116 +1,74 @@
+// numpy-quaddtype shim around QBLAS.
 #include "quadblas_interface.h"
 #include <cstring>
 #include <algorithm>
+#include <climits>
 
 #ifndef DISABLE_QUADBLAS
-#include "quadblas/quadblas.hpp"
-#endif // DISABLE_QUADBLAS
+#include <qblas/qblas.h>
+#endif
 
 extern "C" {
 
+#ifndef DISABLE_QUADBLAS
 
-#ifndef  DISABLE_QUADBLAS
+static inline QBLAS_LAYOUT to_layout(char c) {
+    return (c == 'C' || c == 'c') ? QblasColMajor : QblasRowMajor;
+}
+static inline QBLAS_TRANSPOSE to_trans(char c) {
+    if (c == 'T' || c == 't') return QblasTrans;
+    if (c == 'C' || c == 'c') return QblasConjTrans;
+    return QblasNoTrans;
+}
 
 int
-qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result)
+qblas_dot(size_t n, Sleef_quad *x, size_t incx,
+          Sleef_quad *y, size_t incy, Sleef_quad *result)
 {
-    if (!x || !y || !result || n == 0) {
+    if (!x || !y || !result || n == 0 || n > (size_t)INT_MAX) {  /* cblas_qdot takes an int length */
         return -1;
     }
-
-    try {
-        *result = QuadBLAS::dot(n, x, incx, y, incy);
-        return 0;
-    }
-    catch (...) {
-        return -1;
-    }
+    *result = cblas_qdot((int)n, x, (int)incx, y, (int)incy);
+    return 0;
 }
 
 int
-qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A,
-           size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy)
+qblas_gemv(char layout, char trans, size_t m, size_t n,
+           Sleef_quad *alpha, Sleef_quad *A, size_t lda,
+           Sleef_quad *x, size_t incx,
+           Sleef_quad *beta, Sleef_quad *y, size_t incy)
 {
-    if (!alpha || !A || !x || !beta || !y || m == 0 || n == 0) {
+    if (!alpha || !A || !x || !beta || !y || m == 0 || n == 0 ||
+        (m | n | lda) > (size_t)INT_MAX) {  /* cblas_qgemv takes int sizes; OR catches any oversized one */
         return -1;
     }
-
-    try {
-        // Convert layout
-        QuadBLAS::Layout qblas_layout;
-        if (layout == 'R' || layout == 'r') {
-            qblas_layout = QuadBLAS::Layout::RowMajor;
-        }
-        else if (layout == 'C' || layout == 'c') {
-            qblas_layout = QuadBLAS::Layout::ColMajor;
-        }
-        else {
-            return -1;  // Invalid layout
-        }
-
-        // Handle transpose (swap dimensions for transpose)
-        size_t actual_m = m, actual_n = n;
-        if (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c') {
-            std::swap(actual_m, actual_n);
-            // For transpose, we need to adjust the layout
-            if (qblas_layout == QuadBLAS::Layout::RowMajor) {
-                qblas_layout = QuadBLAS::Layout::ColMajor;
-            }
-            else {
-                qblas_layout = QuadBLAS::Layout::RowMajor;
-            }
-        }
-
-        // Call QBLAS GEMV
-        QuadBLAS::gemv(qblas_layout, actual_m, actual_n, *alpha, A, lda, x, incx, *beta, y, incy);
-
-        return 0;
-    }
-    catch (...) {
-        return -1;
-    }
+    cblas_qgemv(to_layout(layout), to_trans(trans),
+                (int)m, (int)n,
+                *alpha, A, (int)lda, x, (int)incx,
+                *beta, y, (int)incy);
+    return 0;
 }
 
 int
-qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha,
-           Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C,
-           size_t ldc)
+qblas_gemm(char layout, char transa, char transb,
+           size_t m, size_t n, size_t k,
+           Sleef_quad *alpha, Sleef_quad *A, size_t lda,
+           Sleef_quad *B, size_t ldb,
+           Sleef_quad *beta, Sleef_quad *C, size_t ldc)
 {
-    if (!alpha || !A || !B || !beta || !C || m == 0 || n == 0 || k == 0) {
+    if (!alpha || !A || !B || !beta || !C || m == 0 || n == 0 || k == 0 ||
+        (m | n | k | lda | ldb | ldc) > (size_t)INT_MAX) {  /* cblas_qgemm takes int sizes; OR catches any oversized one */
         return -1;
     }
-
-    try {
-        QuadBLAS::Layout qblas_layout;
-        if (layout == 'R' || layout == 'r') {
-            qblas_layout = QuadBLAS::Layout::RowMajor;
-        }
-        else if (layout == 'C' || layout == 'c') {
-            qblas_layout = QuadBLAS::Layout::ColMajor;
-        }
-        else {
-            return -1;  // Invalid layout
-        }
-
-        // For now, we only support no transpose
-        // TODO: Implement transpose support if needed
-        if ((transa != 'N' && transa != 'n') || (transb != 'N' && transb != 'n')) {
-            return -1;  // Transpose not implemented yet
-        }
-
-        QuadBLAS::gemm(qblas_layout, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc);
-
-        return 0;
-    }
-    catch (...) {
-        return -1;
-    }
+    cblas_qgemm(to_layout(layout), to_trans(transa), to_trans(transb),
+                (int)m, (int)n, (int)k, *alpha, A, (int)lda,
+                B, (int)ldb, *beta, C, (int)ldc);
+    return 0;
 }
 
 int
 qblas_supports_backend(QuadBackendType backend)
 {
-    // QBLAS only supports SLEEF backend
     return (backend == BACKEND_SLEEF) ? 1 : 0;
 }
 
@@ -121,113 +79,94 @@ py_quadblas_set_num_threads(PyObject *self, PyObject *args)
     if (!PyArg_ParseTuple(args, "i", &num_threads)) {
         return NULL;
     }
-
     if (num_threads <= 0) {
         PyErr_SetString(PyExc_ValueError, "Number of threads must be positive");
         return NULL;
     }
-
-    QuadBLAS::set_num_threads(num_threads);
+    qblas_set_num_threads(num_threads);
     Py_RETURN_NONE;
 }
 
 PyObject *
 py_quadblas_get_num_threads(PyObject *self, PyObject *args)
 {
-    int num_threads = QuadBLAS::get_num_threads();
-    return PyLong_FromLong(num_threads);
+    return PyLong_FromLong(qblas_get_num_threads());
 }
 
 PyObject *
 py_quadblas_get_version(PyObject *self, PyObject *args)
 {
-    return PyUnicode_FromString("QuadBLAS 1.0.0 - High Performance Quad Precision BLAS");
+    /* qblas_get_version() returns "QBLAS X.Y.Z"; pair it with the
+     * runtime-detected SIMD tier so callers can confirm what's active. */
+    const char *ver  = qblas_get_version();
+    const char *tier = qblas_get_dispatch_tier();
+    char buf[256];
+    PyOS_snprintf(buf, sizeof buf, "%s (dispatch: %s)", ver, tier);
+    return PyUnicode_FromString(buf);
 }
 
 int
 _quadblas_set_num_threads(int num_threads)
 {
-    QuadBLAS::set_num_threads(num_threads);
+    qblas_set_num_threads(num_threads);
     return 0;
 }
 
 int
 _quadblas_get_num_threads(void)
 {
-    int num_threads = QuadBLAS::get_num_threads();
-    return num_threads;
+    return qblas_get_num_threads();
 }
 
-#else  // DISABLE_QUADBLAS
-
+#else  /* DISABLE_QUADBLAS */
 
 int
 qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result)
-{
-    return -1;  // QBLAS is disabled, dot product not available
-}
+{ return -1; }
 
 int
 qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A,
            size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy)
-{
-    return -1;  // QBLAS is disabled, GEMV not available
-}
+{ return -1; }
 
 int
 qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha,
            Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C,
            size_t ldc)
-{
-    return -1;  // QBLAS is disabled, GEMM not available
-}
+{ return -1; }
 
-int
-qblas_supports_backend(QuadBackendType backend)
-{
-    return -1; // QBLAS is disabled, backend support not available
-}
+int qblas_supports_backend(QuadBackendType backend) { return -1; }
 
 PyObject *
 py_quadblas_set_num_threads(PyObject *self, PyObject *args)
 {
-    // raise error
     PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
     return NULL;
 }
-
 PyObject *
 py_quadblas_get_num_threads(PyObject *self, PyObject *args)
 {
-    // raise error
     PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
     return NULL;
 }
-
 PyObject *
 py_quadblas_get_version(PyObject *self, PyObject *args)
 {
-    // raise error
     PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
     return NULL;
 }
 
-int
-_quadblas_set_num_threads(int num_threads)
+int _quadblas_set_num_threads(int num_threads)
 {
-    // raise error
     PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
     return -1;
 }
-
-int
-_quadblas_get_num_threads(void)
+int _quadblas_get_num_threads(void)
 {
-    // raise error
     PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
     return -1;
 }
 
-#endif // DISABLE_QUADBLAS
+#endif /* DISABLE_QUADBLAS */
 
-}  // extern "C"
\ No newline at end of file
+}  /* extern "C" */
diff --git a/subprojects/packagefiles/qblas/meson.build b/subprojects/packagefiles/qblas/meson.build
deleted file mode 100644
index b7a972f..0000000
--- a/subprojects/packagefiles/qblas/meson.build
+++ /dev/null
@@ -1,8 +0,0 @@
-project('qblas', meson_version: '>=1.1')
-
-qblas_inc = include_directories('include')
-
-qblas_dep = declare_dependency(
-    include_directories: qblas_inc,
-    version: meson.project_version()
-)
diff --git a/subprojects/qblas.wrap b/subprojects/qblas.wrap
index d8c2a89..019cb0f 100644
--- a/subprojects/qblas.wrap
+++ b/subprojects/qblas.wrap
@@ -1,8 +1,7 @@
 [wrap-git]
-directory=qblas
-url=https://github.com/SwayamInSync/QBLAS.git
-revision=42126fd78cbc04e9b031475fe39f4f46eaa51e01
-patch_directory = qblas
+directory = qblas
+url = https://github.com/SwayamInSync/QBLAS.git
+revision = 8deb36b67ae4d2c81dfca2ceac8957deb8f23c9b
 
 [provide]
 qblas = qblas_dep