diff --git a/Doc/ChangeLog b/Doc/ChangeLog index 80964cba2d..4abd225652 100644 --- a/Doc/ChangeLog +++ b/Doc/ChangeLog @@ -1,3 +1,10 @@ +Jan 21, 2026, version 10.3.1 + + * (67) bug fix: incorrect JIT kernel constructed for R=masker(C,M,Z) + when R is hypersparse. Found by Roi Lipman, FalkorDB. + * (66) bug fix: GB_bitonic did not compile with the MS cl compiler. + Found by Erik Welch, NVIDIA. + Dec 3, 2025, version 10.3.0 * GrB_extract: performance improvement for C=A(I,J) when A is very large diff --git a/Doc/GraphBLAS_UserGuide.pdf b/Doc/GraphBLAS_UserGuide.pdf index cd01ef2d65..14bc183482 100644 Binary files a/Doc/GraphBLAS_UserGuide.pdf and b/Doc/GraphBLAS_UserGuide.pdf differ diff --git a/Doc/GraphBLAS_version.tex b/Doc/GraphBLAS_version.tex index 0baf6d5321..267f6f81f7 100644 --- a/Doc/GraphBLAS_version.tex +++ b/Doc/GraphBLAS_version.tex @@ -1,5 +1,5 @@ % version of SuiteSparse:GraphBLAS \date{VERSION -10.3.0, -Dec 3, 2025} +10.3.1, +Jan 21, 2026} diff --git a/Doc/UserGuide/GrB_release.tex b/Doc/UserGuide/GrB_release.tex index 976d3abbb6..7dcd6c0a1e 100644 --- a/Doc/UserGuide/GrB_release.tex +++ b/Doc/UserGuide/GrB_release.tex @@ -5,6 +5,15 @@ \section{Release Notes} \begin{itemize} +\item Jan 21, 2026: version 10.3.1 + + \begin{itemize} + \item (67) bug fix: incorrect JIT kernel constructed for R=masker(C,M,Z) + when R is hypersparse. Found by Roi Lipman, FalkorDB. + \item (66) bug fix: \verb'GB_bitonic' did not compile with the MS cl + compiler. Found by Erik Welch, NVIDIA. 
+ \end{itemize} + \item Dec 3, 2025: version 10.3.0 \begin{itemize} diff --git a/Include/GraphBLAS.h b/Include/GraphBLAS.h index cacf58ad11..86ab614231 100644 --- a/Include/GraphBLAS.h +++ b/Include/GraphBLAS.h @@ -1,4 +1,4 @@ -// SuiteSparse:GraphBLAS 10.3.0 +// SuiteSparse:GraphBLAS 10.3.1 //------------------------------------------------------------------------------ // GraphBLAS.h: definitions for the GraphBLAS package //------------------------------------------------------------------------------ @@ -286,10 +286,10 @@ // The version of this implementation, and the GraphBLAS API version: #define GxB_IMPLEMENTATION_NAME "SuiteSparse:GraphBLAS" -#define GxB_IMPLEMENTATION_DATE "Dec 3, 2025" +#define GxB_IMPLEMENTATION_DATE "Jan 21, 2026" #define GxB_IMPLEMENTATION_MAJOR 10 #define GxB_IMPLEMENTATION_MINOR 3 -#define GxB_IMPLEMENTATION_SUB 0 +#define GxB_IMPLEMENTATION_SUB 1 #define GxB_SPEC_DATE "Dec 22, 2023" #define GxB_SPEC_MAJOR 2 #define GxB_SPEC_MINOR 1 diff --git a/README.md b/README.md index be849071b3..1d416dd328 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved. 
SPDX-License-Identifier: Apache-2.0 -VERSION 10.3.0, Dec 3, 2025 +VERSION 10.3.1, Jan 21, 2026 SuiteSparse:GraphBLAS is a complete implementation of the GraphBLAS standard, which defines a set of sparse matrix operations on an extended algebra of diff --git a/Source/global/GB_Global.c b/Source/global/GB_Global.c index c2a010509c..fc7e0a237c 100644 --- a/Source/global/GB_Global.c +++ b/Source/global/GB_Global.c @@ -253,7 +253,7 @@ static GB_Global_struct GB_Global = .gpu_count = 0, // # of GPUs in the system // OpenMP locks - .lock_is_created = {0, 0, 0, 0}, + .lock_is_created = {0, 0, 0, 0, 0, 0, 0, 0}, // of size GB_GLOBAL_NLOCKS } ; //============================================================================== diff --git a/Source/jit_kernels/include/GB_jit_kernel_proto.h b/Source/jit_kernels/include/GB_jit_kernel_proto.h index 5bdcb65789..cad1dc0cd7 100644 --- a/Source/jit_kernels/include/GB_jit_kernel_proto.h +++ b/Source/jit_kernels/include/GB_jit_kernel_proto.h @@ -727,7 +727,12 @@ GrB_Info GB_jit_kernel_kroner \ ( \ GrB_Matrix C, \ const GrB_Matrix A, \ + const bool A_transpose, \ const GrB_Matrix B, \ + const bool B_transpose, \ + const GrB_Matrix Mask, \ + const bool Mask_struct, \ + const bool Mask_comp, \ const int nthreads, \ const void *theta, \ const GB_callback_struct *restrict my_callback \ diff --git a/Source/jit_wrappers/GB_kroner_jit.c b/Source/jit_wrappers/GB_kroner_jit.c index 174ec0e37e..1919b47f7d 100644 --- a/Source/jit_wrappers/GB_kroner_jit.c +++ b/Source/jit_wrappers/GB_kroner_jit.c @@ -20,7 +20,12 @@ GrB_Info GB_kroner_jit const GrB_BinaryOp binaryop, const bool flipij, const GrB_Matrix A, + const bool A_transpose, const GrB_Matrix B, + const bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_struct, + const bool Mask_comp, const int nthreads ) { @@ -41,7 +46,7 @@ GrB_Info GB_kroner_jit GB_JIT_KERNEL_KRONER, /* is_ewisemult: */ false, /* C_iso: */ C->iso, /* C_in_iso: */ false, C_sparsity, C->type, C->p_is_32, C->j_is_32, 
C->i_is_32, - /* M: */ NULL, true, false, binaryop, flipij, false, A, B) ; + /* M: */ Mask, Mask_struct, Mask_comp, binaryop, flipij, false, A, B) ; //-------------------------------------------------------------------------- // get the kernel function pointer, loading or compiling it if needed @@ -60,6 +65,6 @@ GrB_Info GB_kroner_jit #include "include/GB_pedantic_disable.h" GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ; - return (GB_jit_kernel (C, A, B, nthreads, binaryop->theta, &GB_callback)) ; + return (GB_jit_kernel (C, A, A_transpose, B, B_transpose, Mask, Mask_struct, Mask_comp, nthreads, binaryop->theta, &GB_callback)) ; } diff --git a/Source/jit_wrappers/GB_masker_phase1_jit.c b/Source/jit_wrappers/GB_masker_phase1_jit.c index f43fe6b915..410939a585 100644 --- a/Source/jit_wrappers/GB_masker_phase1_jit.c +++ b/Source/jit_wrappers/GB_masker_phase1_jit.c @@ -29,6 +29,7 @@ GrB_Info GB_masker_phase1_jit // count nnz in each R(:,j) const int64_t *restrict R_to_Z, const bool Rp_is_32, // if true, Rp is 32-bit; else 64-bit const bool Rj_is_32, // if true, Rh is 32-bit; else 64-bit + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -45,7 +46,8 @@ GrB_Info GB_masker_phase1_jit // count nnz in each R(:,j) GB_jit_encoding encoding ; char *suffix ; uint64_t hash = GB_encodify_masker (&encoding, &suffix, - GB_JIT_KERNEL_MASKER_PHASE1, NULL, Rp_is_32, Rj_is_32, false, + GB_JIT_KERNEL_MASKER_PHASE1, R_sparsity, /* rtype: */ NULL, + Rp_is_32, Rj_is_32, /* Ri is not accessed: */ false, M, Mask_struct, Mask_comp, C, Z) ; //-------------------------------------------------------------------------- diff --git a/Source/jit_wrappers/GB_masker_phase2_jit.c b/Source/jit_wrappers/GB_masker_phase2_jit.c index c014467352..7897a49ef9 100644 --- a/Source/jit_wrappers/GB_masker_phase2_jit.c +++ b/Source/jit_wrappers/GB_masker_phase2_jit.c @@ 
-23,6 +23,7 @@ GrB_Info GB_masker_phase2_jit // phase2 for R = masker (C,M,Z) const int64_t *restrict R_to_M, const int64_t *restrict R_to_C, const int64_t *restrict R_to_Z, + const int R_sparsity, // any sparsity format // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -45,7 +46,8 @@ GrB_Info GB_masker_phase2_jit // phase2 for R = masker (C,M,Z) GB_jit_encoding encoding ; char *suffix ; uint64_t hash = GB_encodify_masker (&encoding, &suffix, - GB_JIT_KERNEL_MASKER_PHASE2, R, R->p_is_32, R->j_is_32, R->i_is_32, + GB_JIT_KERNEL_MASKER_PHASE2, R_sparsity, R->type, + R->p_is_32, R->j_is_32, R->i_is_32, M, Mask_struct, Mask_comp, C, Z) ; //-------------------------------------------------------------------------- diff --git a/Source/jitifyer/GB_encodify_masker.c b/Source/jitifyer/GB_encodify_masker.c index f1588eca6c..628f881b48 100644 --- a/Source/jitifyer/GB_encodify_masker.c +++ b/Source/jitifyer/GB_encodify_masker.c @@ -20,7 +20,8 @@ uint64_t GB_encodify_masker // encode a masker problem char **suffix, // suffix for user-defined kernel // input: const GB_jit_kcode kcode, // kernel to encode - const GrB_Matrix R, // may be NULL, for phase1 + const int R_sparsity, // any sparsity format + const GrB_Type rtype, const bool Rp_is_32, // if true, R->p is 32 bit; else 64 bit const bool Rj_is_32, // if true, R->h is 32 bit; else 64 bit const bool Ri_is_32, // if true, R->i is 32 bit; else 64 bit @@ -33,11 +34,10 @@ uint64_t GB_encodify_masker // encode a masker problem { //-------------------------------------------------------------------------- - // check if the R->type is JIT'able + // check if the rtype is JIT'able //-------------------------------------------------------------------------- - GrB_Type rtype = (R == NULL) ? 
NULL : R->type ; - if (R != NULL && rtype->hash == UINT64_MAX) + if (rtype != NULL && rtype->hash == UINT64_MAX) { // cannot JIT this type memset (encoding, 0, sizeof (GB_jit_encoding)) ; @@ -50,7 +50,8 @@ uint64_t GB_encodify_masker // encode a masker problem //-------------------------------------------------------------------------- GB_encodify_kcode (encoding, kcode) ; - GB_enumify_masker (&encoding->code, R, Rp_is_32, Rj_is_32, Ri_is_32, + GB_enumify_masker (&encoding->code, R_sparsity, rtype, + Rp_is_32, Rj_is_32, Ri_is_32, M, Mask_struct, Mask_comp, C, Z) ; //-------------------------------------------------------------------------- diff --git a/Source/jitifyer/GB_enumify_masker.c b/Source/jitifyer/GB_enumify_masker.c index 2f87cce1bd..b6bbff0dca 100644 --- a/Source/jitifyer/GB_enumify_masker.c +++ b/Source/jitifyer/GB_enumify_masker.c @@ -15,7 +15,8 @@ void GB_enumify_masker // enumify a masker problem // output: uint64_t *method_code, // unique encoding of the entire operation // input: - const GrB_Matrix R, // NULL for phase 1 + const int R_sparsity, // any sparsity format + const GrB_Type rtype, // the type of R (NULL for phase1) const bool Rp_is_32, // if true, R->p is 32-bit; else 64-bit const bool Rj_is_32, // if true, R->h is 32-bit; else 64-bit const bool Ri_is_32, // if true, R->i is 32-bit; else 64-bit @@ -28,12 +29,11 @@ void GB_enumify_masker // enumify a masker problem { //-------------------------------------------------------------------------- - // get the types of R, C, and Z + // check inputs //-------------------------------------------------------------------------- - GrB_Type rtype = (R == NULL) ? 
NULL : R->type ; - ASSERT (GB_IMPLIES (R != NULL, rtype == C->type)) ; - ASSERT (GB_IMPLIES (R != NULL, rtype == Z->type)) ; + ASSERT (GB_IMPLIES (rtype != NULL, rtype == C->type)) ; + ASSERT (GB_IMPLIES (rtype != NULL, rtype == Z->type)) ; //-------------------------------------------------------------------------- // enumify the types @@ -55,7 +55,6 @@ void GB_enumify_masker // enumify a masker problem // enumify the sparsity structures of R, C, M, and Z //-------------------------------------------------------------------------- - int R_sparsity = GB_sparsity (R) ; int C_sparsity = GB_sparsity (C) ; int M_sparsity = GB_sparsity (M) ; int Z_sparsity = GB_sparsity (Z) ; diff --git a/Source/jitifyer/GB_stringify.h b/Source/jitifyer/GB_stringify.h index 3fa499df20..5471bbaf24 100644 --- a/Source/jitifyer/GB_stringify.h +++ b/Source/jitifyer/GB_stringify.h @@ -1573,6 +1573,7 @@ GrB_Info GB_masker_phase1_jit // count nnz in each R(:,j) const int64_t *restrict R_to_Z, const bool Rp_is_32, // if true, Rp is 32-bit; else 64-bit const bool Rj_is_32, // if true, Rh is 32-bit; else 64-bit + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -1592,6 +1593,7 @@ GrB_Info GB_masker_phase2_jit // phase2 for R = masker (C,M,Z) const int64_t *restrict R_to_M, const int64_t *restrict R_to_C, const int64_t *restrict R_to_Z, + const int R_sparsity, // any sparsity format // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -1614,7 +1616,8 @@ uint64_t GB_encodify_masker // encode a masker problem char **suffix, // suffix for user-defined kernel // input: const GB_jit_kcode kcode, // kernel to encode - const GrB_Matrix R, // may be NULL, for phase1 + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE + const GrB_Type rtype, const bool Rp_is_32, // if true, R->p is 32 bit; else 64 bit const bool 
Rj_is_32, // if true, R->h is 32 bit; else 64 bit const bool Ri_is_32, // if true, R->i is 32 bit; else 64 bit @@ -1630,7 +1633,8 @@ void GB_enumify_masker // enumify a masker problem // output: uint64_t *method_code, // unique encoding of the entire operation // input: - const GrB_Matrix R, // NULL for phase 1 + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE + const GrB_Type rtype, // the type of R (NULL for phase1) const bool Rp_is_32, // if true, R->p is 32-bit; else 64-bit const bool Rj_is_32, // if true, R->h is 32-bit; else 64-bit const bool Ri_is_32, // if true, R->i is 32-bit; else 64-bit @@ -1833,7 +1837,12 @@ GrB_Info GB_kroner_jit const GrB_BinaryOp binaryop, const bool flipij, const GrB_Matrix A, + const bool A_transpose, const GrB_Matrix B, + const bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_struct, + const bool Mask_comp, const int nthreads ) ; diff --git a/Source/kronecker/GB_kron.c b/Source/kronecker/GB_kron.c index 5c462da512..7e414d7fbe 100644 --- a/Source/kronecker/GB_kron.c +++ b/Source/kronecker/GB_kron.c @@ -23,16 +23,76 @@ GB_Matrix_free (&T) ; \ } +#define GBI(Ai,p,avlen) ((Ai == NULL) ? ((p) % (avlen)) : Ai [p]) + +#define GBB(Ab,p) ((Ab == NULL) ? 1 : Ab [p]) + +#define GBP(Ap,k,avlen) ((Ap == NULL) ? ((k) * (avlen)) : Ap [k]) + +#define GBH(Ah,k) ((Ah == NULL) ? (k) : Ah [k]) + #include "kronecker/GB_kron.h" #include "mxm/GB_mxm.h" #include "transpose/GB_transpose.h" #include "mask/GB_accum_mask.h" +static bool GB_lookup_xoffset ( + GrB_Index *p, + GrB_Matrix A, + GrB_Index row, + GrB_Index col +) +{ + GrB_Index vector = A->is_csc ? col : row ; + GrB_Index coord = A->is_csc ? row : col ; + + if (A->p == NULL) + { + GrB_Index offset = vector * A->vlen + coord ; + if (A->b == NULL || ((int8_t *)A->b)[offset]) + { + *p = A->iso ? 0 : offset ; + return true ; + } + return false ; + } + + int64_t start, end ; + bool res ; + + if (A->h == NULL) + { + start = A->p_is_32 ? 
((uint32_t *)A->p)[vector] : ((uint64_t *)A->p)[vector] ; + end = A->p_is_32 ? ((uint32_t *)A->p)[vector + 1] : ((uint64_t *)A->p)[vector + 1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } + else + { + start = 0 ; end = A->plen - 1 ; + res = GB_binary_search(vector, A->h, A->j_is_32, &start, &end) ; + if (!res) return false ; + int64_t k = start ; + start = A->p_is_32 ? ((uint32_t *)A->p)[k] : ((uint64_t *)A->p)[k] ; + end = A->p_is_32 ? ((uint32_t *)A->p)[k+1] : ((uint64_t *)A->p)[k+1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } +} + +#include "emult/GB_emult.h" + GrB_Info GB_kron // C = accum (C, kron(A,B)) ( GrB_Matrix C, // input/output matrix for results const bool C_replace, // if true, clear C before writing to it - const GrB_Matrix M, // optional mask for C, unused if NULL + const GrB_Matrix Mask, // optional mask for C, unused if NULL const bool Mask_comp, // if true, use !M const bool Mask_struct, // if true, use the only structure of M const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) @@ -51,6 +111,8 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) // C may be aliased with M, A, and/or B + GrB_Matrix M = Mask ; + GrB_Info info ; struct GB_Matrix_opaque T_header, AT_header, BT_header ; GrB_Matrix T = NULL, AT = NULL, BT = NULL ; @@ -104,6 +166,81 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) // quick return if an empty mask is complemented GB_RETURN_IF_QUICK_MASK (C, C_replace, M, Mask_comp, Mask_struct) ; + // check if it's possible to apply mask immediately in kron + // TODO: MT should have its own 32/64 bitness controls + + bool Mask_is_applicable = M != NULL && !Mask_comp ; + if (Mask_is_applicable) { + bool MT_hypersparse = (A->h != NULL) || (B->h != NULL) ; + size_t allocated = 0 ; + + 
GrB_Matrix MT = NULL ; struct GB_Matrix_opaque MT_header ; + GB_CLEAR_MATRIX_HEADER (MT, &MT_header) ; + + bool A_is_pattern, B_is_pattern ; + GB_binop_pattern (&A_is_pattern, &B_is_pattern, false, op->opcode) ; + + GrB_Info masked_kroner_info = GB_kroner (MT, C->is_csc, op, false, A, A_is_pattern, A_transpose, B, B_is_pattern, B_transpose, + M, Mask_comp, Mask_struct, Werk) ; + if (masked_kroner_info != GrB_SUCCESS) + { + return masked_kroner_info ; + } + + if (MT->is_csc != C->is_csc) { + GrB_Info MTtranspose = GB_transpose_in_place (MT, true, Werk) ; + if (MTtranspose != GrB_SUCCESS) + { + GB_FREE_WORKSPACE ; + GB_Matrix_free (&MT) ; + return MTtranspose ; + } + } + + if (MT_hypersparse) + { + uint32_t *MTh32 = NULL ; uint64_t *MTh64 = NULL ; + if (MT->j_is_32) + { + MTh32 = GB_malloc_memory (MT->vdim, sizeof(uint32_t), &allocated) ; + } + else + { + MTh64 = GB_malloc_memory (MT->vdim, sizeof(uint64_t), &allocated) ; + } + + if (MTh32 == NULL && MTh64 == NULL) + { + GB_FREE_WORKSPACE ; + GB_Matrix_free (&MT) ; + return GrB_OUT_OF_MEMORY ; + } + + double work = M->vdim ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + double chunk = GB_Context_chunk ( ) ; + int masked_hyper_threads = GB_nthreads (work, chunk, nthreads_max) ; + + #pragma omp parallel for num_threads(masked_hyper_threads) schedule(static) + for (GrB_Index i = 0; i < MT->vdim; i++) + { + if (MT->j_is_32) { MTh32[i] = i ; } else { MTh64[i] = i ; } + } + + MT->h = MTh32 ? 
(void *)MTh32 : (void *)MTh64 ; + + GrB_Info MThyperprune = GB_hyper_prune (MT, Werk) ; + if (MThyperprune != GrB_SUCCESS) + { + GB_FREE_WORKSPACE ; + GB_Matrix_free (&MT) ; + return MThyperprune ; + } + } + + return (GB_accum_mask (C, NULL, NULL, accum, &MT, C_replace, Mask_comp, Mask_struct, Werk)) ; + } + //-------------------------------------------------------------------------- // transpose A and B if requested //-------------------------------------------------------------------------- @@ -152,8 +289,8 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) GB_CLEAR_MATRIX_HEADER (T, &T_header) ; GB_OK (GB_kroner (T, T_is_csc, op, flipij, - A_transpose ? AT : A, A_is_pattern, - B_transpose ? BT : B, B_is_pattern, Werk)) ; + A_transpose ? AT : A, A_is_pattern, A_transpose, + B_transpose ? BT : B, B_is_pattern, B_transpose, M, Mask_comp, Mask_struct, Werk)) ; GB_FREE_WORKSPACE ; ASSERT_MATRIX_OK (T, "T = kron(A,B)", GB0) ; diff --git a/Source/kronecker/GB_kron.h b/Source/kronecker/GB_kron.h index a6c1a3f3ae..a25b0cfa15 100644 --- a/Source/kronecker/GB_kron.h +++ b/Source/kronecker/GB_kron.h @@ -15,7 +15,7 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) ( GrB_Matrix C, // input/output matrix for results const bool C_replace, // if true, clear C before writing to it - const GrB_Matrix M, // optional mask for C, unused if NULL + const GrB_Matrix Mask, // optional mask for C, unused if NULL const bool Mask_comp, // if true, use !M const bool Mask_struct, // if true, use the only structure of M const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) @@ -35,8 +35,13 @@ GrB_Info GB_kroner // C = kron (A,B) const bool flipij, // if true, i and j are flipped: z=(x,y,j,i) const GrB_Matrix A, // input matrix bool A_is_pattern, // true if values of A are not used + bool A_transpose, const GrB_Matrix B, // input matrix bool B_is_pattern, // true if values of B are not used + bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_comp, + const bool Mask_struct, GB_Werk 
Werk ) ; diff --git a/Source/kronecker/GB_kroner.c b/Source/kronecker/GB_kroner.c index afbb655ee4..fabfd2ceb2 100644 --- a/Source/kronecker/GB_kroner.c +++ b/Source/kronecker/GB_kroner.c @@ -24,12 +24,187 @@ GB_phybix_free (C) ; \ } +#define GBI(Ai,p,avlen) ((Ai == NULL) ? ((p) % (avlen)) : Ai [p]) +#define GBB(Ab,p) ((Ab == NULL) ? 1 : Ab [p]) +#define GBP(Ap,k,avlen) ((Ap == NULL) ? ((k) * (avlen)) : Ap [k]) +#define GBH(Ah,k) ((Ah == NULL) ? (k) : Ah [k]) + +#include "mxm/GB_mxm.h" #include "kronecker/GB_kron.h" #include "emult/GB_emult.h" #include "slice/include/GB_search_for_vector.h" #include "jitifyer/GB_stringify.h" -GrB_Info GB_kroner // C = kron (A,B) +//------------------------------------------------------------------------------ +// GB_lookup_xoffset: find the offset of (row,col) in a Ax if present +//------------------------------------------------------------------------------ + +static bool GB_lookup_xoffset +( + GrB_Index *p, + GrB_Matrix A, + GrB_Index row, + GrB_Index col +) +{ + GrB_Index vector = A->is_csc ? col : row ; + GrB_Index coord = A->is_csc ? row : col ; + + if (A->p == NULL) + { + GrB_Index offset = vector * A->vlen + coord ; + if (A->b == NULL || ((int8_t *) A->b) [offset]) + { + *p = A->iso ? 0 : offset ; + return true ; + } + return false ; + } + + int64_t start, end ; + bool res ; + + if (A->h == NULL) + { + start = A->p_is_32 ? ((uint32_t *) A->p) [vector] + : ((uint64_t *) A->p) [vector] ; + end = A->p_is_32 ? ((uint32_t *) A->p) [vector + 1] + : ((uint64_t *) A->p) [vector + 1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search (coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } + else + { + start = 0 ; end = A->plen - 1 ; + res = GB_binary_search (vector, A->h, A->j_is_32, &start, &end) ; + if (!res) return false ; + int64_t k = start ; + start = A->p_is_32 ? ((uint32_t *) A->p) [k] + : ((uint64_t *) A->p) [k] ; + end = A->p_is_32 ? 
((uint32_t *) A->p) [k + 1] + : ((uint64_t *) A->p) [k + 1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search (coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } +} + +//------------------------------------------------------------------------------ +// GB_kroner_generic: generic kernel for masked and default paths +//------------------------------------------------------------------------------ + +static GrB_Info GB_kroner_generic +( + GrB_Matrix C, + const GrB_BinaryOp op, + const bool flipij, + const GrB_Matrix A, + const GrB_Matrix B, + bool A_is_pattern, + bool B_is_pattern, + bool A_transpose, + bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_comp, + const bool Mask_struct, + const bool C_is_full, + const int nthreads +) +{ + GrB_Type ctype = op->ztype ; + const size_t csize = ctype->size ; + + GB_Ap_DECLARE (Ap, const) ; GB_Ap_PTR (Ap, A) ; + GB_Ah_DECLARE (Ah, const) ; GB_Ah_PTR (Ah, A) ; + const int64_t avlen = A->vlen ; + + GB_Bp_DECLARE (Bp, const) ; GB_Bp_PTR (Bp, B) ; + GB_Bh_DECLARE (Bh, const) ; GB_Bh_PTR (Bh, B) ; + const int64_t bvlen = B->vlen ; + const int64_t bnvec = B->nvec ; + + GB_Cp_DECLARE (Cp, ) ; GB_Cp_PTR (Cp, C) ; + const int64_t cnvec = C->nvec ; + const int64_t cvlen = C->vlen ; + const int64_t cnz = C->nvals ; + + #define GB_NO_MASK (Mask == NULL) + #define GB_MASK_STRUCT Mask_struct + #define GB_MASK_COMP Mask_comp + #define GB_Cp_IS_32 C->p_is_32 + #define GB_A_TYPE GB_void + #define GB_B_TYPE GB_void + #define GB_C_TYPE GB_void + #define GB_A_ISO A_iso + #define GB_B_ISO B_iso + #define GB_C_ISO C_iso + #define GB_C_IS_FULL C_is_full + + const bool A_iso = A->iso ; + const bool B_iso = B->iso ; + const bool C_iso = C->iso ; + const int64_t asize = A->type->size ; + const int64_t bsize = B->type->size ; + + GxB_binary_function fmult = op->binop_function ; + GxB_index_binary_function fmult_idx = op->idxbinop_function ; + const void *theta = op->theta 
; + + GB_cast_function cast_A = NULL, cast_B = NULL ; + if (!A_is_pattern) + cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; + if (!B_is_pattern) + cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; + + #define GB_DECLAREA(a) GB_void a [GB_VLA(asize)] + #define GB_DECLAREB(b) GB_void b [GB_VLA(bsize)] + + #define GB_GETA(a,Ax,p,iso) \ + { \ + if (!A_is_pattern) \ + cast_A (a, Ax + (p)*asize, asize) ; \ + } + + #define GB_GETB(b,Bx,p,iso) \ + { \ + if (!B_is_pattern) \ + cast_B (b, Bx + (p)*bsize, bsize) ; \ + } + + #define GB_KRONECKER_OP(Cx,pC,a,ix,jx,b,iy,jy) \ + { \ + if (fmult != NULL) \ + { \ + fmult (Cx + (pC)*csize, a, b) ; \ + } \ + else \ + { \ + if (flipij) \ + fmult_idx (Cx + (pC)*csize, \ + a, jx, ix, b, jy, iy, theta) ; \ + else \ + fmult_idx (Cx + (pC)*csize, \ + a, ix, jx, b, iy, jy, theta) ; \ + } \ + } + + #define GB_GENERIC + #include "ewise/include/GB_ewise_shared_definitions.h" + #include "kronecker/template/GB_kroner_template.c" + + return GrB_SUCCESS ; +} + +//------------------------------------------------------------------------------ +// GB_kroner: Kronecker product, C = kron (A,B) +//------------------------------------------------------------------------------ + +GrB_Info GB_kroner ( GrB_Matrix C, // output matrix const bool C_is_csc, // desired format of C @@ -37,8 +212,13 @@ GrB_Info GB_kroner // C = kron (A,B) const bool flipij, // if true, i and j are flipped: z=(x,y,j,i) const GrB_Matrix A_in, // input matrix bool A_is_pattern, // true if values of A are not used + bool A_transpose, const GrB_Matrix B_in, // input matrix bool B_is_pattern, // true if values of B are not used + bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_comp, + const bool Mask_struct, GB_Werk Werk ) { @@ -65,12 +245,260 @@ GrB_Info GB_kroner // C = kron (A,B) GB_MATRIX_WAIT (B_in) ; //-------------------------------------------------------------------------- - // bitmap case: create sparse copies of A and B if they are bitmap + 
// common definitions for masked and default //-------------------------------------------------------------------------- + GrB_Type ctype = op->ztype ; + const size_t csize = ctype->size ; + GB_void cscalar [GB_VLA(csize)] ; + bool C_iso = GB_emult_iso (cscalar, ctype, A_in, B_in, op) ; + GrB_Matrix A = A_in ; + GrB_Matrix B = B_in ; + + //-------------------------------------------------------------------------- + // apply mask immediately if possible + //-------------------------------------------------------------------------- + + if (Mask != NULL && !Mask_comp) + { + GB_MATRIX_WAIT (Mask) ; + + int64_t bnrows = B_transpose ? GB_NCOLS (B) : GB_NROWS (B) ; + int64_t bncols = B_transpose ? GB_NROWS (B) : GB_NCOLS (B) ; + size_t allocated = 0 ; + int64_t centries = 0 ; + uint64_t nvecs = 0 ; + + //---------------------------------------------------------------------- + // allocate Cp + //---------------------------------------------------------------------- + + uint32_t *Cp32 = NULL ; uint64_t *Cp64 = NULL ; + if (Mask->p_is_32) + Cp32 = GB_calloc_memory (Mask->vdim + 1, sizeof (uint32_t), + &allocated) ; + else + Cp64 = GB_calloc_memory (Mask->vdim + 1, sizeof (uint64_t), + &allocated) ; + + if (Cp32 == NULL && Cp64 == NULL) + { + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + + GB_Mp_DECLARE (Mp, ) ; GB_Mp_PTR (Mp, Mask) ; + GB_Mh_DECLARE (Mh, ) ; GB_Mh_PTR (Mh, Mask) ; + GB_Mi_DECLARE (Mi, ) ; GB_Mi_PTR (Mi, Mask) ; + + GB_cast_function cast_A = GB_cast_factory (op->xtype->code, + A->type->code) ; + GB_cast_function cast_B = GB_cast_factory (op->ytype->code, + B->type->code) ; + + double work = Mask->vdim ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + double chunk = GB_Context_chunk ( ) ; + int nthreads = GB_nthreads (work, chunk, nthreads_max) ; + + int64_t vlen = Mask->vlen ; + + //---------------------------------------------------------------------- + // count entries per vector + 
//---------------------------------------------------------------------- + + #pragma omp parallel num_threads(nthreads) + { + GrB_Index offset ; + + #pragma omp for reduction(+:nvecs) schedule(static) + for (GrB_Index k = 0 ; k < Mask->nvec ; k++) + { + GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; + + int64_t pM_start = Mp32 ? GBP (Mp32, k, vlen) + : GBP (Mp64, k, vlen) ; + int64_t pM_end = Mp32 ? GBP (Mp32, k+1, vlen) + : GBP (Mp64, k+1, vlen) ; + bool nonempty = false ; + + for (GrB_Index p = pM_start ; p < pM_end ; p++) + { + if (!GBB (Mask->b, p)) continue ; + + int64_t i = Mi32 ? GBI (Mi32, p, vlen) + : GBI (Mi64, p, vlen) ; + GrB_Index Mrow = Mask->is_csc ? i : j ; + GrB_Index Mcol = Mask->is_csc ? j : i ; + + if (Mask_struct || (Mask->iso ? ((bool *) Mask->x) [0] + : ((bool *) Mask->x) [p])) + { + GrB_Index arow = A_transpose ? (Mcol / bncols) + : (Mrow / bnrows) ; + GrB_Index acol = A_transpose ? (Mrow / bnrows) + : (Mcol / bncols) ; + GrB_Index brow = B_transpose ? (Mcol % bncols) + : (Mrow % bnrows) ; + GrB_Index bcol = B_transpose ? (Mrow % bnrows) + : (Mcol % bncols) ; + + if (!GB_lookup_xoffset (&offset, A, arow, acol)) + continue ; + if (!GB_lookup_xoffset (&offset, B, brow, bcol)) + continue ; + + if (Mask->p_is_32) { (Cp32 [j])++ ; } + else { (Cp64 [j])++ ; } + nonempty = true ; + } + } + if (nonempty) nvecs++ ; + } + } + + //---------------------------------------------------------------------- + // prefix sum to get centries + //---------------------------------------------------------------------- + + if (Mask->p_is_32) + GB_cumsum (Cp32, Mask->p_is_32, Mask->vdim, NULL, nthreads, Werk) ; + else + GB_cumsum (Cp64, Mask->p_is_32, Mask->vdim, NULL, nthreads, Werk) ; + + centries = Mask->p_is_32 ? 
(int64_t) Cp32 [Mask->vdim] + : (int64_t) Cp64 [Mask->vdim] ; + + //---------------------------------------------------------------------- + // allocate Ci + //---------------------------------------------------------------------- + + uint32_t *Ci32 = NULL ; uint64_t *Ci64 = NULL ; + if (Mask->i_is_32) + Ci32 = GB_malloc_memory (centries, sizeof (uint32_t), &allocated) ; + else + Ci64 = GB_malloc_memory (centries, sizeof (uint64_t), &allocated) ; + + if (centries > 0 && Ci32 == NULL && Ci64 == NULL) + { + if (Mask->p_is_32) GB_free_memory (&Cp32, + (Mask->vdim + 1) * sizeof (uint32_t)) ; + else GB_free_memory (&Cp64, + (Mask->vdim + 1) * sizeof (uint64_t)) ; + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + + //---------------------------------------------------------------------- + // allocate Cx + //---------------------------------------------------------------------- + + void *Cx = NULL ; + if (C_iso) + { + Cx = GB_malloc_memory (1, op->ztype->size, &allocated) ; + if (Cx == NULL) + { + if (Mask->i_is_32) GB_free_memory (&Ci32, + centries * sizeof (uint32_t)) ; + else GB_free_memory (&Ci64, + centries * sizeof (uint64_t)) ; + if (Mask->p_is_32) GB_free_memory (&Cp32, + (Mask->vdim + 1) * sizeof (uint32_t)) ; + else GB_free_memory (&Cp64, + (Mask->vdim + 1) * sizeof (uint64_t)) ; + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + memcpy (Cx, cscalar, csize) ; + } + else + { + Cx = GB_malloc_memory (centries, op->ztype->size, &allocated) ; + if (centries > 0 && Cx == NULL) + { + if (Mask->i_is_32) GB_free_memory (&Ci32, + centries * sizeof (uint32_t)) ; + else GB_free_memory (&Ci64, + centries * sizeof (uint64_t)) ; + if (Mask->p_is_32) GB_free_memory (&Cp32, + (Mask->vdim + 1) * sizeof (uint32_t)) ; + else GB_free_memory (&Cp64, + (Mask->vdim + 1) * sizeof (uint64_t)) ; + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + } + + //---------------------------------------------------------------------- + // allocate and initialize C matrix header + 
//---------------------------------------------------------------------- + + GrB_Info Calloc = GB_new_bix (&C, op->ztype, vlen, Mask->vdim, + GB_ph_null, Mask->is_csc, GxB_SPARSE, false, Mask->hyper_switch, + Mask->vdim, centries, false, C_iso, + Mask->p_is_32, Mask->j_is_32, Mask->i_is_32) ; + if (Calloc != GrB_SUCCESS) + { + if (C_iso) GB_free_memory (&Cx, op->ztype->size) ; + else GB_free_memory (&Cx, centries * op->ztype->size) ; + if (Mask->i_is_32) GB_free_memory (&Ci32, + centries * sizeof (uint32_t)) ; + else GB_free_memory (&Ci64, + centries * sizeof (uint64_t)) ; + if (Mask->p_is_32) GB_free_memory (&Cp32, + (Mask->vdim + 1) * sizeof (uint32_t)) ; + else GB_free_memory (&Cp64, + (Mask->vdim + 1) * sizeof (uint64_t)) ; + GB_FREE_WORKSPACE ; + return Calloc ; + } + + GB_free_memory (&C->i, C->i_size) ; + GB_free_memory (&C->x, C->x_size) ; + + C->p = Mask->p_is_32 ? (void *) Cp32 : (void *) Cp64 ; + C->i = Mask->i_is_32 ? (void *) Ci32 : (void *) Ci64 ; + C->x = Cx ; + C->p_size = (Mask->p_is_32 ? sizeof (uint32_t) + : sizeof (uint64_t)) + * (Mask->vdim + 1) ; + C->i_size = (Mask->i_is_32 ? sizeof (uint32_t) + : sizeof (uint64_t)) * centries ; + C->x_size = C->iso ? 
op->ztype->size + : op->ztype->size * centries ; + C->magic = GB_MAGIC ; + C->nvals = centries ; + C->nvec_nonempty = (int64_t) nvecs ; + + //---------------------------------------------------------------------- + // evaluate + //---------------------------------------------------------------------- + + info = GB_kroner_jit (C, op, false, A, A_transpose, B, B_transpose, Mask, Mask_struct, Mask_comp, nthreads) ; + + if (info != GrB_SUCCESS) + { + info = GB_kroner_generic (C, op, false, A, B, + A_is_pattern, B_is_pattern, A_transpose, B_transpose, + Mask, Mask_comp, Mask_struct, false, nthreads) ; + } + + GB_FREE_WORKSPACE ; + return info ; + } + + //-------------------------------------------------------------------------- + // default case (no mask, or complemented mask) + //-------------------------------------------------------------------------- + + //-------------------------------------------------------------------------- + // bitmap case: create sparse copies of A and B if they are bitmap + //-------------------------------------------------------------------------- + if (GB_IS_BITMAP (A)) - { + { GBURBLE ("A:") ; GB_CLEAR_MATRIX_HEADER (Awork, &Awork_header) ; GB_OK (GB_dup_worker (&Awork, A->iso, A, true, NULL)) ; @@ -80,9 +508,8 @@ GrB_Info GB_kroner // C = kron (A,B) A = Awork ; } - GrB_Matrix B = B_in ; if (GB_IS_BITMAP (B)) - { + { GBURBLE ("B:") ; GB_CLEAR_MATRIX_HEADER (Bwork, &Bwork_header) ; GB_OK (GB_dup_worker (&Bwork, B->iso, B, true, NULL)) ; @@ -102,7 +529,7 @@ GrB_Info GB_kroner // C = kron (A,B) const int64_t avlen = A->vlen ; const int64_t avdim = A->vdim ; const int64_t anvec = A->nvec ; - const int64_t anz = GB_nnz (A) ; + const int64_t anz = GB_nnz (A) ; GB_Bp_DECLARE (Bp, const) ; GB_Bp_PTR (Bp, B) ; GB_Bh_DECLARE (Bh, const) ; GB_Bh_PTR (Bh, B) ; @@ -110,7 +537,7 @@ GrB_Info GB_kroner // C = kron (A,B) const int64_t bvlen = B->vlen ; const int64_t bvdim = B->vdim ; const int64_t bnvec = B->nvec ; - const int64_t bnz = GB_nnz (B) ; + 
const int64_t bnz = GB_nnz (B) ; //-------------------------------------------------------------------------- // determine the number of threads to use @@ -118,57 +545,40 @@ GrB_Info GB_kroner // C = kron (A,B) double work = ((double) anz) * ((double) bnz) + (((double) anvec) * ((double) bnvec)) ; - int nthreads_max = GB_Context_nthreads_max ( ) ; double chunk = GB_Context_chunk ( ) ; int nthreads = GB_nthreads (work, chunk, nthreads_max) ; - //-------------------------------------------------------------------------- - // check if C is iso and compute its iso value if it is - //-------------------------------------------------------------------------- - - GrB_Type ctype = op->ztype ; - const size_t csize = ctype->size ; - GB_void cscalar [GB_VLA(csize)] ; - bool C_iso = GB_emult_iso (cscalar, ctype, A, B, op) ; - //-------------------------------------------------------------------------- // allocate the output matrix C //-------------------------------------------------------------------------- - // C has the same type as z for the multiply operator, z=op(x,y) - uint64_t cvlen, cvdim, cnzmax, cnvec ; - bool ok = GB_int64_multiply (&cvlen, avlen, bvlen) ; - ok = ok & GB_int64_multiply (&cvdim, avdim, bvdim) ; - ok = ok & GB_int64_multiply (&cnzmax, anz, bnz) ; - ok = ok & GB_int64_multiply (&cnvec, anvec, bnvec) ; + bool ok = GB_int64_multiply (&cvlen, avlen, bvlen) ; + ok = ok & GB_int64_multiply (&cvdim, avdim, bvdim) ; + ok = ok & GB_int64_multiply (&cnzmax, anz, bnz) ; + ok = ok & GB_int64_multiply (&cnvec, anvec, bnvec) ; ASSERT (ok) ; if (C_iso) - { - // the values of A and B are no longer needed if C is iso + { GBURBLE ("(iso kron) ") ; A_is_pattern = true ; B_is_pattern = true ; } - // C is hypersparse if either A or B are hypersparse. It is never bitmap. bool C_is_hyper = (cvdim > 1) && (Ah != NULL || Bh != NULL) ; - bool C_is_full = GB_as_if_full (A) && GB_as_if_full (B) ; - int C_sparsity = C_is_full ? GxB_FULL : - ((C_is_hyper) ? 
GxB_HYPERSPARSE : GxB_SPARSE) ; - - // determine the p_is_32, j_is_32, and i_is_32 settings for the new matrix + bool C_is_full = GB_as_if_full (A) && GB_as_if_full (B) ; + int C_sparsity = C_is_full ? GxB_FULL : + C_is_hyper ? GxB_HYPERSPARSE : GxB_SPARSE ; bool Cp_is_32, Cj_is_32, Ci_is_32 ; GB_determine_pji_is_32 (&Cp_is_32, &Cj_is_32, &Ci_is_32, C_sparsity, cnzmax, (int64_t) cvlen, (int64_t) cvdim, Werk) ; - GB_OK (GB_new_bix (&C, // full, sparse, or hyper; existing header - ctype, (int64_t) cvlen, (int64_t) cvdim, GB_ph_malloc, C_is_csc, - C_sparsity, true, B->hyper_switch, cnvec, cnzmax, true, C_iso, - Cp_is_32, Cj_is_32, Ci_is_32)) ; + GB_OK (GB_new_bix (&C, ctype, (int64_t) cvlen, (int64_t) cvdim, + GB_ph_malloc, C_is_csc, C_sparsity, true, B->hyper_switch, + cnvec, cnzmax, true, C_iso, Cp_is_32, Cj_is_32, Ci_is_32)) ; //-------------------------------------------------------------------------- // compute the column counts of C: Cp and Ch if C is hypersparse @@ -176,34 +586,25 @@ GrB_Info GB_kroner // C = kron (A,B) GB_Cp_DECLARE (Cp, ) ; GB_Cp_PTR (Cp, C) ; GB_Ch_DECLARE (Ch, ) ; GB_Ch_PTR (Ch, C) ; - #define GB_Cp_IS_32 Cp_is_32 + //#define GB_Cp_IS_32 Cp_is_32 if (!C_is_full) - { - // C is sparse or hypersparse + { int64_t kC ; #pragma omp parallel for num_threads(nthreads) schedule(static) - for (kC = 0 ; kC < cnvec ; kC++) + for (kC = 0 ; kC < (int64_t) cnvec ; kC++) { const int64_t kA = kC / bnvec ; const int64_t kB = kC % bnvec ; - // get A(:,jA), the (kA)th vector of A const int64_t jA = GBh_A (Ah, kA) ; const int64_t aknz = (Ap == NULL) ? avlen : (GB_IGET (Ap, kA+1) - GB_IGET (Ap, kA)) ; - // get B(:,jB), the (kB)th vector of B const int64_t jB = GBh_B (Bh, kB) ; const int64_t bknz = (Bp == NULL) ? 
bvlen : (GB_IGET (Bp, kB+1) - GB_IGET (Bp, kB)) ; - // determine # entries in C(:,jC), the (kC)th vector of C - // int64_t kC = kA * bnvec + kB ; - // Cp [kC] = aknz * bknz ; GB_ISET (Cp, kC, aknz * bknz) ; if (C_is_hyper) - { - // Ch [kC] = jA * bvdim + jB ; GB_ISET (Ch, kC, jA * bvdim + jB) ; - } } int64_t nvec_nonempty ; @@ -216,19 +617,17 @@ GrB_Info GB_kroner // C = kron (A,B) C->magic = GB_MAGIC ; //-------------------------------------------------------------------------- - // C = kron (A,B) where C is iso and/or full full + // C = kron (A,B) where C is iso and/or full //-------------------------------------------------------------------------- if (C_iso) - { - // C->x [0] = cscalar = op (A,B) + { memcpy (C->x, cscalar, csize) ; if (C_is_full) - { - // no more work to do if C is iso and full + { ASSERT_MATRIX_OK (C, "C=kron(A,B), iso full", GB0) ; GB_FREE_WORKSPACE ; - return (GrB_SUCCESS) ; + return GrB_SUCCESS ; } } @@ -238,93 +637,22 @@ GrB_Info GB_kroner // C = kron (A,B) int64_t cnz = GB_nnz (C) ; if (cnz == 0) - { + { GB_FREE_WORKSPACE ; - return (GrB_SUCCESS) ; + return GrB_SUCCESS ; } //-------------------------------------------------------------------------- - // C = kron (A,B) + // evaluate: JIT or generic //-------------------------------------------------------------------------- - // via the JIT kernel - info = GB_kroner_jit (C, op, flipij, A, B, nthreads) ; - - if (info == GrB_NO_VALUE) - { - // via the generic kernel - #define GB_A_TYPE GB_void - #define GB_B_TYPE GB_void - #define GB_C_TYPE GB_void - #define GB_A_ISO A_iso - #define GB_B_ISO B_iso - #define GB_C_ISO C_iso - const bool A_iso = A->iso ; - const bool B_iso = B->iso ; - const int64_t asize = A->type->size ; - const int64_t bsize = B->type->size ; - - GxB_binary_function fmult = op->binop_function ; - GxB_index_binary_function fmult_idx = op->idxbinop_function ; - const void *theta = op->theta ; - GB_cast_function cast_A = NULL, cast_B = NULL ; - if (!A_is_pattern) - { - cast_A = 
GB_cast_factory (op->xtype->code, A->type->code) ; - } - if (!B_is_pattern) - { - cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; - } - - #define GB_C_IS_FULL C_is_full - - #define GB_DECLAREA(a) GB_void a [GB_VLA(asize)] - #define GB_DECLAREB(b) GB_void b [GB_VLA(bsize)] - - #define GB_GETA(a,Ax,p,iso) \ - { \ - if (!A_is_pattern) \ - { \ - cast_A (a, Ax + (p)*asize, asize) ; \ - } \ - } - - #define GB_GETB(b,Bx,p,iso) \ - { \ - if (!B_is_pattern) \ - { \ - cast_B (b, Bx + (p)*bsize, bsize) ; \ - } \ - } - - #define GB_KRONECKER_OP(Cx,pC,a,ix,jx,b,iy,jy) \ - { \ - if (fmult != NULL) \ - { \ - /* standard binary operator */ \ - fmult (Cx +(pC)*csize, a, b) ; \ - } \ - else \ - { \ - /* index binary operator */ \ - if (flipij) \ - { \ - fmult_idx (Cx +(pC)*csize, \ - a, jx, ix, b, jy, iy, theta) ; \ - } \ - else \ - { \ - fmult_idx (Cx +(pC)*csize, \ - a, ix, jx, b, iy, jy, theta) ; \ - } \ - } \ - } + info = GB_kroner_jit (C, op, flipij, A, A_transpose, B, B_transpose, Mask, Mask_struct, Mask_comp, nthreads) ; - #define GB_GENERIC - #include "ewise/include/GB_ewise_shared_definitions.h" - #include "kronecker/template/GB_kroner_template.c" - info = GrB_SUCCESS ; + if (info != GrB_SUCCESS) + { + info = GB_kroner_generic (C, op, flipij, A, B, + A_is_pattern, B_is_pattern, A_transpose, B_transpose, + Mask, Mask_comp, Mask_struct, C_is_full, nthreads) ; } //-------------------------------------------------------------------------- @@ -332,16 +660,11 @@ GrB_Info GB_kroner // C = kron (A,B) //-------------------------------------------------------------------------- if (info == GrB_SUCCESS) - { + { GB_OK (GB_hyper_prune (C, Werk)) ; ASSERT_MATRIX_OK (C, "C=kron(A,B)", GB0) ; } - //-------------------------------------------------------------------------- - // return result - //-------------------------------------------------------------------------- - GB_FREE_WORKSPACE ; - return (info) ; + return info ; } - diff --git 
a/Source/kronecker/template/GB_kroner_template.c b/Source/kronecker/template/GB_kroner_template.c index bdc76990c0..c306df0058 100644 --- a/Source/kronecker/template/GB_kroner_template.c +++ b/Source/kronecker/template/GB_kroner_template.c @@ -47,135 +47,334 @@ // C = kron (A,B) //-------------------------------------------------------------------------- - int tid ; - #pragma omp parallel for num_threads(nthreads) schedule(static) - for (tid = 0 ; tid < nthreads ; tid++) + if (!GB_NO_MASK && !GB_MASK_COMP) { + #define GB_GETVECTOR(A, row, col) GrB_Index vector_ = (A)->is_csc ? (col) : (row) + + #define GB_GETCOORD(A, row, col) GrB_Index coord_ = (A)->is_csc ? (row) : (col) + + #define GB_LOOKUP_XOFFSET_BITMAP_FULL(offset, A) \ + { \ + GrB_Index offset_ = vector_ * (A)->vlen + \ + coord_ ; \ + if ((A)->b == NULL || \ + (((int8_t *) (A)->b)[offset_])) \ + { \ + offset = (A)->iso ? 0 : offset_ ; \ + } \ + else \ + { \ + offset = -1 ; \ + } \ + } - //---------------------------------------------------------------------- - // get the iso values of A and B - //---------------------------------------------------------------------- + #define GB_LOOKUP_XOFFSET_SPARSE(offset, A) \ + { \ + start_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [vector_] : \ + ((int64_t *)(A)->p) [vector_] ; \ + end_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [vector_ + 1] : \ + ((int64_t *)(A)->p) [vector_ + 1] ; \ + end_-- ; \ + if (start_ > end_) \ + { \ + offset = -1 ; \ + } \ + else \ + { \ + if (GB_binary_search (coord_, (A)->i, \ + (A)->i_is_32, &start_, &end_)) \ + { \ + offset = (A)->iso ? 
0 : start_ ; \ + } \ + else \ + { \ + offset = - 1 ; \ + } \ + } \ + } - GB_DECLAREA (a) ; - if (GB_A_ISO) - { - GB_GETA (a, Ax, 0, true) ; + #define GB_LOOKUP_XOFFSET_HYPER(offset, A) \ + { \ + start_ = 0 ; \ + end_ = (A)->plen - 1 ; \ + if (!GB_binary_search (vector_, (A)->h, \ + (A)->j_is_32, &start_, &end_)) \ + { \ + offset = - 1; \ + } \ + else \ + { \ + int64_t k_ = start_ ; \ + start_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [k_] : \ + ((int64_t *)(A)->p) [k_] ; \ + end_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [k_ + 1] : \ + ((int64_t *)(A)->p) [k_ + 1] ; \ + end_-- ; \ + if (start_ > end_) \ + { \ + offset = -1 ; \ + } \ + else \ + { \ + if (GB_binary_search (coord_, (A)->i, \ + (A)->i_is_32, &start_, &end_)) \ + { \ + offset = (A)->iso ? 0 : start_ ; \ + } \ + else \ + { \ + offset = -1 ; \ + } \ + } \ + } \ } - GB_DECLAREB (b) ; - if (GB_B_ISO) - { - GB_GETB (b, Bx, 0, true) ; + + #define GB_LOOKUP_XOFFSET(offset, A, row, col) \ + { \ + GB_GETVECTOR(A, row, col) ; \ + GB_GETCOORD(A, row, col) ; \ + if ((A)->p == NULL) \ + { \ + GB_LOOKUP_XOFFSET_BITMAP_FULL (offset, A) ; \ + } \ + else \ + { \ + int64_t start_, end_ ; \ + if ((A)->h == NULL) \ + { \ + GB_LOOKUP_XOFFSET_SPARSE(offset, A) ; \ + } \ + else \ + { \ + GB_LOOKUP_XOFFSET_HYPER(offset, A) ; \ + } \ + } \ } - //---------------------------------------------------------------------- - // construct the task to compute Ci,Cx [pC:pC_end-1] - //---------------------------------------------------------------------- + #define GB_NROWS(A) ((A)->is_csc ? (A)->vlen : (A)->vdim) + #define GB_NCOLS(A) ((A)->is_csc ? 
(A)->vdim : (A)->vlen) + - int64_t pC, pC_end ; - GB_PARTITION (pC, pC_end, cnz, tid, nthreads) ; + GB_Mp_DECLARE(Mp, ) ; + GB_Mp_PTR(Mp, Mask) ; - // find where this task starts in C - int64_t kC_task = GB_search_for_vector (Cp, GB_Cp_IS_32, pC, 0, cnvec, - cvlen) ; - int64_t pC_delta = pC - GBp_C (Cp, kC_task, cvlen) ; + GB_Mh_DECLARE(Mh, ) ; + GB_Mh_PTR(Mh, Mask) ; - //---------------------------------------------------------------------- - // compute C(:,kC) for all vectors kC in this task - //---------------------------------------------------------------------- + GB_Mi_DECLARE(Mi, ) ; + GB_Mi_PTR(Mi, Mask) ; - for (int64_t kC = kC_task ; kC < cnvec && pC < pC_end ; kC++) + + int64_t bnrows = (B_transpose) ? GB_NCOLS (B) : GB_NROWS (B) ; + int64_t bncols = (B_transpose) ? GB_NROWS (B) : GB_NCOLS (B) ; + + #pragma omp parallel num_threads(nthreads) { + GrB_Index offset = -1 ; + GB_DECLAREA (a_elem) ; + GB_DECLAREB (b_elem) ; + int64_t vlen = Mask->vlen ; - //------------------------------------------------------------------ - // get the vectors C(:,jC), A(:,jA), and B(:,jB) - //------------------------------------------------------------------ - - // C(:,jC) = kron (A(:,jA), B(:,jB), the (kC)th vector of C, - // where jC = GBh_C (Ch, kC) - int64_t kA = kC / bnvec ; - int64_t kB = kC % bnvec ; - - // get A(:,jA), the (kA)th vector of A - int64_t jA = GBh_A (Ah, kA) ; - int64_t pA_start = GBp_A (Ap, kA, avlen) ; - int64_t pA_end = GBp_A (Ap, kA+1, avlen) ; - - // get B(:,jB), the (kB)th vector of B - int64_t jB = GBh_B (Bh, kB) ; - int64_t pB_start = GBp_B (Bp, kB, bvlen) ; - int64_t pB_end = GBp_B (Bp, kB+1, bvlen) ; - int64_t bknz = pB_end - pB_start ; - - // shift into the middle of A(:,jA) and B(:,jB) for the first - // vector of C for this task. 
- int64_t pA_delta = 0 ; - int64_t pB_delta = 0 ; - if (kC == kC_task && bknz > 0) + #pragma omp for schedule(static) + for (GrB_Index k = 0 ; k < Mask->nvec ; k++) + { + GrB_Index j = GBh_M (Mh, k) ; + + int64_t pA_start = GBp_M (Mp, k, vlen) ; + int64_t pA_end = GBp_M (Mp, k+1, vlen) ; + GrB_Index pos = Mask->p_is_32 ? ((int32_t *)C->p)[j] : ((int64_t *)C->p)[j] ; + for (GrB_Index p = pA_start ; p < pA_end ; p++) + { + if (!GBb_M (Mask->b, p)) continue ; + + int64_t i = GBi_M (Mi, p, vlen) ; + GrB_Index Mrow = Mask->is_csc ? i : j ; GrB_Index Mcol = Mask->is_csc ? j : i ; + + // extract elements from A and B, + // initialize offset in MTi and MTx, + // get result of op, place it in MTx + + if (GB_MASK_STRUCT || (Mask->iso ? ((bool *)Mask->x)[0] : ((bool *)Mask->x)[p])) + { + GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); + GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); + + GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); + GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); + + GB_LOOKUP_XOFFSET (offset, A, arow, acol) ; + if (offset == -1) + { + continue; + } + // iso of A or of C (?) + GB_GETA (a_elem, Ax, offset, GB_A_ISO) ; + + GB_LOOKUP_XOFFSET (offset, B, brow, bcol) ; + if (offset == -1) + { + continue; + } + GB_GETB (b_elem, Bx, offset, GB_B_ISO) ; + + GrB_Index ix, jx, iy, jy ; + ix = A_transpose ? acol : arow ; + jx = A_transpose ? arow : acol ; + iy = B_transpose ? bcol : brow ; + jy = B_transpose ? 
brow : bcol ; + + if (Mask->i_is_32) + { + ((int32_t *)C->i)[pos] = i ; + } + else + { + ((int64_t *)C->i)[pos] = i ; + } + + GB_KRONECKER_OP (Cx, pos, a_elem, ix, jx, b_elem, iy, jy) ; + + pos++ ; + } + } + } + } + } + else + { + int tid ; + #pragma omp parallel for num_threads(nthreads) schedule(static) + for (tid = 0 ; tid < nthreads ; tid++) + { + + //---------------------------------------------------------------------- + // get the iso values of A and B + //---------------------------------------------------------------------- + + GB_DECLAREA (a) ; + if (GB_A_ISO) + { + GB_GETA (a, Ax, 0, true) ; + } + GB_DECLAREB (b) ; + if (GB_B_ISO) { - pA_delta = pC_delta / bknz ; - pB_delta = pC_delta % bknz ; + GB_GETB (b, Bx, 0, true) ; } - //------------------------------------------------------------------ - // for all entries in A(:,jA), skipping entries for first vector - //------------------------------------------------------------------ + //---------------------------------------------------------------------- + // construct the task to compute Ci,Cx [pC:pC_end-1] + //---------------------------------------------------------------------- - int64_t pA = pA_start + pA_delta ; - pA_delta = 0 ; - for ( ; pA < pA_end && pC < pC_end ; pA++) - { + int64_t pC, pC_end ; + GB_PARTITION (pC, pC_end, cnz, tid, nthreads) ; + + // find where this task starts in C + int64_t kC_task = GB_search_for_vector (Cp, GB_Cp_IS_32, pC, 0, cnvec, + cvlen) ; + int64_t pC_delta = pC - GBp_C (Cp, kC_task, cvlen) ; - //-------------------------------------------------------------- - // a = A(iA,jA), typecasted to op->xtype - //-------------------------------------------------------------- + //---------------------------------------------------------------------- + // compute C(:,kC) for all vectors kC in this task + //---------------------------------------------------------------------- - int64_t iA = GBi_A (Ai, pA, avlen) ; - int64_t iAblock = iA * bvlen ; - if (!GB_A_ISO) + for (int64_t 
kC = kC_task ; kC < cnvec && pC < pC_end ; kC++) + { + + //------------------------------------------------------------------ + // get the vectors C(:,jC), A(:,jA), and B(:,jB) + //------------------------------------------------------------------ + + // C(:,jC) = kron (A(:,jA), B(:,jB), the (kC)th vector of C, + // where jC = GBh_C (Ch, kC) + int64_t kA = kC / bnvec ; + int64_t kB = kC % bnvec ; + + // get A(:,jA), the (kA)th vector of A + int64_t jA = GBh_A (Ah, kA) ; + int64_t pA_start = GBp_A (Ap, kA, avlen) ; + int64_t pA_end = GBp_A (Ap, kA+1, avlen) ; + + // get B(:,jB), the (kB)th vector of B + int64_t jB = GBh_B (Bh, kB) ; + int64_t pB_start = GBp_B (Bp, kB, bvlen) ; + int64_t pB_end = GBp_B (Bp, kB+1, bvlen) ; + int64_t bknz = pB_end - pB_start ; + + // shift into the middle of A(:,jA) and B(:,jB) for the first + // vector of C for this task. + int64_t pA_delta = 0 ; + int64_t pB_delta = 0 ; + if (kC == kC_task && bknz > 0) { - GB_GETA (a, Ax, pA, false) ; + pA_delta = pC_delta / bknz ; + pB_delta = pC_delta % bknz ; } - //-------------------------------------------------------------- - // for all entries in B(:,jB), skipping entries for 1st vector - //-------------------------------------------------------------- + //------------------------------------------------------------------ + // for all entries in A(:,jA), skipping entries for first vector + //------------------------------------------------------------------ - // scan B(:,jB), skipping to the first entry of C if this is - // the first time B is accessed in this task - int64_t pB = pB_start + pB_delta ; - pB_delta = 0 ; - for ( ; pB < pB_end && pC < pC_end ; pB++) - { + int64_t pA = pA_start + pA_delta ; + pA_delta = 0 ; + for ( ; pA < pA_end && pC < pC_end ; pA++) + { - //---------------------------------------------------------- - // b = B(iB,jB), typecasted to op->ytype - //---------------------------------------------------------- + 
//-------------------------------------------------------------- + // a = A(iA,jA), typecasted to op->xtype + //-------------------------------------------------------------- - int64_t iB = GBi_B (Bi, pB, bvlen) ; - if (!GB_B_ISO) + int64_t iA = GBi_A (Ai, pA, avlen) ; + int64_t iAblock = iA * bvlen ; + if (!GB_A_ISO) { - GB_GETB (b, Bx, pB, false) ; + GB_GETA (a, Ax, pA, false) ; } - //---------------------------------------------------------- - // C(iC,jC) = A(iA,jA) * B(iB,jB) - //---------------------------------------------------------- + //-------------------------------------------------------------- + // for all entries in B(:,jB), skipping entries for 1st vector + //-------------------------------------------------------------- - if (!GB_C_IS_FULL) - { - // save the row index iC - // Ci [pC] = iAblock + iB ; - GB_ISET (Ci, pC, iAblock + iB) ; - } - // Cx [pC] = op (a, b) - if (!GB_C_ISO) + // scan B(:,jB), skipping to the first entry of C if this is + // the first time B is accessed in this task + int64_t pB = pB_start + pB_delta ; + pB_delta = 0 ; + for ( ; pB < pB_end && pC < pC_end ; pB++) { - GB_KRONECKER_OP (Cx, pC, a, iA, jA, b, iB, jB) ; + + //---------------------------------------------------------- + // b = B(iB,jB), typecasted to op->ytype + //---------------------------------------------------------- + + int64_t iB = GBi_B (Bi, pB, bvlen) ; + if (!GB_B_ISO) + { + GB_GETB (b, Bx, pB, false) ; + } + + //---------------------------------------------------------- + // C(iC,jC) = A(iA,jA) * B(iB,jB) + //---------------------------------------------------------- + + if (!GB_C_IS_FULL) + { + // save the row index iC + // Ci [pC] = iAblock + iB ; + GB_ISET (Ci, pC, iAblock + iB) ; + } + // Cx [pC] = op (a, b) + if (!GB_C_ISO) + { + GB_KRONECKER_OP (Cx, pC, a, iA, jA, b, iB, jB) ; + } + pC++ ; } - pC++ ; } } } } } - diff --git a/Source/mask/GB_mask.h b/Source/mask/GB_mask.h index a5286fc665..0c2b5b113f 100644 --- a/Source/mask/GB_mask.h +++ 
b/Source/mask/GB_mask.h @@ -54,6 +54,7 @@ GrB_Info GB_masker_phase1 // count nnz in each R(:,j) const int64_t *restrict R_to_Z, const bool Rp_is_32, const bool Rj_is_32, + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -85,7 +86,7 @@ GrB_Info GB_masker_phase2 // phase2 for R = masker (C,M,Z) const bool Rp_is_32, const bool Rj_is_32, const bool Ri_is_32, - const int R_sparsity, + const int R_sparsity, // any sparsity format // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented diff --git a/Source/mask/GB_masker.c b/Source/mask/GB_masker.c index 8fe8cc2d79..eb1eaf546f 100644 --- a/Source/mask/GB_masker.c +++ b/Source/mask/GB_masker.c @@ -208,7 +208,7 @@ GrB_Info GB_masker // R = masker (C, M, Z) // from phase1a: TaskList, R_ntasks, R_nthreads, // from phase0: - Rnvec, Rh, R_to_M, R_to_C, R_to_Z, Rp_is_32, Rj_is_32, + Rnvec, Rh, R_to_M, R_to_C, R_to_Z, Rp_is_32, Rj_is_32, R_sparsity, // original input: M, Mask_comp, Mask_struct, C, Z, Werk) ; if (info != GrB_SUCCESS) diff --git a/Source/mask/GB_masker_phase1.c b/Source/mask/GB_masker_phase1.c index 5e23d999b5..c7c6a0eaf9 100644 --- a/Source/mask/GB_masker_phase1.c +++ b/Source/mask/GB_masker_phase1.c @@ -43,6 +43,7 @@ GrB_Info GB_masker_phase1 // count nnz in each R(:,j) const int64_t *restrict R_to_Z, const bool Rp_is_32, const bool Rj_is_32, + const int R_sparsity, // GxB_SPARSE or GxB_HYPERSPARSE // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -60,6 +61,7 @@ GrB_Info GB_masker_phase1 // count nnz in each R(:,j) ASSERT (Rp_handle != NULL) ; ASSERT (Rp_size_handle != NULL) ; ASSERT (Rnvec_nonempty != NULL) ; + ASSERT (R_sparsity == GxB_SPARSE || R_sparsity == GxB_HYPERSPARSE) ; ASSERT_MATRIX_OK (M, "M for mask phase1", GB0) ; ASSERT (!GB_ZOMBIES (M)) ; @@ -109,7 +111,7 
@@ GrB_Info GB_masker_phase1 // count nnz in each R(:,j) R_ntasks, // # of tasks R_nthreads, // # of threads to use // analysis from phase0: - Rnvec, Rh, R_to_M, R_to_C, R_to_Z, Rp_is_32, Rj_is_32, + Rnvec, Rh, R_to_M, R_to_C, R_to_Z, Rp_is_32, Rj_is_32, R_sparsity, // original input: M, Mask_comp, Mask_struct, C, Z) ; diff --git a/Source/mask/GB_masker_phase2.c b/Source/mask/GB_masker_phase2.c index f7b2e4b93e..52a01aaa6a 100644 --- a/Source/mask/GB_masker_phase2.c +++ b/Source/mask/GB_masker_phase2.c @@ -65,7 +65,7 @@ GrB_Info GB_masker_phase2 // phase2 for R = masker (C,M,Z) const bool Rp_is_32, const bool Rj_is_32, const bool Ri_is_32, - const int R_sparsity, + const int R_sparsity, // any sparsity format // original input: const GrB_Matrix M, // required mask const bool Mask_comp, // if true, then M is complemented @@ -243,7 +243,8 @@ GrB_Info GB_masker_phase2 // phase2 for R = masker (C,M,Z) //---------------------------------------------------------------------- info = GB_masker_phase2_jit (R, TaskList, R_ntasks, R_nthreads, - R_to_M, R_to_C, R_to_Z, M, Mask_comp, Mask_struct, C, Z, + R_to_M, R_to_C, R_to_Z, R_sparsity, + M, Mask_comp, Mask_struct, C, Z, C_ek_slicing, C_ntasks, C_nthreads, M_ek_slicing, M_ntasks, M_nthreads) ; diff --git a/Test/test226.m b/Test/test226.m index 41f77fd3d3..4778425a8a 100644 --- a/Test/test226.m +++ b/Test/test226.m @@ -9,6 +9,8 @@ A.matrix = sprand (5, 10, 0.4) ; B.matrix = ones (3, 2) ; B.iso = true ; +M.matrix = sprandn (15, 20,0.2) ~= 0 ; +MT.matrix = sprandn (9, 4, 0.2) ~= 0 ; mult.opname = 'times' ; mult.optype = 'double' ; @@ -18,14 +20,26 @@ C2 = GB_spec_kron (Cin, [ ], [ ], mult, A, B, [ ]) ; GB_spec_compare (C1, C2) ; +C1 = GB_mex_kron (Cin, M, [ ], mult, A, B, [ ]) ; +C2 = GB_spec_kron (Cin, M, [ ], mult, A, B, [ ]) ; +GB_spec_compare (C1, C2) ; + C1 = GB_mex_kron (Cin, [ ], [ ], mult, B, A, [ ]) ; C2 = GB_spec_kron (Cin, [ ], [ ], mult, B, A, [ ]) ; GB_spec_compare (C1, C2) ; +C1 = GB_mex_kron (Cin, M, [ ], 
mult, B, A, [ ]) ; +C2 = GB_spec_kron (Cin, M, [ ], mult, B, A, [ ]) ; +GB_spec_compare (C1, C2) ; + Cin = sparse (9, 4) ; C1 = GB_mex_kron (Cin, [ ], [ ], mult, B, B, [ ]) ; C2 = GB_spec_kron (Cin, [ ], [ ], mult, B, B, [ ]) ; GB_spec_compare (C1, C2) ; +C1 = GB_mex_kron (Cin, MT, [ ], mult, B, B, [ ]) ; +C2 = GB_spec_kron (Cin, MT, [ ], mult, B, B, [ ]) ; +GB_spec_compare (C1, C2) ; + fprintf ('\ntest226: all tests passed\n') ; diff --git a/Test/test227.m b/Test/test227.m index 98e75da2e2..50dd0153d1 100644 --- a/Test/test227.m +++ b/Test/test227.m @@ -16,6 +16,16 @@ dnt = struct ( 'inp1', 'tran' ) ; dtt = struct ( 'inp0', 'tran', 'inp1', 'tran' ) ; +dnn.mask = 'default' ; +dtn.mask = 'default' ; +dnt.mask = 'default' ; +dtt.mask = 'default' ; + +dnn.outp = 'default' ; +dtn.outp = 'default' ; +dnt.outp = 'default' ; +dtt.outp = 'default' ; + types = { 'int32', 'int64', 'single', 'double' } ; am = 5 ; @@ -23,13 +33,18 @@ bm = 4 ; bn = 2 ; -Ax = sparse (100 * sprandn (am,an, 0.5)) ; -Bx = sparse (100 * sprandn (bm,bn, 0.5)) ; +Ax_temp = 100 * sprandn (am, an, 0.5); +Bx_temp = 100 * sprandn (bm, bn, 0.5); + +Ax = sparse(round(Ax_temp)); +Bx = sparse(round(Bx_temp)); + cm = am * bm ; cn = an * bn ; Cx = sparse (cm,cn) ; -AT = Ax' ; -BT = Bx' ; +Maskmat = sprandn (cm,cn,0.2) ~= 0 ; +ATmat = Ax' ; +BTmat = Bx' ; for k2 = [4 7 45:52 ] for k1 = 1:4 @@ -64,9 +79,19 @@ B.is_csc = B_is_csc ; clear C - C.matrix = Cx ; + C.matrix = sparse (cm,cn) ; C.is_csc = C_is_csc ; + clear AT + AT.matrix = ATmat ; + AT.is_hyper = A_is_hyper ; + AT.is_csc = A.is_csc ; + + clear BT + BT.matrix = BTmat ; + BT.is_hyper = B_is_hyper ; + BT.is_csc = B.is_csc ; + %--------------------------------------- % kron(A,B) %--------------------------------------- @@ -103,6 +128,44 @@ C1 = GB_mex_kron (C, [ ], [ ], op, AT, BT, dtt) ; GB_spec_compare (C0, C1) ; + % tests with Mask + for Mask_is_hyper = 0:1 + for Mask_is_csc = 0:1 + fprintf('*') + + A.is_csc = A_is_csc ; + B.is_csc = B_is_csc ; + 
A.is_hyper = A_is_hyper ; + B.is_hyper = B_is_hyper ; + + clear M + M.matrix = Maskmat ; + M.is_hyper = Mask_is_hyper ; + M.is_csc = Mask_is_csc; + C.is_csc = C_is_csc ; + + % kron(A, B) with Mask + C0 = GB_spec_kron (C, M, [ ], op, A, B, dnn) ; + fprintf('#') ; + C1 = GB_mex_kron (C, M, [ ], op, A, B, dnn) ; + GB_spec_compare(C0, C1) ; + + % kron(A', B) with Mask + C0 = GB_spec_kron (C, M, [ ], op, AT, B, dtn) ; + C1 = GB_mex_kron (C, M, [ ], op, AT, B, dtn) ; + GB_spec_compare (C0, C1) ; + + % kron(A, B') with Mask + C0 = GB_spec_kron (C, M, [ ], op, A, BT, dnt) ; + C1 = GB_mex_kron (C, M, [ ], op, A, BT, dnt) ; + GB_spec_compare (C0, C1) ; + + % kron(A', B') with Mask + C0 = GB_spec_kron (C, M, [ ], op, AT, BT, dtt) ; + C1 = GB_mex_kron (C, M, [ ], op, AT, BT, dtt) ; + GB_spec_compare (C0, C1) ; + end + end end end end diff --git a/cmake_modules/GraphBLAS_version.cmake b/cmake_modules/GraphBLAS_version.cmake index a9208bb857..b2be1d004c 100644 --- a/cmake_modules/GraphBLAS_version.cmake +++ b/cmake_modules/GraphBLAS_version.cmake @@ -8,10 +8,10 @@ #------------------------------------------------------------------------------- # version of SuiteSparse:GraphBLAS -set ( GraphBLAS_DATE "Dec 3, 2025" ) +set ( GraphBLAS_DATE "Jan 21, 2026" ) set ( GraphBLAS_VERSION_MAJOR 10 CACHE STRING "" FORCE ) set ( GraphBLAS_VERSION_MINOR 3 CACHE STRING "" FORCE ) -set ( GraphBLAS_VERSION_SUB 0 CACHE STRING "" FORCE ) +set ( GraphBLAS_VERSION_SUB 1 CACHE STRING "" FORCE ) # GraphBLAS C API Specification version, at graphblas.org set ( GraphBLAS_API_DATE "Dec 22, 2023" )