# Import the NumPy library
import numpy as np

# Block-sum: reduce a 2-D array to the sums of its non-overlapping k x k tiles.
Z = np.ones((16, 16))
k = 4

# 4-D view indexed as [block_row, block_col, row_in_block, col_in_block].
# Floor division means any trailing rows/columns that do not fill a whole
# block are ignored.
block_shape = (Z.shape[0] // k, Z.shape[1] // k, k, k)

# Derive the byte strides from Z's own strides instead of assuming a
# C-contiguous layout (itemsize * row length): stepping one block is k steps
# along the corresponding axis of Z, stepping inside a block is one step.
row_stride, col_stride = Z.strides
block_strides = np.array([row_stride * k, col_stride * k, row_stride, col_stride])

# The view is only read from, so mark it read-only; as_strided performs no
# bounds checking and an accidental write could corrupt Z's buffer.
block_view = np.lib.stride_tricks.as_strided(
    Z, shape=block_shape, strides=block_strides, writeable=False
)

# Collapse the two within-block axes, leaving one sum per block.
S = np.sum(block_view, axis=(-2, -1))

print(S)