I am looking to accelerate a binary erosion image processing function with Cython, although I am new to Cython. I am not seeing the drastic speedups I was expecting. I am looking for help to optimize this code as I am still not familiar with how C types, indexing, memory views, and objects can be utilized to enhance performance. Below is the source code and output of the Cython function, python function using SciPy module, setup.py, and jupyter notebook.
Cython code erode.pyx
import numpy as np
cimport numpy as np
DTYPE = np.int_
ctypedef np.int_t DTYPE_t
def erode(long [:,:] img):
# Variables
cdef int height, width, local_min
cdef int vals[5]
height = img.shape[0]
width = img.shape[1]
# Padded Array
cdef np.ndarray[DTYPE_t, ndim=2] padded = np.zeros((height+2, width+2), dtype = DTYPE)
padded[1:height+1,1:width+1] = img
#Return array
cdef np.ndarray[DTYPE_t, ndim=2] eroded = np.zeros((height,width),dtype=DTYPE)
cdef int i,j
for i in range(height):
for j in range(width):
vals = [padded[i+1,j+1], padded[i,j+1], padded[i+1,j],padded[i+1,j+2],padded[i+2,j+1]]
local_min = min(vals)
eroded[i,j] = local_min
return eroded
Python code erode_py.py
import numpy as np
from scipy.ndimage import binary_erosion
def erode_py(img):
strel = np.array([[0, 1, 0],
[1, 1, 1],
[0, 1, 0]], dtype=np.uint8)
img = img.astype(np.uint8)
eroded_image = binary_erosion(img, strel, border_value=0)
return eroded_image
setup.py
from distutils.core import setup
from Cython.Build import cythonize
import numpy
setup(
name='binary_erode_build',
ext_modules=cythonize("erode.pyx"),
include_dirs=[numpy.get_include()]
)
Jupyter notebook
import numpy as np
import erode
import erode_py
obj = np.array([[0, 0, 0, 1, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 1, 1, 0],
[0, 0, 0, 1, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 1, 1, 0],
[0, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 0],
[0, 1, 1, 1, 1, 1, 0, 0]], dtype=np.int_)
%timeit -n100 -r100 erode.erode(obj)
%timeit -n100 -r100 erode_py.erode_py(obj)
42.8 µs ± 10.3 µs per loop (mean ± std. dev. of 100 runs, 100 loops each)
44.2 µs ± 14.4 µs per loop (mean ± std. dev. of 100 runs, 100 loops each)