I want to optimize the following for loop with SSE/SSE2 instructions to reduce the running time of my image compression.
// Compresses the image returned by get_pixels() one 4x4-pixel block at a time.
// Each block is gathered into a contiguous 16-pixel scratch array and handed to
// compress_block(), which writes its result at the current output position.
size_t height = get_height();
size_t width = get_width();
uint8_t *src = get_pixels();
// Validate the dimensions BEFORE allocating the output buffer, so the early
// return below cannot leak it.
if (height % 4 != 0 || width % 4 != 0) {
    cerr << "Texture compression only supported for images if width and height are multiples of 4" << endl;
    return;
}
// 3 bytes per pixel in the source image.
size_t total_size = height * width * 3;
// The output pointer advances 8 bytes per 4x4 block (16 pixels * 3 bytes = 48
// input bytes), hence total_size / 6 output bytes overall.
uint8_t *dst = new uint8_t[total_size / 6];
// dst is advanced inside the loop; tmp keeps the start of the output buffer.
uint8_t *tmp = dst;
rgb_t block[16];
// Split image in 4x4 pixels zones.
// Counters are size_t to match height/width and to keep x * 3 from wrapping
// at 32 bits. After each band of 4 rows, src skips 4 full rows of bytes.
for (size_t y = 0; y < height; y += 4, src += width * 3 * 4) {
    for (size_t x = 0; x < width; x += 4, dst += 8) {
        // NOTE(review): the row arithmetic assumes sizeof(rgb_t) == 3 (no
        // padding), so that `+ width` steps exactly one image row - confirm.
        const rgb_t *row0 = reinterpret_cast<const rgb_t*>(src + x * 3);
        const rgb_t *row1 = row0 + width;
        const rgb_t *row2 = row1 + width;
        const rgb_t *row3 = row2 + width;
        // Extract 4x4 matrix of pixels from a linearized matrix (linear memory):
        // 12 bytes = 4 pixels * 3 bytes are copied from each of the four rows.
        memcpy(block, row0, 12);
        memcpy(block + 4, row1, 12);
        memcpy(block + 8, row2, 12);
        memcpy(block + 12, row3, 12);
        // Compress block and write result in dst.
        compress_block(block, dst);
    }
}
How can I read an entire row of the 4x4 block from memory into an SSE/SSE2 register when a row consists of 4 elements of 3 bytes each? The rgb_t structure has three uint8_t members.