I am adding my own answer along with my code to help anyone else who may be looking to do this.
From a combination of pointers and using an approximation of Sine and Cosine instead of calling an outside function for the rotation, I have come pretty darn close to reaching GDI speeds. No outside functions are called at all.
It still takes about 50% more time than GDI, but my earlier implementation took over 10 times longer than GDI. And when you consider multi threading, this method can be 10 times faster than GDI. This function can rotate a 300x400 picture in 3 milliseconds on my machine.
Keep in mind that this is for grayscale images and each byte in the input array represents one pixel.
If you have any ideas to make it faster please share!
private unsafe byte[] rotate(byte[] input, int inputWidth, int inputHeight, int cx, int cy, double angle)
{
byte[] result = new byte[input.Length];
int
tx, ty, ix, iy, x1, y1;
double
px, py, fx, fy, sin, cos, v;
byte a, b;
//Approximate Sine and Cosine of the angle
if (angle < 0)
sin = 1.27323954 * angle + 0.405284735 * angle * angle;
else
sin = 1.27323954 * angle - 0.405284735 * angle * angle;
angle += 1.57079632;
if (angle > 3.14159265)
angle -= 6.28318531;
if (angle < 0)
cos = 1.27323954 * angle + 0.405284735 * angle * angle;
else
cos = 1.27323954 * angle - 0.405284735 * angle * angle;
angle -= 1.57079632;
fixed (byte* pInput = input, pResult = result)
{
byte* pi = pInput;
byte* pr = pResult;
for (int x = 0; x < inputWidth; x++)
for (int y = 0; y < inputHeight; y++)
{
tx = x - cx;
ty = y - cy;
px = tx * cos - ty * sin + cx;
py = tx * sin + ty * cos + cy;
ix = (int)px;
iy = (int)py;
fx = px - ix;
fy = py - iy;
if (ix < inputWidth && iy < inputHeight && ix >= 0 && iy >= 0)
{
//keep in array bounds
x1 = ix + 1;
y1 = iy + 1;
if (x1 >= inputWidth)
x1 = ix;
if (y1 >= inputHeight)
y1 = iy;
//bilinear interpolation using pointers
a = *(pInput + (iy * inputWidth + ix));
b = *(pInput + (y1 * inputWidth + ix));
v = a + ((*(pInput + (iy * inputWidth + x1)) - a) * fx);
pr = (pResult + (y * inputWidth + x));
*pr = (byte)(v + (((b + ((*(pInput + (y1 * inputWidth + x1)) - b) * fx)) - v) * fy));
}
}
}
return result;
}