I am adding my own answer along with my code to help someone else who can do this.
From a combination of pointers and using an approximation of sine and cosine instead of calling an external function to rotate, I came pretty close to achieving GDI speed. External functions are not called at all.
It's still 50% longer than GDI, but my earlier implementation took more than 10 times longer than GDI. And when you consider multithreading, this method can be 10 times faster than GDI. This function can rotate a 300x400 image in 3 milliseconds on my machine.
Keep in mind that this is for grayscale images, and each byte in the input array represents one pixel. If you have ideas to make it faster, please share!
private unsafe byte[] rotate(byte[] input, int inputWidth, int inputHeight, int cx, int cy, double angle) { byte[] result = new byte[input.Length]; int tx, ty, ix, iy, x1, y1; double px, py, fx, fy, sin, cos, v; byte a, b; //Approximate Sine and Cosine of the angle if (angle < 0) sin = 1.27323954 * angle + 0.405284735 * angle * angle; else sin = 1.27323954 * angle - 0.405284735 * angle * angle; angle += 1.57079632; if (angle > 3.14159265) angle -= 6.28318531; if (angle < 0) cos = 1.27323954 * angle + 0.405284735 * angle * angle; else cos = 1.27323954 * angle - 0.405284735 * angle * angle; angle -= 1.57079632; fixed (byte* pInput = input, pResult = result) { byte* pi = pInput; byte* pr = pResult; for (int x = 0; x < inputWidth; x++) for (int y = 0; y < inputHeight; y++) { tx = x - cx; ty = y - cy; px = tx * cos - ty * sin + cx; py = tx * sin + ty * cos + cy; ix = (int)px; iy = (int)py; fx = px - ix; fy = py - iy; if (ix < inputWidth && iy < inputHeight && ix >= 0 && iy >= 0) { //keep in array bounds x1 = ix + 1; y1 = iy + 1; if (x1 >= inputWidth) x1 = ix; if (y1 >= inputHeight) y1 = iy; //bilinear interpolation using pointers a = *(pInput + (iy * inputWidth + ix)); b = *(pInput + (y1 * inputWidth + ix)); v = a + ((*(pInput + (iy * inputWidth + x1)) - a) * fx); pr = (pResult + (y * inputWidth + x)); *pr = (byte)(v + (((b + ((*(pInput + (y1 * inputWidth + x1)) - b) * fx)) - v) * fy)); } } } return result; }
source share