I am trying to implement a function that multiplies a 32-bit operand with a 256-bit operand in an ARM assembly by an ARM Cortex-a8. The problem is that my registers are running out, and I have no idea how to reduce the number of registers used. Here is my function:
typedef struct UN_256fe{ uint32_t uint32[8]; }UN_256fe; typedef struct UN_288bite{ uint32_t uint32[9]; }UN_288bite; void multiply32x256(uint32_t A, UN_256fe* B, UN_288bite* res){ asm ( "umull r3, r4, %9, %10;\n\t" "mov %0, r3; \n\t" "umull r3, r5, %9, %11;\n\t" "adds r6, r3, r4; \n\t" "mov %1, r6; \n\t" "umull r3, r4, %9, %12;\n\t" "adcs r6, r5, r3; \n\t" "mov %2, r6; \n\t" "umull r3, r5, %9, %13;\n\t" "adcs r6, r3, r4; \n\t" "mov %3, r6; \n\t" "umull r3, r4, %9, %14;\n\t" "adcs r6, r3, r5; \n\t" "mov %4, r6; \n\t" "umull r3, r5, %9, %15;\n\t" "adcs r6, r3, r4; \n\t" "mov %5, r6; \n\t" "umull r3, r4, %9, %16;\n\t" "adcs r6, r3, r5; \n\t" "mov %6, r6; \n\t" "umull r3, r5, %9, %17;\n\t" "adcs r6, r3, r4; \n\t" "mov %7, r6; \n\t" "adc r6, r5, #0 ; \n\t" "mov %8, r6; \n\t" : "=r"(res->uint32[8]), "=r"(res->uint32[7]), "=r"(res->uint32[6]), "=r"(res->uint32[5]), "=r"(res->uint32[4]), "=r"(res->uint32[3]), "=r"(res->uint32[2]), "=r"(res->uint32[1]), "=r"(res->uint32[0]) : "r"(A), "r"(B->uint32[7]), "r"(B->uint32[6]), "r"(B->uint32[5]), "r"(B->uint32[4]), "r"(B->uint32[3]), "r"(B->uint32[2]), "r"(B->uint32[1]), "r"(B->uint32[0]), "r"(temp) : "r3", "r4", "r5", "r6", "cc", "memory"); }
EDIT-1: I updated the clobber list based on the first comment, but I still get the same error