I am using inline assembly in C. However, my code does not work properly: it always returns 0. I want to find the sum of the negative values in the array and show it. Could you please help me out?
PS: I have stepped through the code in the debugger but could not find the problem.
void func(const int* arr, size_t arr_rows, size_t arr_cols, int* result)
{
int sum = 0;
_asm
{
mov ebx, [arr] ///address
mov edx, 0; //sum
mov ecx, [arr_rows] // number of rows
row_loop:
push ecx // save number of rows
xor esi, esi // reset column offset for the current row
mov ecx, [arr_cols] // number of column
col_loop :
add ebx, esi;
cmp ebx, 0
jge bigger_case
jl less_case
jmp n_case
less_case :
add esi, 4
add edx, dword ptr[ebx esi];
loop col_loop
bigger_case:
add esi, 4
loop col_loop
n_case:
add esi, 4
add ebx, esi // move to the next row offset
pop ecx // restore row loop counter
loop row_loop;
ending:
mov sum, edx;
}
cout << sum<<" is answer"<<endl;
}
CodePudding user response:
Review
cmp ebx, 0
This compares an address to 0. You need to compare a value from the array.
add esi, 4
add edx, dword ptr [ebx + esi];
This adds the next element to the sum. You need the current element.
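loop col_loop      <=== This is a 'fall through'
bigger_case:
add esi, 4
loop col_loop
If the row's last element happens to be negative, this fall-through in the code starts a very long loop: ECX is already 0 when the first LOOP falls through, so the second LOOP decrements it to 0xFFFFFFFF and keeps jumping back.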
jge bigger_case
jl less_case
jmp n_case
less_case :
Once you know the value is neither greater than nor equal to zero, it has to be less. Here you can indeed rightfully fall through into less_case.
You can easily address the array elements without using the extra ESI offset. Just always `add ebx, 4`; that is much cleaner.
Solution
You don't need to solve this task with nested loops. Just calculate the total number of elements and use a single loop.
; Sum the negative elements of arr_rows * arr_cols contiguous dwords.
; Meant as the body of the question's inline-asm block; arr, arr_rows,
; arr_cols and sum are the C variables declared there.
;   ESI = address of the current element
;   ECX = elements still to visit
;   EDX = running sum of the negative elements
;   EAX = scratch copy of the current element
        xor     edx, edx
        mov     esi, [arr]
        mov     ecx, [arr_rows]
        imul    ecx, [arr_cols]
        test    ecx, ecx        ; ZF is undefined after IMUL, so test explicitly
        jz      done            ; zero elements: skip the loop so DEC cannot wrap ECX
more:
        mov     eax, [esi]
        test    eax, eax        ; TEST is efficient to inspect the sign
        jns     skip
        add     edx, eax        ; Only adding negative values
skip:
        add     esi, 4
        dec     ecx
        jnz     more
done:
        mov     sum, edx        ; store the total in the C local
It is better not to use the LOOP instruction. See "Why is the loop instruction slow? Couldn't Intel have implemented it efficiently?"