I am having a strange memory access performance problem, any ideas?
int* pixel_ptr = somewhereFromHeap;
int local_ptr[307200];
for(int i=0;i<307200;i++){
pixel_ptr[i] = someCalculatedVal ;
}
for(int i=0;i<307200;i++){
pixel_ptr[i] = 1 ;
}
for(int i=0;i<307200;i++){
int val = pixel_ptr[i];
local_ptr[i] = val;
}
for(int i=0;i<307200;i++){
local_ptr[i] = someCalculatedVal ;
}
I tried to combine the values into a local scan
int scanline[640];
for(int i=xMin;i<xMax;i++){
int screen_pos = sy*screen_width+i;
int val = scanline[i];
pixel_ptr[screen_pos] = val ;
}
for(int i=xMin;i<xMax;i++){
int screen_pos = sy*screen_width+i;
int val = scanline[i];
pixel_ptr[screen_pos] = 1 ;
}
for(int i=xMin;i<xMax;i++){
int screen_pos = sy*screen_width+i;
int val = i;
pixel_ptr[screen_pos] = val ;
}
for(int i=xMin;i<xMax;i++){
int screen_pos = sy*screen_width+i;
int val = scanline[0];
pixel_ptr[screen_pos] = val ;
}
Any ideas? I use mingw with cflags -01 -std = C ++ 11 -fpermissive.
update4: I have to say that these are fragments from my program, and there are heavy codes / functions launched before and after. The scanline block was executed at the end of the function before exiting.
Now with the proper testing program. thks for @Iwillnotexist.
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
#define SIZE 307200
#define SAMPLES 1000
double local_test(){
int local_array[SIZE];
timeval start, end;
long cpu_time_used_sec,cpu_time_used_usec;
double cpu_time_used;
gettimeofday(&start, NULL);
for(int i=0;i<SIZE;i++){
local_array[i] = i;
}
gettimeofday(&end, NULL);
cpu_time_used_sec = end.tv_sec- start.tv_sec;
cpu_time_used_usec = end.tv_usec- start.tv_usec;
cpu_time_used = cpu_time_used_sec*1000 + cpu_time_used_usec/1000.0;
return cpu_time_used;
}
double heap_test(){
int* heap_array=new int[SIZE];
timeval start, end;
long cpu_time_used_sec,cpu_time_used_usec;
double cpu_time_used;
gettimeofday(&start, NULL);
for(int i=0;i<SIZE;i++){
heap_array[i] = i;
}
gettimeofday(&end, NULL);
cpu_time_used_sec = end.tv_sec- start.tv_sec;
cpu_time_used_usec = end.tv_usec- start.tv_usec;
cpu_time_used = cpu_time_used_sec*1000 + cpu_time_used_usec/1000.0;
delete[] heap_array;
return cpu_time_used;
}
double heap_test2(){
static int* heap_array = NULL;
if(heap_array==NULL){
heap_array = new int[SIZE];
}
timeval start, end;
long cpu_time_used_sec,cpu_time_used_usec;
double cpu_time_used;
gettimeofday(&start, NULL);
for(int i=0;i<SIZE;i++){
heap_array[i] = i;
}
gettimeofday(&end, NULL);
cpu_time_used_sec = end.tv_sec- start.tv_sec;
cpu_time_used_usec = end.tv_usec- start.tv_usec;
cpu_time_used = cpu_time_used_sec*1000 + cpu_time_used_usec/1000.0;
return cpu_time_used;
}
int main (int argc, char** argv){
double cpu_time_used = 0;
for(int i=0;i<SAMPLES;i++)
cpu_time_used+=local_test();
printf("local: %f ms\n",cpu_time_used);
cpu_time_used = 0;
for(int i=0;i<SAMPLES;i++)
cpu_time_used+=heap_test();
printf("heap_: %f ms\n",cpu_time_used);
cpu_time_used = 0;
for(int i=0;i<SAMPLES;i++)
cpu_time_used+=heap_test2();
printf("heap2: %f ms\n",cpu_time_used);
}
Optimization not performed.
local: 577.201000 ms
heap_: 826.802000 ms
heap2: 686.401000 ms
The first cumulative test with the new and remote 2x slower. (paging, as suggested?)
1,2 .
, , , , , . , pixel_ptr, ,
prograim.
- / , !
, , .
, , /.
?:
, , ,
. , , complier
, , , .