I am trying to do simple image processing using OpenGL. Since I could not find a good library that already does this, I am trying to write my own solution.
I just want to compose some images on the GPU and then read them back. However, the performance of my implementation seems to be almost the same as doing the work on the CPU... something is wrong...
I tried to follow the recommendations I found on the net, but it is still doing something wrong.
I have tried to remove all the irrelevant code.
Any ideas as to why this implementation has poor performance?
// Dimensions, in pixels, of every image handled by this code.
int image_width = 1280;
int image_height = 720;
// Size of one image in BYTES. It is passed to glBufferData and memcpy as a
// byte count, and every pixel transfer uses GL_BGRA / GL_UNSIGNED_BYTE,
// i.e. 4 bytes per pixel, so the pixel count has to be multiplied by 4.
int image_size = image_width * image_height * 4;
// Owns one GL_TEXTURE_2D object with GL_RGBA8 storage of
// image_width x image_height texels. The texture name is generated in the
// constructor and deleted in the destructor.
class texture
{
public:
    texture()
    {
        glGenTextures(1, &texture_);
        bind();
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
        // NULL data pointer: only allocates storage, provided no buffer is
        // bound to the pixel-unpack target at this point (otherwise GL treats
        // the pointer as an offset into that buffer).
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, image_width, image_height, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
    }

    ~texture() { glDeleteTextures(1, &texture_); }

    // The object owns the GL texture name; a copy would delete it twice.
    texture(const texture&) = delete;
    texture& operator=(const texture&) = delete;

    // Makes this texture the current GL_TEXTURE_2D binding.
    void bind() { glBindTexture(GL_TEXTURE_2D, texture_); }

    // Returns the raw GL texture name.
    GLuint handle() { return texture_; }

private:
    GLuint texture_;
};
typedef std::shared_ptr<texture> texture_ptr;
// Pixel buffer object (PBO) of image_size bytes together with the texture it
// uploads into. The same buffer is used for uploads (pixel-unpack target,
// begin_write/end_write) and for read-backs (pixel-pack target,
// begin_read/end_read). No method leaves the buffer bound on return, so the
// binding cannot leak into unrelated pixel transfers.
class pixel_buffer
{
public:
    pixel_buffer()
    {
        glGenBuffersARB(1, &pbo_);
        bind_pbo();
        glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
        // Do not leave the buffer bound: a texture constructed afterwards calls
        // glTexImage2D with a NULL pointer, which would then be read as an
        // offset into this buffer instead of meaning "no data".
        unbind_pbo();
    }

    ~pixel_buffer() { glDeleteBuffers(1, &pbo_); }

    // The object owns the GL buffer name; a copy would delete it twice.
    pixel_buffer(const pixel_buffer&) = delete;
    pixel_buffer& operator=(const pixel_buffer&) = delete;

    // Copies image_size bytes from src into the buffer's data store.
    // The transfer into the texture is issued separately by end_write().
    void begin_write(void* src)
    {
        texture_.bind();
        bind_pbo();
        // Re-specify the data store before mapping (presumably to orphan the
        // previous store so mapping need not wait on pending transfers).
        glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
        void* ptr = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
        assert(ptr);
        if(ptr) // still guard when assert is compiled out
        {
            memcpy(ptr, src, image_size);
            glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB);
        }
        unbind_pbo();
    }

    // Issues the buffer -> texture transfer for data written by begin_write().
    void end_write()
    {
        bind_texture();
        bind_pbo();
        // With a buffer bound to the unpack target the last argument is a byte
        // offset into that buffer, not a client-memory pointer.
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
        unbind_pbo();
    }

    // Issues a read of the currently bound framebuffer's colour buffer `buffer`
    // into this PBO. The data is fetched later with end_read().
    void begin_read(GLuint buffer)
    {
        glReadBuffer(buffer);
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo_);
        glBufferData(GL_PIXEL_PACK_BUFFER_ARB, image_size, NULL, GL_STREAM_READ);
        glReadPixels(0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
        // Release the pack binding so other read-backs issued before end_read()
        // are not redirected into this buffer.
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
    }

    // Copies image_size bytes of the result of the preceding begin_read() into
    // dest. dest is left untouched if the buffer cannot be mapped.
    void end_read(void* dest)
    {
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo_);
        void* ptr = glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);
        if(ptr)
        {
            memcpy(dest, ptr, image_size);
            glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
        }
        // The buffer was bound to the PACK target here, so that is the target
        // that has to be released.
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
    }

    // Binds this buffer to the pixel-unpack (upload) target.
    void bind_pbo() { glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_); }

    // Releases the pixel-unpack target (buffer name 0 = no buffer bound).
    void unbind_pbo() { glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); }

    // Binds the associated texture to GL_TEXTURE_2D.
    void bind_texture() { texture_.bind(); }

    // Returns the GL name of the associated texture.
    GLuint texture_handle() { return texture_.handle(); }

private:
    texture texture_; // declared first, so it is constructed before the buffer
    GLuint pbo_;
};
typedef std::shared_ptr<pixel_buffer> pixel_buffer_ptr;
// Framebuffer object whose colour attachment 0 is the texture of an owned
// pixel_buffer. The same pixel_buffer is used to read the rendered result
// back (begin_read / end_read).
class frame_buffer
{
public:
    frame_buffer()
    {
        glGenFramebuffersEXT(1, &fbo_);
        bind();
        pbo_.bind_texture();
        glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_2D, pbo_.texture_handle(), 0);
        // Constructing the object must not silently redirect later rendering
        // into it; callers bind() explicitly when they want to draw here.
        unbind();
    }

    ~frame_buffer() { glDeleteFramebuffersEXT(1, &fbo_); }

    // The object owns the GL framebuffer name; a copy would delete it twice.
    frame_buffer(const frame_buffer&) = delete;
    frame_buffer& operator=(const frame_buffer&) = delete;

    // Makes this framebuffer the current render/read target.
    void bind() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_); }

    // Restores the default framebuffer (name 0).
    void unbind() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); }

    // Issues the read-back of colour attachment 0 into the pixel buffer.
    // The framebuffer only needs to be bound while the read is issued, so it
    // is released again before returning.
    void begin_read()
    {
        bind();
        pbo_.begin_read(GL_COLOR_ATTACHMENT0_EXT);
        unbind();
    }

    // Copies the pixels requested by begin_read() into dest and leaves the
    // default framebuffer bound.
    void end_read(void* dest)
    {
        pbo_.end_read(dest);
        unbind();
    }

private:
    pixel_buffer pbo_;
    GLuint fbo_;
};
typedef std::shared_ptr<frame_buffer> frame_buffer_ptr;
// Pipelined compositor. Each compose() call advances three stages, one step
// per call:
//   1. finish the read-back started by the previous call and publish it,
//   2. draw the images uploaded by the previous call into a framebuffer,
//   3. start uploading the images passed to this call,
// and finally starts the read-back of the framebuffer drawn in step 2.
// NOTE(review): output_ and quad_ are used here but declared outside this
// excerpt.
struct image_processor::implementation
{
    void compose(const std::vector<image_ptr>& images)
    {
        // Stage 1: collect the result whose read-back was started last call.
        if(reading_fbo_)
        {
            image_ptr result_image = std::make_shared<image>(image_size);
            reading_fbo_->end_read(result_image->data());
            // Publish the image that was just filled in.
            output_.push(result_image);
            reading_fbo_ = nullptr;
        }

        // Stage 2: composite the buffers whose upload was started last call.
        frame_buffer_ptr written_fbo;
        if(!writing_pbo_group_.empty())
        {
            written_fbo = get_fbo();
            written_fbo->bind();
            glClear(GL_COLOR_BUFFER_BIT);
            for(const auto& pbo : writing_pbo_group_)
            {
                pbo->end_write();
                pbo->bind_texture();
                quad_->draw();
            }
            written_fbo->unbind();
            // Dropping the handles runs their deleters, which put the buffers
            // back into pbo_pool_.
            writing_pbo_group_.clear();
        }

        // Stage 3: start the upload of this call's images.
        for(const auto& img : images)
        {
            auto pbo = get_pbo();
            pbo->begin_write(img->data());
            writing_pbo_group_.push_back(pbo);
        }

        // Start reading back what stage 2 drew; it is collected next call.
        if(written_fbo)
        {
            written_fbo->begin_read();
            reading_fbo_ = written_fbo;
        }
    }

    // Takes a pixel buffer from the pool, creating one if the pool is empty.
    // The returned handle does not delete the buffer; its deleter pushes the
    // pooled pointer back into pbo_pool_ instead.
    pixel_buffer_ptr get_pbo()
    {
        if(pbo_pool_.empty())
            pbo_pool_.push_back(std::make_shared<pixel_buffer>());
        auto pbo = pbo_pool_.front();
        pbo_pool_.pop_front();
        return pixel_buffer_ptr(pbo.get(), [this, pbo](pixel_buffer*){ pbo_pool_.push_back(pbo); });
    }

    // Takes a framebuffer from the pool, creating one if the pool is empty.
    // The returned handle's deleter pushes the pooled pointer back into
    // fbo_pool_.
    frame_buffer_ptr get_fbo()
    {
        if(fbo_pool_.empty())
            fbo_pool_.push_back(std::make_shared<frame_buffer>());
        auto fbo = fbo_pool_.front();
        fbo_pool_.pop_front();
        return frame_buffer_ptr(fbo.get(), [this, fbo](frame_buffer*){ fbo_pool_.push_back(fbo); });
    }

    std::vector<pixel_buffer_ptr> writing_pbo_group_; // uploads started, not yet drawn
    frame_buffer_ptr reading_fbo_;                    // read-back started, not yet collected
    std::deque<pixel_buffer_ptr> pbo_pool_;           // idle pixel buffers
    std::deque<frame_buffer_ptr> fbo_pool_;           // idle framebuffers
};
EDIT:
After profiling: most of the CPU time is spent in begin_write().
I can't see anything wrong with it, though...
// Copies image_size bytes from src into the pixel buffer's data store.
// The buffer is released from the pixel-unpack target before returning so the
// binding cannot affect unrelated texture uploads.
void begin_write(void* src)
{
    texture_.bind();
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_);
    // Re-specify the data store before mapping (presumably to orphan the
    // previous store so mapping need not wait on pending transfers).
    glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
    void* ptr = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
    assert(ptr);
    if(ptr) // still guard when assert is compiled out
    {
        memcpy(ptr, src, image_size);
        glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB);
    }
    // Buffer name 0 = no pixel-unpack buffer bound.
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
}