I am working on an OCR text detection and recognition problem. After testing scene text detection and recognition with OpenCV 3 and Tesseract on some images, I made a few observations (manually, by checking the detection output, segmentation output and recognition output images), and I'm looking for ways to improve the results by changing the code.
First, the images are black and white (mostly gray, with black or white text), and the text can be a combination of numeric and alphabetic characters.
1 - When the color intensity of the text is too close to that of the background, the text is not detected (see the channel sketch after this list).
2 - When the image contains, for example, “63H”, it is detected correctly, but if there is more space between the characters (for example, “63 H”), nothing is detected at all.
3 - When different text areas are not aligned with each other (the same kind of text sits in a different corner of the image than another, correctly detected, text), one part of the text is detected and the other is not (for example, with text = 12548 58A: 12548 is part one and 58A is part two).
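For observation 1 in particular, I suspect the channels I feed to the ER filters matter. Here is a minimal sketch of what I could try instead of plain gray / inverted gray, using computeNMChannels from the opencv_text module (the gradient-magnitude channel it adds tends to be less sensitive to weak intensity contrast; buildChannels is just a helper name for this sketch, and it assumes the image is loaded in color):

#include "opencv2/text.hpp"
#include <vector>
using namespace cv;
using namespace cv::text;

static void buildChannels(const Mat& image, std::vector<Mat>& channels)
{
    // R, G, B, lightness and gradient-magnitude channels
    computeNMChannels(image, channels, ERFILTER_NM_RGBLGrad);
    // Append inverted copies of all but the gradient channel so that both
    // dark-on-light and light-on-dark text are covered
    int cn = (int)channels.size();
    for (int c = 0; c < cn - 1; c++)
        channels.push_back(255 - channels[c]);
}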
I tried to analyze the results, and I think the cause of these observations is probably the erGrouping phase (I'm using the exhaustive-search grouping algorithm; grouping for arbitrarily oriented text gives me worse results). If so, how can I improve it?
- Or should I change the parameters of the ER filters? (The OpenCV defaults are below.)
createERFilterNM1(const Ptr<ERFilter::Callback>& cb, int thresholdDelta=1, float minArea=0.00025, float maxArea=0.13, float minProbability=0.4, bool nonMaxSuppression=true, float minProbabilityDiff=0.1 )
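For comparison with the values I actually pass in the code below, constructing the first stage with the library defaults would look roughly like this (only a sketch; the classifier file name is the one used in the OpenCV samples):

// First-stage ER filter with the default thresholds, for comparison with
// the tuned values used in the code further down
Ptr<ERFilter> er_filter1_default = createERFilterNM1(
    loadClassifierNM1("trained_classifierNM1.xml"),
    1,         // thresholdDelta (I currently use 8)
    0.00025f,  // minArea
    0.13f,     // maxArea
    0.4f,      // minProbability (I currently use 0.2)
    true,      // nonMaxSuppression
    0.1f);     // minProbabilityDiff (I currently use 0.2)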
- Or is the problem elsewhere? (My code is below, Python/C++.)

// TextRecognition.cpp : defines the entry point for the console application.
/*
 * textdetection.cpp
 */
#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include <fstream>
#include <iostream>
using namespace std;
using namespace cv;
using namespace cv::text;
size_t edit_distance(const string& A, const string& B);
size_t min(size_t x, size_t y, size_t z);
bool isRepetitive(const string& s);
bool sort_by_lenght(const string &a, const string &b);
void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation);
int main(int argc, char* argv[])
{
    cout << endl << argv[0] << endl << endl;
    cout << "Text Detection and Recognition: " << endl;
    cout << "algorithm described in:" << endl;
    cout << "Real-Time Scene Text Localization and Recognition" << endl << endl;

    Mat image;
    if (argc > 1)
        image = imread(argv[1]);
    else
    {
        cout << " Usage: " << argv[0] << " <input_image> [<gt_word1> ... <gt_wordN>]" << endl;
        return 0;
    }

    cout << "IMG_W=" << image.cols << endl;
    cout << "IMG_H=" << image.rows << endl;
    // Channels to be processed individually: gray and inverted gray, so that
    // both dark-on-light and light-on-dark text can be detected
    vector<Mat> channels;
    Mat grey;
    cvtColor(image, grey, COLOR_BGR2GRAY);   // imread loads the image as BGR
    channels.push_back(grey);
    channels.push_back(255 - grey);
    double t_d = (double)getTickCount();
    // Two-stage Extremal Region filter (Neumann & Matas) with pre-trained classifiers
    Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"), 8, 0.00015f, 0.13f, 0.2f, true, 0.2f);
    Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"), 0.5);
    // Run the ER filter cascade on every channel
    vector<vector<ERStat> > regions(channels.size());
    for (int c = 0; c < (int)channels.size(); c++)
    {
        er_filter1->run(channels[c], regions[c]);
        er_filter2->run(channels[c], regions[c]);
    }
    cout << "TIME_REGION_DETECTION = " << ((double)getTickCount() - t_d) * 1000 / getTickFrequency() << endl;
    // Build a visualization of the detected regions per channel (decomposition output)
    Mat out_img_decomposition = Mat::zeros(image.rows + 2, image.cols + 2, CV_8UC1);
    vector<Vec2i> tmp_group;
    for (int i = 0; i < (int)regions.size(); i++)
    {
        for (int j = 0; j < (int)regions[i].size(); j++)
        {
            tmp_group.push_back(Vec2i(i, j));
        }
        Mat tmp = Mat::zeros(image.rows + 2, image.cols + 2, CV_8UC1);
        er_draw(channels, regions, tmp_group, tmp);
        if (i > 0)
            tmp = tmp / 2;
        out_img_decomposition = out_img_decomposition | tmp;
        tmp_group.clear();
    }
    double t_g = (double)getTickCount();
    // Group the character candidates into (horizontal) words/lines
    vector< vector<Vec2i> > nm_region_groups;
    vector<Rect> nm_boxes;
    erGrouping(image, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_HORIZ);
    cout << "TIME_GROUPING = " << ((double)getTickCount() - t_g) * 1000 / getTickFrequency() << endl;