Improving OCR Text Detection / Segmentation in Natural Images Using OpenCV

I am working on the problem of OCR text detection and recognition. After testing the "Detecting and recognizing text scenes using OpenCV 3" example with OpenCV and Tesseract on some images, I made a few observations (manually, by checking the detection, segmentation, and recognition output images), and I am looking for ways to improve the results by changing the code.

First, the images are black and white (the background is mostly grey, and the text is black or white), and the text can be a combination of numeric and alphabetic characters.

1 - When the intensity of the text is too close to that of the background (low contrast), the text is not detected (a contrast-enhancement idea is sketched after this list).

2 - When the image contains, for example, "63H", it is detected correctly, but if there is a wider space between the characters (for example, "63 H"), it is not detected at all.

3 - When the parts of a text line are not perfectly aligned (even though text of the same shape in another corner of the image is detected correctly), one part of the text is detected and the other is not (for example, for the text "12548 58A": "12548" is part one, "58A" is part two).
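For issue 1, one idea I am considering is to add contrast-enhanced variants of the grey channel before running the ER filters. A minimal sketch, assuming CLAHE-style local contrast enhancement helps here (the clip limit and tile size are guesses to tune):

#include "opencv2/core/core.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include <vector>

using namespace cv;

// Append CLAHE-enhanced versions of the grey channel so that low-contrast
// text stands out more from the background before ER filtering.
void addContrastEnhancedChannels(const Mat& grey, std::vector<Mat>& channels)
{
    Ptr<CLAHE> clahe = createCLAHE(3.0, Size(8, 8)); // both parameters are tunable
    Mat enhanced;
    clahe->apply(grey, enhanced);
    channels.push_back(enhanced);       // dark text on bright background
    channels.push_back(255 - enhanced); // bright text on dark background
}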

I tried to analyze the results, and I suspect the cause of these problems lies in the erGrouping phase (I am using the exhaustive-search grouping algorithm; the grouping for arbitrarily oriented text gives me worse results). If so, how can it be improved? One post-processing idea I am considering is sketched just below.
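For issues 2 and 3, the workaround would be to merge the boxes produced by erGrouping when they sit on roughly the same line and the gap between them is small relative to their height. This is a hypothetical post-processing sketch (mergeNearbyBoxes and maxGapRatio are my own names, and the thresholds need tuning):

#include "opencv2/core/core.hpp"
#include <algorithm>
#include <cstdlib>
#include <vector>

using namespace cv;

// Merge boxes that lie on roughly the same line and are separated by a
// horizontal gap smaller than maxGapRatio times the box height.
std::vector<Rect> mergeNearbyBoxes(std::vector<Rect> boxes, double maxGapRatio = 1.0)
{
    std::sort(boxes.begin(), boxes.end(),
              [](const Rect& a, const Rect& b) { return a.x < b.x; });
    std::vector<Rect> merged;
    for (const Rect& box : boxes)
    {
        if (!merged.empty())
        {
            Rect& last = merged.back();
            int gap = box.x - (last.x + last.width);
            bool sameLine = std::abs(box.y - last.y) < last.height / 2;
            if (sameLine && gap >= 0 && gap < maxGapRatio * last.height)
            {
                last |= box; // cv::Rect union
                continue;
            }
        }
        merged.push_back(box);
    }
    return merged;
}

This would run on nm_boxes after erGrouping and before recognition, so that "63" and "H", or "12548" and "58A", end up in a single region.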

Or could the problem be in the parameters of the ER filters (thresholdDelta, minArea, minProbability)?

How should these be tuned (in OpenCV)? For reference, the signature of createERFilterNM1 is:

createERFilterNM1(const Ptr<ERFilter::Callback>& cb, int thresholdDelta=1, float minArea=0.00025, float maxArea=0.13, float minProbability=0.4, bool nonMaxSuppression=true, float minProbabilityDiff=0.1 )
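If the filter parameters are the culprit, a more permissive configuration might look like this (illustrative values only, not tested; a small thresholdDelta and a low minProbability let weaker, lower-contrast regions through, at the cost of more false detections):

Ptr<ERFilter> er_filter1 = createERFilterNM1(
    loadClassifierNM1("trained_classifierNM1.xml"),
    1,        // thresholdDelta: finer threshold steps catch low-contrast regions
    0.00010f, // minArea: allow smaller components
    0.13f,    // maxArea
    0.15f,    // minProbability: accept lower-confidence regions
    true,     // nonMaxSuppression
    0.1f);    // minProbabilityDiff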

How can I improve this in code (an example in Python/C++ is welcome)? Here is my code:

// TextRecognition.cpp : defines the entry point for the console application.
/*
 * textdetection.cpp
 */

#include "opencv2/text.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include <fstream>

#include <iostream>

using namespace std;
using namespace cv;
using namespace cv::text;

//Calculate edit distance between two words
size_t edit_distance(const string& A, const string& B);
size_t min(size_t x, size_t y, size_t z);
bool   isRepetitive(const string& s);
bool   sort_by_lenght(const string &a, const string &b);
//Draw ER in an image via floodFill
void   er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation);

//Perform text detection and recognition and evaluate results using edit distance
int main(int argc, char* argv[])
{
    cout << endl << argv[0] << endl << endl;
    cout << "Text Detection and Recognition: " << endl;
    cout << "algorithm described in:" << endl;
    cout << "Real-Time Scene Text Localization and Recognition" << endl << endl;

    Mat image;
    // image  = imread("C:\\scene-text-recognition\\scenetext01.jpg");

    if (argc > 1)
        image = imread(argv[1]);
    else
    {
        cout << "    Usage: " << argv[0] << " <input_image> [<gt_word1> ... <gt_wordN>]" << endl;
        return(0);
    }

    // Guard against a failed imread (wrong path / unsupported format)
    if (image.empty())
    {
        cout << "Could not load image: " << argv[1] << endl;
        return(-1);
    }


    cout << "IMG_W=" << image.cols << endl;
    cout << "IMG_H=" << image.rows << endl;


    /*Text Detection*/

    // Extract channels to be processed individually
    vector<Mat> channels;

    Mat grey;
    cvtColor(image, grey, COLOR_BGR2GRAY); // imread loads images as BGR, so BGR2GRAY is the correct conversion

    // Notice here we are only using grey channel, see textdetection.cpp for example with more channels
    channels.push_back(grey);
    channels.push_back(255-grey);
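    // Note (my assumption, not from the sample): adding more channels here, e.g.
    // a gradient-magnitude channel or the full set from cv::text::computeNMChannels(),
    // may recover low-contrast text that the plain grey channel misses,
    // at the cost of extra processing time.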

    double t_d = (double)getTickCount();
    // Create ERFilter objects with the 1st and 2nd stage default classifiers
    //createERFilterNM1(const Ptr<ERFilter::Callback>& cb, int thresholdDelta=1, float minArea=0.00025, float maxArea=0.13, float minProbability=0.4, bool nonMaxSuppression=true, float minProbabilityDiff=0.1 )
    Ptr<ERFilter> er_filter1 = createERFilterNM1(loadClassifierNM1("trained_classifierNM1.xml"),8,0.00015f,0.13f,0.2f,true,0.2f);
    Ptr<ERFilter> er_filter2 = createERFilterNM2(loadClassifierNM2("trained_classifierNM2.xml"),0.5);

    vector<vector<ERStat> > regions(channels.size());
    // Apply the default cascade classifier to each independent channel (could be done in parallel)
    for (int c=0; c<(int)channels.size(); c++)
    {
        er_filter1->run(channels[c], regions[c]);
        er_filter2->run(channels[c], regions[c]);
    }
    cout << "TIME_REGION_DETECTION = " << ((double)getTickCount() - t_d)*1000/getTickFrequency() << endl;

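    // Visualize all detected extremal regions in a single image
    // (the second channel is drawn at half intensity to tell the two apart)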
    Mat out_img_decomposition= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
    vector<Vec2i> tmp_group;
    for (int i=0; i<(int)regions.size(); i++)
    {
        for (int j=0; j<(int)regions[i].size();j++)
        {
            tmp_group.push_back(Vec2i(i,j));
        }
        Mat tmp= Mat::zeros(image.rows+2, image.cols+2, CV_8UC1);
        er_draw(channels, regions, tmp_group, tmp);
        if (i > 0)
            tmp = tmp / 2;
        out_img_decomposition = out_img_decomposition | tmp;
        tmp_group.clear();
    }

    double t_g = (double)getTickCount();
    // Detect character groups
    vector< vector<Vec2i> > nm_region_groups;
    vector<Rect> nm_boxes;
    erGrouping(image, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_HORIZ);
    //erGrouping(image, channels, regions, nm_region_groups, nm_boxes, ERGROUPING_ORIENTATION_ANY, "trained_classifier_erGrouping.xml", 0.5);
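    // If the ANY-orientation variant above is used, its last parameter
    // (minProbability, default 0.5) could be lowered, e.g. to 0.3f, to make the
    // grouping classifier less strict about splitting lines such as "12548 58A"
    // into separate groups (my assumption, untested).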
    cout << "TIME_GROUPING = " << ((double)getTickCount() - t_g)*1000/getTickFrequency() << endl;
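The rest of my code (the helper definitions and the recognition stage) follows the OpenCV end_to_end_recognition.cpp sample. For reference, the er_draw helper in that sample looks like this:

//Draw ER in an image via floodFill
void er_draw(vector<Mat> &channels, vector<vector<ERStat> > &regions, vector<Vec2i> group, Mat& segmentation)
{
    for (int r = 0; r < (int)group.size(); r++)
    {
        ERStat er = regions[group[r][0]][group[r][1]];
        if (er.parent != NULL) // skip the root region
        {
            int newMaskVal = 255;
            int flags = 4 + (newMaskVal << 8) + FLOODFILL_FIXED_RANGE + FLOODFILL_MASK_ONLY;
            floodFill(channels[group[r][0]], segmentation,
                      Point(er.pixel % channels[group[r][0]].cols, er.pixel / channels[group[r][0]].cols),
                      Scalar(255), 0, Scalar(er.level), Scalar(0), flags);
        }
    }
}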

Source: https://habr.com/ru/post/1684768/

