
I have been trying to detect text from screenshots. The screenshots can contain arbitrary content. I just want to locate the text content.

It's OK if some non-text content is detected as text; my bottom line is that no text content is missed.

I found the following article:

But I haven't found a working implementation on Windows, and so far I have only seen it used with natural scenes, not screenshots. If anyone has implemented it on another platform, could you try it with the following image so I can get a quick evaluation before I make up my mind to implement it on Windows? Thanks.

[input screenshot]

  • I guess it would work for screenshots even better than for "natural images", because there are no such things as perspective projection effects, etc. – Commented Aug 5, 2015 at 8:22

2 Answers


UPDATE

The code from this answer does not seem to work as expected (at least I didn't manage to make it work satisfactorily).

So I ran the code on which that implementation was based, which you can find here

The (more reasonable) result is:

[result of running the original code]

I'll leave the code below for future reference.


I adapted the mex implementation from here. The result on your image with the code below is:

[result of the adapted code below]

I'll let you evaluate whether this is helpful to you. The code is below.

swt.h

#pragma once

#include <opencv2/opencv.hpp>
#include <vector>
#include <map>
#include <set>
#include <algorithm>

using namespace std;

namespace sw
{
#define PI 3.14159265

    struct Point2d {
        int x;
        int y;
        float SWT;
    };

    struct Point2dFloat {
        float x;
        float y;
    };

    struct Ray {
        Point2d p;
        Point2d q;
        std::vector<Point2d> points;
    };

    void strokeWidthTransform(const float* edgeImage, const float* gradientX, const float* gradientY,
                              bool dark_on_light, float* SWTImage, int h, int w, std::vector<Ray>& rays)
    {
        // First pass
        float prec = .05f;
        for (int row = 0; row < h; row++) {
            const float* ptr = edgeImage + row * w;
            for (int col = 0; col < w; col++) {
                if (*ptr > 0) {
                    Ray r;

                    Point2d p;
                    p.x = col;
                    p.y = row;
                    r.p = p;
                    std::vector<Point2d> points;
                    points.push_back(p);

                    float curX = (float)col + 0.5f;
                    float curY = (float)row + 0.5f;
                    int curPixX = col;
                    int curPixY = row;
                    float G_x = gradientX[col + row * w];
                    float G_y = gradientY[col + row * w];
                    // normalize gradient
                    float mag = sqrt((G_x * G_x) + (G_y * G_y));
                    if (dark_on_light) {
                        G_x = -G_x / mag;
                        G_y = -G_y / mag;
                    }
                    else {
                        G_x = G_x / mag;
                        G_y = G_y / mag;
                    }
                    while (true) {
                        curX += G_x * prec;
                        curY += G_y * prec;
                        if ((int)(floor(curX)) != curPixX || (int)(floor(curY)) != curPixY) {
                            curPixX = (int)(floor(curX));
                            curPixY = (int)(floor(curY));
                            // check if pixel is outside boundary of image
                            if (curPixX < 0 || (curPixX >= w) || curPixY < 0 || (curPixY >= h)) {
                                break;
                            }
                            Point2d pnew;
                            pnew.x = curPixX;
                            pnew.y = curPixY;
                            points.push_back(pnew);

                            if (edgeImage[curPixY * w + curPixX] > 0) {
                                r.q = pnew;
                                // dot product
                                float G_xt = gradientX[curPixY * w + curPixX];
                                float G_yt = gradientY[curPixY * w + curPixX];
                                mag = sqrt((G_xt * G_xt) + (G_yt * G_yt));
                                if (dark_on_light) {
                                    G_xt = -G_xt / mag;
                                    G_yt = -G_yt / mag;
                                }
                                else {
                                    G_xt = G_xt / mag;
                                    G_yt = G_yt / mag;
                                }

                                if (acos(G_x * -G_xt + G_y * -G_yt) < PI / 2.0) {
                                    float length = sqrt(((float)r.q.x - (float)r.p.x) * ((float)r.q.x - (float)r.p.x) +
                                                        ((float)r.q.y - (float)r.p.y) * ((float)r.q.y - (float)r.p.y));
                                    for (std::vector<Point2d>::iterator pit = points.begin(); pit != points.end(); pit++) {
                                        float* pSWT = SWTImage + w * pit->y + pit->x;
                                        if (*pSWT < 0) {
                                            *pSWT = length;
                                        }
                                        else {
                                            *pSWT = std::min(length, *pSWT);
                                        }
                                    }
                                    r.points = points;
                                    rays.push_back(r);
                                }
                                break;
                            }
                        }
                    }
                }
                ptr++;
            }
        }
    }

    bool Point2dSort(const Point2d& lhs, const Point2d& rhs)
    {
        return lhs.SWT < rhs.SWT;
    }

    void SWTMedianFilter(float* SWTImage, int h, int w, std::vector<Ray>& rays, float maxWidth = -1)
    {
        for (std::vector<Ray>::iterator rit = rays.begin(); rit != rays.end(); rit++) {
            for (std::vector<Point2d>::iterator pit = rit->points.begin(); pit != rit->points.end(); pit++) {
                pit->SWT = SWTImage[w * pit->y + pit->x];
            }
            std::sort(rit->points.begin(), rit->points.end(), &Point2dSort);
            //std::nth_element( rit->points.begin(), rit->points.end(), rit->points.size()/2, &Point2dSort );
            float median = (rit->points[rit->points.size() / 2]).SWT;
            if (maxWidth > 0 && median >= maxWidth) {
                median = -1;
            }
            for (std::vector<Point2d>::iterator pit = rit->points.begin(); pit != rit->points.end(); pit++) {
                SWTImage[w * pit->y + pit->x] = std::min(pit->SWT, median);
            }
        }
    }

    typedef std::vector< std::set<int> > graph_t; // graph as a list of neighbors per node

    void connComp(const graph_t& g, std::vector<int>& c, int i, int l)
    {
        // starting from node i, label this conn-comp with label l
        if (i < 0 || i >= (int)g.size()) {
            return;
        }
        std::vector<int> stack;
        // push i
        stack.push_back(i);
        c[i] = l;
        while (!stack.empty()) {
            // pop
            i = stack.back();
            stack.pop_back();
            // go over all neighbors
            for (std::set<int>::const_iterator it = g[i].begin(); it != g[i].end(); it++) {
                if (c[*it] < 0) {
                    stack.push_back(*it);
                    c[*it] = l;
                }
            }
        }
    }

    int findNextToLabel(const graph_t& g, const vector<int>& c)
    {
        for (int i = 0; i < (int)c.size(); i++) {
            if (c[i] < 0) {
                return i;
            }
        }
        return (int)c.size();
    }

    int connected_components(const graph_t& g, vector<int>& c)
    {
        // check for empty graph!
        if (g.empty()) {
            return 0;
        }
        int i = 0;
        int num_conn = 0;
        do {
            connComp(g, c, i, num_conn);
            num_conn++;
            i = findNextToLabel(g, c);
        } while (i < (int)g.size());
        return num_conn;
    }

    std::vector< std::vector<Point2d> > findLegallyConnectedComponents(const float* SWTImage, int h, int w, std::vector<Ray>& rays)
    {
        std::map<int, int> Map;
        std::map<int, Point2d> revmap;
        std::vector<std::vector<Point2d> > components; // empty
        int num_vertices = 0, idx = 0;
        graph_t g;
        // Number vertices for graph. Associate each point with number
        for (int row = 0; row < h; row++) {
            for (int col = 0; col < w; col++) {
                idx = col + w * row;
                if (SWTImage[idx] > 0) {
                    Map[idx] = num_vertices;
                    Point2d p;
                    p.x = col;
                    p.y = row;
                    revmap[num_vertices] = p;
                    num_vertices++;
                    std::set<int> empty;
                    g.push_back(empty);
                }
            }
        }
        if (g.empty()) {
            return components; // nothing to do with an empty graph...
        }
        for (int row = 0; row < h; row++) {
            for (int col = 0; col < w; col++) {
                idx = col + w * row;
                if (SWTImage[idx] > 0) {
                    // check pixel to the right, right-down, down, left-down
                    int this_pixel = Map[idx];
                    float thisVal = SWTImage[idx];
                    if (col + 1 < w) {
                        float right = SWTImage[w * row + col + 1];
                        if (right > 0 && (thisVal / right <= 3.0 || right / thisVal <= 3.0)) {
                            g[this_pixel].insert(Map[w * row + col + 1]);
                            g[Map[w * row + col + 1]].insert(this_pixel);
                            //boost::add_edge(this_pixel, map.at(row * SWTImage->width + col + 1), g);
                        }
                    }
                    if (row + 1 < h) {
                        if (col + 1 < w) {
                            float right_down = SWTImage[w * (row + 1) + col + 1];
                            if (right_down > 0 && (thisVal / right_down <= 3.0 || right_down / thisVal <= 3.0)) {
                                g[this_pixel].insert(Map[w * (row + 1) + col + 1]);
                                g[Map[w * (row + 1) + col + 1]].insert(this_pixel);
                                //boost::add_edge(this_pixel, map.at((row+1) * SWTImage->width + col + 1), g);
                            }
                        }
                        float down = SWTImage[w * (row + 1) + col];
                        if (down > 0 && (thisVal / down <= 3.0 || down / thisVal <= 3.0)) {
                            g[this_pixel].insert(Map[w * (row + 1) + col]);
                            g[Map[w * (row + 1) + col]].insert(this_pixel);
                            //boost::add_edge(this_pixel, map.at((row+1) * SWTImage->width + col), g);
                        }
                        if (col - 1 >= 0) {
                            float left_down = SWTImage[w * (row + 1) + col - 1];
                            if (left_down > 0 && (thisVal / left_down <= 3.0 || left_down / thisVal <= 3.0)) {
                                g[this_pixel].insert(Map[w * (row + 1) + col - 1]);
                                g[Map[w * (row + 1) + col - 1]].insert(this_pixel);
                                //boost::add_edge(this_pixel, map.at((row+1) * SWTImage->width + col - 1), g);
                            }
                        }
                    }
                }
            }
        }

        std::vector<int> c(num_vertices, -1);
        int num_comp = connected_components(g, c);
        components.reserve(num_comp);
        //std::cout << "Before filtering, " << num_comp << " components and " << num_vertices << " vertices" << std::endl;
        for (int j = 0; j < num_comp; j++) {
            std::vector<Point2d> tmp;
            components.push_back(tmp);
        }
        for (int j = 0; j < num_vertices; j++) {
            Point2d p = revmap[j];
            (components[c[j]]).push_back(p);
        }
        return components;
    }

    enum { EIN = 0, GXIN, GYIN, DOLFIN, MAXWIN, NIN }; // leftover from the mex interface

    void swt_mex(const float* edgeImage, const float* gradientX, const float* gradientY,
                 float* SWTImage, float* pComp, int* nstrokes, int w, int h, bool dark_on_light)
    {
        float maxWidth = w;
        std::vector<Ray> rays;
        strokeWidthTransform(edgeImage, gradientX, gradientY, dark_on_light, SWTImage, h, w, rays);
        SWTMedianFilter(SWTImage, h, w, rays, maxWidth);

        std::vector<std::vector<Point2d> > components = findLegallyConnectedComponents(SWTImage, h, w, rays);

        *nstrokes = (int)components.size();
        for (int ci = 0; ci < (int)components.size(); ci++) {
            for (std::vector<Point2d>::iterator it = components[ci].begin(); it != components[ci].end(); it++) {
                pComp[w * it->y + it->x] = (float)(ci + 1);
            }
        }
    }

    void swt(const cv::Mat1b& img, cv::Mat1f& strokes, int* nstrokes, bool dark_on_light = true)
    {
        cv::Mat1b edgeMap;
        cv::Canny(img, edgeMap, 400, 200);
        cv::Mat1f floatEdgeMap;
        edgeMap.convertTo(floatEdgeMap, CV_32F);

        cv::Mat1b blurred;
        cv::GaussianBlur(img, blurred, cv::Size(5, 5), 0.3 * (2.5 - 1) + .8);

        cv::Mat1f gx, gy;
        cv::Sobel(blurred, gx, CV_32F, 1, 0);
        cv::Sobel(blurred, gy, CV_32F, 0, 1);

        cv::medianBlur(gx, gx, 3);
        cv::medianBlur(gy, gy, 3);

        cv::Mat1f swtimg(img.rows, img.cols, -1.f); // -1 marks "no stroke width yet"
        strokes = cv::Mat1f(img.rows, img.cols, 0.f);

        swt_mex((float*)floatEdgeMap.data, (float*)gx.data, (float*)gy.data,
                (float*)swtimg.data, (float*)strokes.data, nstrokes, img.cols, img.rows, dark_on_light);
    }
}

main.cpp

#include <opencv2/opencv.hpp>
#include "swt.h"

using namespace cv;

int main(int, char** argv)
{
    Mat1b img = cv::imread("path_to_image", IMREAD_GRAYSCALE);

    // Compute SWT
    Mat1f strokes;
    int nstrokes;
    sw::swt(img, strokes, &nstrokes);

    // Create color table (one random color per stroke component, black for background)
    vector<Vec3b> colors(nstrokes + 1);
    colors[0] = Vec3b(0, 0, 0);
    RNG rng;
    for (int i = 0; i < nstrokes; ++i) {
        colors[i + 1] = Vec3b(rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255));
    }

    // Color strokes
    Mat3b coloredStrokes(strokes.size(), Vec3b(0, 0, 0));
    for (int r = 0; r < strokes.rows; ++r) {
        for (int c = 0; c < strokes.cols; ++c) {
            coloredStrokes(r, c) = colors[(int)strokes(r, c)];
        }
    }

    imshow("Strokes", coloredStrokes);
    waitKey();

    return 0;
}

1 Comment

Thanks Miki. It's quite helpful.

Since the text you are dealing with in the image has a very small font size, even before you address the OCR it would benefit you to resize the image (increase its size while maintaining the aspect ratio), then perform edge detection, and then the Stroke Width Transform.

Steps:

  • Resize the original image to magnify the small-font text in it (a plain-OpenCV sketch of such a resize follows this list)
  • Run SWT
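
If you'd rather not depend on the helper for the resize step, an aspect-ratio-preserving upscale takes only a few lines of plain OpenCV. This is just a minimal sketch: the function name resize_keep_ar, the factor-as-multiplier semantics, and the cubic interpolation are my choices here, not necessarily what swtloc's resize_maintinaAR does internally.

import cv2

def resize_keep_ar(img, factor=2.0):
    # Scale both dimensions by the same factor so the aspect ratio is preserved
    # (factor=2.0 doubles the size, mirroring width=2.0 in the call below).
    h, w = img.shape[:2]
    return cv2.resize(img, (int(w * factor), int(h * factor)),
                      interpolation=cv2.INTER_CUBIC)

Cubic interpolation tends to keep thin glyph edges sharper than the default bilinear when magnifying small fonts, which matters here because SWT measures stroke widths.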

After installing the swtloc library from pip (pip install swtloc), the transform can be run as follows.

Full disclosure: I wrote this library.


import cv2
import numpy as np

from swtloc import SWTLocalizer
from swtloc.utils import resize_maintinaAR

swtl = SWTLocalizer()
imgpath = rawimage_path + 'so3_img1.png'    # rawimage_path: folder holding the input image
r_imgpath = rawimage_path + 'so3_img11.jpg'

# Resize the original image, maintaining the aspect ratio
orig_img = cv2.imread(imgpath)
resized_img = resize_maintinaAR(orig_img, width=2.0)
print(f'Shape changed from {orig_img.shape} -> {resized_img.shape}')
cv2.imwrite(r_imgpath, resized_img)

# Run the Stroke Width Transform on the resized image
swtl.swttransform(imgpaths=r_imgpath, save_results=True, save_rootpath='swtres/',
                  edge_func='ac', ac_sigma=.33, text_mode='lb_df',
                  gs_blurr=True, blurr_kernel=(5, 5), minrsw=3,
                  maxCC_comppx=10000, maxrsw=10, max_angledev=np.pi/6,
                  acceptCC_aspectratio=5.0)

imgshow(swtl.swtlabelled_pruned13C)  # display helper
_ = cv2.imwrite(rawimage_path + 'so3_img11_processed.jpg', swtl.swtlabelled_pruned13C)

And the result:

[Stroke Width Transform] [Pruned image after SWT]

BBoxes

min_bboxes, min_bbox_annotated = swtl.get_min_bbox(show=True, padding=10) 

[minimum bounding boxes drawn on the image]
