I spent some time answering this question, as I saw a few people asking the same thing.
The solution I used here is to pre-process the image with OpenCV before running Tesseract. After that, some rearrangement of the results is needed. Sorry, my code is quite long; I think someone could make it shorter, but it gets the job done. I can't explain the code line by line, but I added comments that I hope give a general idea of what is going on.
import cv2
import numpy as np
import pytesseract

# Tell pytesseract where the Tesseract executable is installed.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract"
Reading the image and filtering
# Load the table image from disk.
table = cv2.imread("Table.png")
if table is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # rather than raising, so fail here with a clear message instead of
    # letting the next OpenCV call fail on an empty input.
    raise FileNotFoundError("Could not read image 'Table.png'")

# Pad the image with a 20-pixel white border on every side.
table = cv2.copyMakeBorder(
    table, 20, 20, 20, 20, cv2.BORDER_CONSTANT, value=[255, 255, 255]
)
Removing noise from the image
# Convert to grayscale and smooth with a 3x3 Gaussian kernel.
# Only sigmaX is passed (0 = derive sigma from the kernel size); sigmaY is
# left at its default because a fourth positional argument would bind to
# `dst`, not `sigmaY`, in the Python signature.
table_c = cv2.GaussianBlur(cv2.cvtColor(table, cv2.COLOR_BGR2GRAY), (3, 3), 0)

# Binarise with a fixed threshold: pixels above 200 become 255, the rest 0.
# NOTE: a fifth positional argument to cv2.threshold is `dst`, so passing
# cv2.THRESH_OTSU there never enables Otsu. To use Otsu it must be OR-ed into
# the type flag (cv2.THRESH_BINARY | cv2.THRESH_OTSU); the fixed threshold is
# kept here so the result stays the same as before.
_, thre = cv2.threshold(table_c, 200, 255, cv2.THRESH_BINARY)
Getting only the rows in the image
# Morphological closing with a wide, flat (100x1) rectangular kernel; the
# contours of the result are used as the table's row bands.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (100, 1))
morph = cv2.morphologyEx(thre, cv2.MORPH_CLOSE, kernel)
contours, h = cv2.findContours(morph, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

# One bounding rectangle (x, y, w, h) per simplified contour, ordered from
# top to bottom by the rectangle's y coordinate.
rows = [
    cv2.boundingRect(cv2.approxPolyDP(contour, 3, True))
    for contour in contours
]
rows.sort(key=lambda box: box[1])
Getting only the columns in the image
# Morphological closing with a tall, thin (1x50) rectangular kernel; the
# contours of the result are used as the table's column bands.
kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 50))
morph2 = cv2.morphologyEx(thre, cv2.MORPH_CLOSE, kernel2)
contours, h = cv2.findContours(morph2, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

# Outline the first contour (index 0) on `table` in green, 3 px thick.
# NOTE(review): `table` is also the image that is cropped for OCR further
# down, so this outline ends up in the OCR input - confirm it is intended
# and not a debugging leftover.
table = cv2.drawContours(table, contours, 0, (0, 255, 0), 3)

# One bounding rectangle (x, y, w, h) per simplified contour, ordered from
# left to right by the rectangle's x coordinate.
cols = [
    cv2.boundingRect(cv2.approxPolyDP(contour, 3, True))
    for contour in contours
]
cols.sort(key=lambda box: box[0])
Removing the rows and columns and keeping only the text
# Invert the binary image: every non-zero pixel becomes 0, zero becomes 255.
_, thre2 = cv2.threshold(thre, 0, 255, cv2.THRESH_BINARY_INV)

# Keep only the pixels that are set in the inverted image and in both
# closed images (row-closed `morph` and column-closed `morph2`).
no_table = cv2.bitwise_and(morph2, cv2.bitwise_and(morph, thre2))

# Close with a small 10x2 kernel to produce the mask whose contours are
# searched for text boxes.
kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 2))
mask = cv2.morphologyEx(no_table, cv2.MORPH_CLOSE, kernel2)
Getting each piece of text in a box
# Find the contours of the text mask and record, for each one, its
# simplified polygon and that polygon's bounding rectangle (x, y, w, h).
contours, h = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
contours_poly = []
boundRect = []
for i, c in enumerate(contours):
    poly = cv2.approxPolyDP(c, 3, True)
    contours_poly.append(poly)
    boundRect.append(cv2.boundingRect(poly))
    # Debug aid: draw each box on the image.
    # cv2.rectangle(table, (int(boundRect[i][0]), int(boundRect[i][1])),
    #               (int(boundRect[i][0]+boundRect[i][2]), int(boundRect[i][1]+boundRect[i][3])), (0,0,255), 2)
# Debug aid: draw every contour on the image.
# table = cv2.drawContours(table, contours, -1, (0,255,0), 3)
Cropping each box and recognising the text
Getting the row and the column of each text and its position in the image
def _band_index(bands, axis, value):
    """Return the index of the band that contains ``value`` along ``axis``.

    ``bands`` is a sequence of ``(x, y, w, h)`` rectangles sorted in ascending
    order of ``band[axis]`` (0 for x, 1 for y). Band ``k`` spans from its own
    start up to the start of band ``k + 1``; the last band has no upper
    bound. Returns ``None`` when ``value`` lies before the first band or when
    ``bands`` is empty.
    """
    found = None
    for idx, band in enumerate(bands):
        if band[axis] > value:
            break
        found = idx
    return found


# For every sufficiently large text box: find the row/column band it falls
# in, OCR a padded crop of it, and record text, band indices and position.
text_position = []
offest = 10  # padding in pixels added around each box before OCR
boundingBoxes = sorted(boundRect, key=lambda b: b[0])
for rect in boundingBoxes:
    box_x, box_y, box_w, box_h = rect

    # Ignore boxes that are too small to hold text.
    if not (box_w > 30 and box_h > 10):
        continue

    # Looking up the band by its start avoids indexing one past the end of
    # `rows`/`cols` for boxes in the last band, and never reuses an index
    # left over from a previous box.
    row_idx = _band_index(rows, 1, box_y)
    col_idx = _band_index(cols, 0, box_x)
    if row_idx is None or col_idx is None:
        # The box cannot be placed in the table grid; skip it.
        continue

    # Clamp the crop start at 0: a negative start index would wrap around
    # to the far edge of the image instead of stopping at the border.
    top = max(box_y - offest, 0)
    left = max(box_x - offest, 0)
    image = table[top:box_y + box_h + offest, left:box_x + box_w + offest]
    text = pytesseract.image_to_string(image)

    # Only the first line of the OCR result is kept.
    text_position.append({
        'Text': text.split("\n")[0],
        "row": row_idx,
        'col': col_idx,
        "X": box_x,
        "Y": box_y,
    })
Combining text in the same row and column
# Merge text fragments that landed in the same table cell (same row and col).
#
# Fragments are grouped by (row, col) and the list is rebuilt from the
# groups. This avoids removing entries by position while the list is being
# modified (which shifts the remaining indices) and builds each merged text
# from that cell's fragments only.
cells = {}
for entry in text_position:
    # dicts keep insertion order, so cells stay in first-seen order.
    cells.setdefault((entry["row"], entry["col"]), []).append(entry)

single_entries = []
merged_entries = []
for fragments in cells.values():
    if len(fragments) == 1:
        single_entries.append(fragments[0])
        continue

    # Read the fragments of a cell from top to bottom.
    fragments = sorted(fragments, key=lambda b: b["Y"])
    # Each fragment is followed by a space, so the merged text keeps the
    # trailing space this step has always produced.
    text = "".join(fragment["Text"] + " " for fragment in fragments)
    merged_entries.append({
        'Text': text,
        'row': fragments[0]["row"],
        'col': fragments[0]["col"],
        'X': fragments[0]["X"],
        'Y': fragments[-1]["Y"],
    })

# Update the list in place: untouched cells first, merged cells at the end.
text_position[:] = single_entries + merged_entries
The final output will be a list of dictionaries, each containing the text, row, and column of each cell in the table, like below:
[{'Text': 'Jane Doe', 'row': 3, 'col': 1, 'X': 67, 'Y': 167}, {'Text': 'John Smith', 'row': 2, 'col': 1, 'X': 67, 'Y': 86}, {'Text': 'Name', 'row': 1, 'col': 1, 'X': 68, 'Y': 59}, {'Text': '07 March, 2017', 'row': 3, 'col': 2, 'X': 301, 'Y': 167}, {'Text': '07 March, 2017', 'row': 2, 'col': 2, 'X': 301, 'Y': 86}, {'Text': ' ', 'row': 1, 'col': 2, 'X': 302, 'Y': 59}, {'Text': 'Los Angeles', 'row': 3, 'col': 3, 'X': 536, 'Y': 167}, {'Text': 'Detroit', 'row': 2, 'col': 3, 'X': 536, 'Y': 140}, {'Text': 'Locations', 'row': 1, 'col': 3, 'X': 536, 'Y': 58}, {'Text': 'Currently in', 'row': 1, 'col': 4, 'X': 769, 'Y': 58}, {'Text': 'Pacific Ocean', 'row': 2, 'col': 4, 'X': 770, 'Y': 85}, {'Text': 'Chicago Milwaukee Detroit ', 'row': 2, 'col': 3, 'X': 535, 'Y': 140}
`--psm 6` usually does the trick?