Skip to content

Commit aa5c668

Browse files
authored
feat(experimental): extend IDocTags tokens (#439)
feat: extend IDocTags tokens [WIP]

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
1 parent 92d60b0 commit aa5c668

File tree

4 files changed

+54
-13
lines changed

4 files changed

+54
-13
lines changed

docling_core/experimental/idoctags.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Define classes for DocTags serialization."""
22

3+
from enum import Enum
34
from typing import Any, Final, Optional
45
from xml.dom.minidom import parseString
56

@@ -10,13 +11,15 @@
1011
BaseDocSerializer,
1112
BaseMetaSerializer,
1213
BasePictureSerializer,
14+
BaseTableSerializer,
1315
SerializationResult,
1416
)
1517
from docling_core.transforms.serializer.common import create_ser_result
1618
from docling_core.transforms.serializer.doctags import (
1719
DocTagsDocSerializer,
1820
DocTagsParams,
1921
DocTagsPictureSerializer,
22+
DocTagsTableSerializer,
2023
_get_delim,
2124
_wrap,
2225
)
@@ -40,6 +43,25 @@
4043
DOCTAGS_VERSION: Final = "1.0.0"
4144

4245

46+
class IDocTagsTableToken(str, Enum):
    """LLM-friendly table tokens, written in self-closing form (``<tag/>``).

    Exposes cell-label tokens (``CELL_LABEL_*``) and OTSL structure tokens
    (``OTSL_*``). Because the class also derives from ``str``, every member
    compares equal to its tag text.
    """

    # Cell-label tokens.
    CELL_LABEL_COLUMN_HEADER = "<column_header/>"
    CELL_LABEL_ROW_HEADER = "<row_header/>"
    CELL_LABEL_SECTION_HEADER = "<shed/>"
    CELL_LABEL_DATA = "<data/>"

    # OTSL structure tokens.
    OTSL_ECEL = "<ecel/>"  # empty cell
    OTSL_FCEL = "<fcel/>"  # cell with content
    OTSL_LCEL = "<lcel/>"  # left-looking cell
    OTSL_UCEL = "<ucel/>"  # up-looking cell
    OTSL_XCEL = "<xcel/>"  # 2D extension cell (cross cell)
    OTSL_NL = "<nl/>"  # new line
    OTSL_CHED = "<ched/>"  # column header cell
    OTSL_RHED = "<rhed/>"  # row header cell
    OTSL_SROW = "<srow/>"  # section row cell
63+
64+
4365
class IDocTagsParams(DocTagsParams):
4466
"""DocTags-specific serialization parameters."""
4567

@@ -166,6 +188,7 @@ def serialize(
166188
temp_doc,
167189
add_cell_location=False,
168190
self_closing=params.do_self_closing,
191+
table_token=IDocTagsTableToken,
169192
)
170193
body += otsl_content
171194
res_parts.append(create_ser_result(text=body, span_source=item))
@@ -184,11 +207,19 @@ def serialize(
184207
return create_ser_result(text=text_res, span_source=res_parts)
185208

186209

210+
class IDocTagsTableSerializer(DocTagsTableSerializer):
    """IDocTags-specific table item serializer.

    Only the table-token enum is overridden, so that ``IDocTagsTableToken``
    is used in place of the enum returned by the parent class.
    """

    def _get_table_token(self) -> Any:
        """Return the token enum to use for tables: ``IDocTagsTableToken``."""
        return IDocTagsTableToken
215+
216+
187217
class IDocTagsDocSerializer(DocTagsDocSerializer):
188218
"""DocTags document serializer."""
189219

190220
picture_serializer: BasePictureSerializer = IDocTagsPictureSerializer()
191221
meta_serializer: BaseMetaSerializer = IDocTagsMetaSerializer()
222+
table_serializer: BaseTableSerializer = IDocTagsTableSerializer()
192223
params: IDocTagsParams = IDocTagsParams()
193224

194225
@override
@@ -207,7 +238,7 @@ def serialize_doc(
207238
text_res = delim.join([p.text for p in parts if p.text])
208239

209240
if self.params.add_page_break:
210-
page_sep = f"<{DocumentToken.PAGE_BREAK.value}>"
241+
page_sep = f"<{DocumentToken.PAGE_BREAK.value}{'/' if self.params.do_self_closing else ''}>"
211242
for full_match, _, _ in self._get_page_breaks(text=text_res):
212243
text_res = text_res.replace(full_match, page_sep)
213244

docling_core/transforms/serializer/doctags.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
TextItem,
5151
)
5252
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
53-
from docling_core.types.doc.tokens import DocumentToken
53+
from docling_core.types.doc.tokens import DocumentToken, TableToken
5454

5555

5656
def _wrap(text: str, wrap_tag: str) -> str:
@@ -147,6 +147,7 @@ def serialize(
147147
if isinstance(item, CodeItem):
148148
language_token = DocumentToken.get_code_language_token(
149149
code_language=item.code_language,
150+
self_closing=params.do_self_closing,
150151
)
151152
text_part = f"{language_token}{text_part}"
152153
else:
@@ -171,6 +172,9 @@ def serialize(
171172
class DocTagsTableSerializer(BaseTableSerializer):
172173
"""DocTags-specific table item serializer."""
173174

175+
def _get_table_token(self) -> Any:
    """Return the token enum used for table export (``TableToken`` here).

    Kept as a method so that subclasses can supply a different token enum.
    """
    return TableToken
177+
174178
@override
175179
def serialize(
176180
self,
@@ -203,6 +207,7 @@ def serialize(
203207
xsize=params.xsize,
204208
ysize=params.ysize,
205209
visited=visited,
210+
table_token=self._get_table_token(),
206211
)
207212
res_parts.append(create_ser_result(text=otsl_text, span_source=item))
208213

docling_core/types/doc/document.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2267,6 +2267,8 @@ def export_to_otsl(
22672267

22682268
from docling_core.transforms.serializer.doctags import DocTagsDocSerializer
22692269

2270+
table_token = kwargs.get("table_token", TableToken)
2271+
22702272
doc_serializer = DocTagsDocSerializer(doc=doc)
22712273
body = []
22722274
nrows = self.data.num_rows
@@ -2309,34 +2311,34 @@ def export_to_otsl(
23092311
if rowstart == i and colstart == j:
23102312
if len(content) > 0:
23112313
if cell.column_header:
2312-
body.append(str(TableToken.OTSL_CHED.value))
2314+
body.append(str(table_token.OTSL_CHED.value))
23132315
elif cell.row_header:
2314-
body.append(str(TableToken.OTSL_RHED.value))
2316+
body.append(str(table_token.OTSL_RHED.value))
23152317
elif cell.row_section:
2316-
body.append(str(TableToken.OTSL_SROW.value))
2318+
body.append(str(table_token.OTSL_SROW.value))
23172319
else:
2318-
body.append(str(TableToken.OTSL_FCEL.value))
2320+
body.append(str(table_token.OTSL_FCEL.value))
23192321
if add_cell_location:
23202322
body.append(str(cell_loc))
23212323
if add_cell_text:
23222324
body.append(str(content))
23232325
else:
2324-
body.append(str(TableToken.OTSL_ECEL.value))
2326+
body.append(str(table_token.OTSL_ECEL.value))
23252327
else:
23262328
add_cross_cell = False
23272329
if rowstart != i:
23282330
if colspan == 1:
2329-
body.append(str(TableToken.OTSL_UCEL.value))
2331+
body.append(str(table_token.OTSL_UCEL.value))
23302332
else:
23312333
add_cross_cell = True
23322334
if colstart != j:
23332335
if rowspan == 1:
2334-
body.append(str(TableToken.OTSL_LCEL.value))
2336+
body.append(str(table_token.OTSL_LCEL.value))
23352337
else:
23362338
add_cross_cell = True
23372339
if add_cross_cell:
2338-
body.append(str(TableToken.OTSL_XCEL.value))
2339-
body.append(str(TableToken.OTSL_NL.value))
2340+
body.append(str(table_token.OTSL_XCEL.value))
2341+
body.append(str(table_token.OTSL_NL.value))
23402342
body_str = "".join(body)
23412343
return body_str
23422344

docling_core/types/doc/tokens.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,9 +257,12 @@ def get_picture_classification_token(classification: str) -> str:
257257
return _PictureClassificationToken(f"<{classification}>").value
258258

259259
@staticmethod
def get_code_language_token(code_language: str, self_closing: bool = False) -> str:
    """Return the token for a given code language.

    Args:
        code_language: Name of the code language.
        self_closing: If True, return the self-closing form
            ``<{code_language}/>``. If False (the default), return the value
            of the ``_CodeLanguageToken`` member matching
            ``<_{code_language}_>``.

    Returns:
        The token string for the language.
    """
    if self_closing:
        # NOTE(review): this branch formats the tag directly and does not go
        # through _CodeLanguageToken, so the language name is not checked
        # here -- confirm that this is intended.
        return f"<{code_language}/>"
    else:
        return _CodeLanguageToken(f"<_{code_language}_>").value
263266

264267
@staticmethod
265268
def get_location_token(

0 commit comments

Comments
 (0)