Skip to content
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
## 0.22.1
## 0.22.3

### Enhancements
- **Reduce langdetect memory**: Load only 15 common language profiles instead of all 55, saving ~44 MiB peak memory (-58%). Covers >95% of web content by language share.

### Fixes
- **Security update**: Bumped dependencies to address security vulnerabilities
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.1" # pragma: no cover
__version__ = "0.22.3" # pragma: no cover
46 changes: 46 additions & 0 deletions unstructured/partition/common/lang.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Callable, Iterable, Iterator, Optional

import iso639 # pyright: ignore[reportMissingTypeStubs]
import langdetect.detector_factory as _ldf
from langdetect import ( # pyright: ignore[reportMissingTypeStubs]
DetectorFactory,
detect_langs, # pyright: ignore[reportUnknownVariableType]
Expand All @@ -18,6 +19,51 @@
TESSERACT_LANGUAGES_SPLITTER,
)

# Patch langdetect to load only 15 common language profiles instead of all 55.
# Cuts n-gram probability map memory by ~77% (58 MiB -> 14 MiB). Documents in
# excluded languages still get a result — the closest loaded profile matches.
LANGDETECT_LANGUAGES = frozenset(
{
"en",
"es",
"ar",
"fr",
"de",
"it",
"pt",
"ru",
"ja",
"ko",
"zh-cn",
"zh-tw",
"hi",
"bn",
"id",
}
)


def init_langdetect_with_subset():
"""Load only common language profiles into langdetect's DetectorFactory."""
if _ldf._factory is not None:
return

import json
from pathlib import Path

from langdetect.utils.lang_profile import LangProfile

factory = _ldf.DetectorFactory()
profile_dir = Path(_ldf.PROFILES_DIRECTORY)
files = sorted(f.name for f in profile_dir.iterdir() if f.name in LANGDETECT_LANGUAGES)
for index, filename in enumerate(files):
with open(profile_dir / filename, encoding="utf-8") as fh:
factory.add_profile(LangProfile(**json.load(fh)), index, len(files))
_ldf._factory = factory


_ldf.init_factory = init_langdetect_with_subset

_ASCII_RE = re.compile(r"^[\x00-\x7F]+$")

# pytesseract.get_languages(config="") only shows user installed language packs,
Expand Down
Loading