Skip to content

Commit eeb5024

Browse files
committed
OCR Tesseract demo
1 parent 8342173 commit eeb5024

File tree

12 files changed

+240
-0
lines changed

12 files changed

+240
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,4 @@ UpgradeLog*.XML
136136
/WordsProcessing/MailMerge/.vs
137137
/ZipLibrary/CreateZipArchive/.vs
138138
/ZipLibrary/ExtractZipArchiveToDirectory/.vs
139+
/PdfProcessing/TesseractOcrProviderDemo/.vs
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<configuration>
3+
<startup>
4+
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.6.2" />
5+
</startup>
6+
<runtime>
7+
<assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
8+
<dependentAssembly>
  <!-- Keep this redirect in sync with the Telerik.Licensing package version
       referenced by the project (packages.config / .csproj use 1.4.6). -->
  <assemblyIdentity name="Telerik.Licensing.Runtime" publicKeyToken="98bb5b04e55c09ef" culture="neutral" />
  <bindingRedirect oldVersion="0.0.0.0-1.4.6.0" newVersion="1.4.6.0" />
</dependentAssembly>
12+
</assemblyBinding>
13+
</runtime>
14+
</configuration>
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Diagnostics;
4+
using System.Drawing;
5+
using System.IO;
6+
using System.Linq;
7+
using System.Text;
8+
using System.Threading.Tasks;
9+
using Telerik.Windows.Documents.Fixed.FormatProviders.Ocr;
10+
using Telerik.Windows.Documents.Fixed.FormatProviders.Pdf;
11+
using Telerik.Windows.Documents.Fixed.Model;
12+
using Telerik.Windows.Documents.Ocr;
13+
using Telerik.Windows.Documents.TesseractOcr;
14+
15+
namespace TesseractOcrProviderDemo
16+
{
17+
/// <summary>
/// Console demo: runs Tesseract OCR over a sample image, imports the image as a
/// <see cref="RadFixedPage"/> through <see cref="OcrFormatProvider"/>, exports the
/// resulting document to PDF and opens it with the shell's default handler.
/// </summary>
internal class Program
{
    /// <summary>
    /// Entry point of the demo.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // Requirement for Images in .NET Standard - https://docs.telerik.com/devtools/document-processing/libraries/radpdfprocessing/cross-platform/images
        //FixedExtensibilityManager.ImagePropertiesResolver = new ImagePropertiesResolver();

        // The executable runs from bin\<Configuration>\, so two levels up is the project directory.
        // Path.Combine inserts a separator only if BaseDirectory does not already end with one.
        string projectDirectory = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"..\..\");
        string imagePath = Path.Combine(projectDirectory, "images", "image.png");

        TesseractOcrProvider tesseractOcrProvider = new TesseractOcrProvider(".");
        tesseractOcrProvider.LanguageCodes = new List<string>() { "eng" };
        //tesseractOcrProvider.CorrectVerticalPosition = false; //Available only for .NET Standard
        tesseractOcrProvider.DataPath = projectDirectory;
        tesseractOcrProvider.ParseLevel = OcrParseLevel.Word;

        // Read the image once and reuse the bytes for both text-extraction calls.
        byte[] imageBytes = File.ReadAllBytes(imagePath);

        // The two calls below only showcase the provider API; their results are not used further.
        string imageText = tesseractOcrProvider.GetAllTextFromImage(imageBytes);
        Dictionary<Rectangle, string> imageTextAndTextDimensions = tesseractOcrProvider.GetTextFromImage(imageBytes);

        OcrFormatProvider ocrFormatProvider = new OcrFormatProvider(tesseractOcrProvider);
        PdfFormatProvider pdfFormatProvider = new PdfFormatProvider();
        RadFixedDocument document = new RadFixedDocument();
        string outputPath = "output.pdf";

        // Open the image read-only and make sure the handle is released.
        // The stream is deliberately kept open until the export has finished,
        // in case the imported page still reads from it while being exported.
        using (Stream imageStream = File.OpenRead(imagePath))
        {
            RadFixedPage page = ocrFormatProvider.Import(imageStream, null);
            document.Pages.Add(page);

            // File.Create truncates any previous output, so no separate delete is needed.
            using (Stream output = File.Create(outputPath))
            {
                pdfFormatProvider.Export(document, output, TimeSpan.FromSeconds(10));
            }
        }

        // Open the generated PDF with the application associated with .pdf files.
        var psi = new ProcessStartInfo()
        {
            FileName = outputPath,
            UseShellExecute = true
        };
        Process.Start(psi);
    }
}
59+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
using System.Reflection;
2+
using System.Runtime.CompilerServices;
3+
using System.Runtime.InteropServices;
4+
5+
// General Information about an assembly is controlled through the following
6+
// set of attributes. Change these attribute values to modify the information
7+
// associated with an assembly.
8+
[assembly: AssemblyTitle("TesseractOcrProviderDemo")]
9+
[assembly: AssemblyDescription("")]
10+
[assembly: AssemblyConfiguration("")]
11+
[assembly: AssemblyCompany("Progress")]
12+
[assembly: AssemblyProduct("TesseractOcrProviderDemo")]
13+
[assembly: AssemblyCopyright("Copyright © Progress 2025")]
14+
[assembly: AssemblyTrademark("")]
15+
[assembly: AssemblyCulture("")]
16+
17+
// Setting ComVisible to false makes the types in this assembly not visible
18+
// to COM components. If you need to access a type in this assembly from
19+
// COM, set the ComVisible attribute to true on that type.
20+
[assembly: ComVisible(false)]
21+
22+
// The following GUID is for the ID of the typelib if this project is exposed to COM
23+
[assembly: Guid("4926b7c9-e2cd-4233-a3c7-b1486fff6524")]
24+
25+
// Version information for an assembly consists of the following four values:
26+
//
27+
// Major Version
28+
// Minor Version
29+
// Build Number
30+
// Revision
31+
//
32+
[assembly: AssemblyVersion("1.0.0.0")]
33+
[assembly: AssemblyFileVersion("1.0.0.0")]
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
## TesseractOcrProvider
2+
3+
Since Q1 2025, the RadPdfProcessing library supports Optical Character Recognition (OCR). OCR is the electronic or mechanical conversion of images of typed, handwritten, or printed text (for example, from a scanned document) into machine-encoded text.
4+
This project demonstrates how to use the [TesseractOcrProvider](https://docs.telerik.com/devtools/document-processing/libraries/radpdfprocessing/formats-and-conversion/ocr/ocrformatprovider) to extract text from an image, import that image as a RadFixedPage, and export the result to a PDF document.
5+
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3+
<Import Project="packages\Telerik.Licensing.1.4.6\build\Telerik.Licensing.props" Condition="Exists('packages\Telerik.Licensing.1.4.6\build\Telerik.Licensing.props')" />
4+
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
5+
<PropertyGroup>
6+
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
7+
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
8+
<ProjectGuid>{4926B7C9-E2CD-4233-A3C7-B1486FFF6524}</ProjectGuid>
9+
<OutputType>Exe</OutputType>
10+
<RootNamespace>TesseractOcrProviderDemo</RootNamespace>
11+
<AssemblyName>TesseractOcrProviderDemo</AssemblyName>
12+
<TargetFrameworkVersion>v4.6.2</TargetFrameworkVersion>
13+
<FileAlignment>512</FileAlignment>
14+
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
15+
<Deterministic>true</Deterministic>
16+
<NuGetPackageImportStamp>
17+
</NuGetPackageImportStamp>
18+
</PropertyGroup>
19+
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
20+
<PlatformTarget>AnyCPU</PlatformTarget>
21+
<DebugSymbols>true</DebugSymbols>
22+
<DebugType>full</DebugType>
23+
<Optimize>false</Optimize>
24+
<OutputPath>bin\Debug\</OutputPath>
25+
<DefineConstants>DEBUG;TRACE</DefineConstants>
26+
<ErrorReport>prompt</ErrorReport>
27+
<WarningLevel>4</WarningLevel>
28+
</PropertyGroup>
29+
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
30+
<PlatformTarget>AnyCPU</PlatformTarget>
31+
<DebugType>pdbonly</DebugType>
32+
<Optimize>true</Optimize>
33+
<OutputPath>bin\Release\</OutputPath>
34+
<DefineConstants>TRACE</DefineConstants>
35+
<ErrorReport>prompt</ErrorReport>
36+
<WarningLevel>4</WarningLevel>
37+
</PropertyGroup>
38+
<ItemGroup>
39+
<Reference Include="System" />
40+
<Reference Include="System.Core" />
41+
<Reference Include="System.Drawing" />
42+
<Reference Include="System.Xml.Linq" />
43+
<Reference Include="System.Data.DataSetExtensions" />
44+
<Reference Include="Microsoft.CSharp" />
45+
<Reference Include="System.Data" />
46+
<Reference Include="System.Net.Http" />
47+
<Reference Include="System.Xml" />
48+
<Reference Include="Telerik.Licensing.Runtime, Version=1.4.6.0, Culture=neutral, PublicKeyToken=98bb5b04e55c09ef, processorArchitecture=MSIL">
49+
<HintPath>packages\Telerik.Licensing.1.4.6\lib\net462\Telerik.Licensing.Runtime.dll</HintPath>
50+
</Reference>
51+
<Reference Include="Telerik.Windows.Documents.Core, Version=2025.1.205.462, Culture=neutral, PublicKeyToken=5803cfa389c90ce7, processorArchitecture=MSIL">
52+
<SpecificVersion>False</SpecificVersion>
53+
<HintPath>Trial 4.6.2\Telerik.Windows.Documents.Core.dll</HintPath>
54+
</Reference>
55+
<Reference Include="Telerik.Windows.Documents.Fixed, Version=2025.1.205.462, Culture=neutral, PublicKeyToken=5803cfa389c90ce7, processorArchitecture=MSIL">
56+
<SpecificVersion>False</SpecificVersion>
57+
<HintPath>Trial 4.6.2\Telerik.Windows.Documents.Fixed.dll</HintPath>
58+
</Reference>
59+
<Reference Include="Telerik.Windows.Documents.Fixed.FormatProviders.Ocr, Version=2025.1.205.462, Culture=neutral, PublicKeyToken=5803cfa389c90ce7, processorArchitecture=MSIL">
60+
<SpecificVersion>False</SpecificVersion>
61+
<HintPath>Trial 4.6.2\Telerik.Windows.Documents.Fixed.FormatProviders.Ocr.dll</HintPath>
62+
</Reference>
63+
<Reference Include="Telerik.Windows.Documents.TesseractOcr, Version=2025.1.205.462, Culture=neutral, PublicKeyToken=5803cfa389c90ce7, processorArchitecture=MSIL">
64+
<SpecificVersion>False</SpecificVersion>
65+
<HintPath>Trial 4.6.2\Telerik.Windows.Documents.TesseractOcr.dll</HintPath>
66+
</Reference>
67+
<Reference Include="Telerik.Windows.Zip, Version=2025.1.205.462, Culture=neutral, PublicKeyToken=5803cfa389c90ce7, processorArchitecture=MSIL">
68+
<SpecificVersion>False</SpecificVersion>
69+
<HintPath>Trial 4.6.2\Telerik.Windows.Zip.dll</HintPath>
70+
</Reference>
71+
<Reference Include="Tesseract, Version=5.2.0.0, Culture=neutral, PublicKeyToken=5803cfa389c90ce7, processorArchitecture=MSIL">
72+
<SpecificVersion>False</SpecificVersion>
73+
<HintPath>Trial 4.6.2\Tesseract.dll</HintPath>
74+
</Reference>
75+
</ItemGroup>
76+
<ItemGroup>
77+
<Compile Include="Program.cs" />
78+
<Compile Include="Properties\AssemblyInfo.cs" />
79+
</ItemGroup>
80+
<ItemGroup>
81+
<None Include="App.config" />
82+
<None Include="packages.config" />
83+
</ItemGroup>
84+
<ItemGroup>
85+
<Content Include="images\image.png" />
86+
<Content Include="Trial 4.6.2\Telerik.Windows.Documents.Core.dll" />
87+
<Content Include="Trial 4.6.2\Telerik.Windows.Documents.Fixed.dll" />
88+
<Content Include="Trial 4.6.2\Telerik.Windows.Documents.Fixed.FormatProviders.Ocr.dll" />
89+
<Content Include="Trial 4.6.2\Telerik.Windows.Documents.TesseractOcr.dll" />
90+
<Content Include="Trial 4.6.2\Telerik.Windows.Zip.dll" />
91+
<Content Include="Trial 4.6.2\Tesseract.dll" />
92+
</ItemGroup>
93+
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
94+
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
95+
<PropertyGroup>
96+
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
97+
</PropertyGroup>
98+
<Error Condition="!Exists('packages\Telerik.Licensing.1.4.6\build\Telerik.Licensing.props')" Text="$([System.String]::Format('$(ErrorText)', 'packages\Telerik.Licensing.1.4.6\build\Telerik.Licensing.props'))" />
99+
<Error Condition="!Exists('packages\Telerik.Licensing.1.4.6\build\Telerik.Licensing.targets')" Text="$([System.String]::Format('$(ErrorText)', 'packages\Telerik.Licensing.1.4.6\build\Telerik.Licensing.targets'))" />
100+
</Target>
101+
<Import Project="packages\Telerik.Licensing.1.4.6\build\Telerik.Licensing.targets" Condition="Exists('packages\Telerik.Licensing.1.4.6\build\Telerik.Licensing.targets')" />
102+
</Project>
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio Version 17
4+
VisualStudioVersion = 17.12.35424.110 d17.12
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TesseractOcrProviderDemo", "TesseractOcrProviderDemo.csproj", "{4926B7C9-E2CD-4233-A3C7-B1486FFF6524}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{4926B7C9-E2CD-4233-A3C7-B1486FFF6524}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{4926B7C9-E2CD-4233-A3C7-B1486FFF6524}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{4926B7C9-E2CD-4233-A3C7-B1486FFF6524}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{4926B7C9-E2CD-4233-A3C7-B1486FFF6524}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
EndGlobal
166 KB
Loading
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<packages>
3+
<package id="Telerik.Licensing" version="1.4.6" targetFramework="net462" />
4+
</packages>
12.7 MB
Binary file not shown.

0 commit comments

Comments
 (0)