OCR: Получение и редактирование результатов OCR.
В этом разделе
Результат распознавания текста на изображении возвращается в виде OCR страницы (экземпляр класса
OcrPage). OCR страница содержит текстовые области (экземпляры класса
OcrTextRegion). Текстовая область содержит абзацы (экземпляры класса
OcrParagraph). Абзац содержит текстовые строки (экземпляры класса
OcrTextLine). Текстовая строка содержит слова (экземпляры класса
OcrWord). Слово содержит символы (экземпляры класса
OcrSymbol).
Все элементы (
OcrPage,
OcrTextRegion,
OcrParagraph,
OcrTextLine,
OcrWord,
OcrSymbol) содержат:
Класс
OcrWord также содержит имя шрифта распознанного текста (
OcrWord.Font).
Класс
OcrPage позволяет получить результат распознавания в виде форматированного текста (
OcrPage.GetFormattedText) и неформатированного текста (
OcrPage.GetText).
Часто необходимо обработать результаты распознавания, например, удалить слова с низкой достоверностью или объединить два и более результатов в один.
Класс
OcrResultsEditor предназначен для такой обработки результатов OCR.
Вот C#/VB.NET код, который демонстрирует, как удалить слова с низкой достоверностью из результатов распознавания текста:
/// <summary>
/// Recognizes text in every image of the specified file,
/// removes the words whose confidence is below the minimum confidence and
/// returns the remaining recognized text.
/// </summary>
/// <param name="filename">The name of the file containing image to OCR.</param>
/// <returns>
/// The filtered text of all images; the text of each image is followed by a line break.
/// </returns>
public string RecognizeTextAndFilterRecognitionResult(string filename)
{
    // words recognized with a confidence below this value are removed
    const float MinConfidence = 75.0f;

    // create image collection
    using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
    {
        try
        {
            // add images from file to image collection
            images.Add(filename);

            // create tesseract OCR engine
            using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
            {
                // initialize the engine for English text
                Vintasoft.Imaging.Ocr.OcrEngineSettings settings =
                    new Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English);
                tesseractOcr.Init(settings);

                // accumulates the filtered text of all images
                System.Text.StringBuilder result = new System.Text.StringBuilder();

                // for each image in image collection
                foreach (Vintasoft.Imaging.VintasoftImage image in images)
                {
                    // recognize the image
                    Vintasoft.Imaging.Ocr.Results.OcrPage page = tesseractOcr.Recognize(image);

                    // get all words in recognized text
                    Vintasoft.Imaging.Ocr.Results.OcrObject[] ocrObjects = page.GetObjects(
                        Vintasoft.Imaging.Ocr.OcrObjectType.Word);

                    // collect the words whose confidence is less than the minimum confidence
                    System.Collections.Generic.List<Vintasoft.Imaging.Ocr.Results.OcrObject> removeObjects =
                        new System.Collections.Generic.List<Vintasoft.Imaging.Ocr.Results.OcrObject>();
                    foreach (Vintasoft.Imaging.Ocr.Results.OcrObject word in ocrObjects)
                    {
                        if (word.Confidence < MinConfidence)
                        {
                            removeObjects.Add(word);
                        }
                    }

                    // remove the collected words from the page and validate the edited results
                    Vintasoft.Imaging.Ocr.Results.OcrResultsEditor editor =
                        new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(page);
                    editor.RemoveObjects(removeObjects.ToArray());
                    editor.ValidateResults();

                    // append the text that is left on the page, followed by a line break
                    result.Append(page.GetText());
                    result.AppendLine();
                }

                // return result
                return result.ToString();
            }
        }
        finally
        {
            // dispose images and clear image collection; done in "finally" so that
            // the images are released even if loading or recognition throws
            // NOTE(review): presumably disposing the collection alone does not
            // dispose the images it contains - confirm against the SDK reference
            images.ClearAndDisposeItems();
        }
    }
}
''' <summary>
''' Recognizes text in every image of the specified file,
''' removes the words whose confidence is below the minimum confidence and
''' returns the remaining recognized text.
''' </summary>
''' <param name="filename">The name of the file containing image to OCR.</param>
''' <returns>
''' The filtered text of all images; the text of each image is followed by a line break.
''' </returns>
Public Function RecognizeTextAndFilterRecognitionResult(filename As String) As String
    ' words recognized with a confidence below this value are removed
    Const MinConfidence As Single = 75F

    ' create image collection
    Using images As New Vintasoft.Imaging.ImageCollection()
        Try
            ' add images from file to image collection
            images.Add(filename)

            ' create tesseract OCR engine
            Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                ' initialize the engine for English text
                Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English)
                tesseractOcr.Init(settings)

                ' accumulates the filtered text of all images
                Dim result As New System.Text.StringBuilder()

                ' for each image in image collection
                For Each image As Vintasoft.Imaging.VintasoftImage In images
                    ' recognize the image
                    Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize(image)

                    ' get all words in recognized text
                    Dim ocrObjects As Vintasoft.Imaging.Ocr.Results.OcrObject() = page.GetObjects(Vintasoft.Imaging.Ocr.OcrObjectType.Word)

                    ' collect the words whose confidence is less than the minimum confidence
                    Dim removeObjects As New System.Collections.Generic.List(Of Vintasoft.Imaging.Ocr.Results.OcrObject)()
                    For Each word As Vintasoft.Imaging.Ocr.Results.OcrObject In ocrObjects
                        If word.Confidence < MinConfidence Then
                            removeObjects.Add(word)
                        End If
                    Next

                    ' remove the collected words from the page and validate the edited results
                    Dim editor As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(page)
                    editor.RemoveObjects(removeObjects.ToArray())
                    editor.ValidateResults()

                    ' append the text that is left on the page, followed by a line break
                    result.Append(page.GetText())
                    result.AppendLine()
                Next

                ' return result
                Return result.ToString()
            End Using
        Finally
            ' dispose images and clear image collection; done in "Finally" so that
            ' the images are released even if loading or recognition throws
            ' NOTE(review): presumably disposing the collection alone does not
            ' dispose the images it contains - confirm against the SDK reference
            images.ClearAndDisposeItems()
        End Try
    End Using
End Function
Вот C#/VB.NET код, который демонстрирует, как распознать текст в двух изображениях (с разным разрешением) одного и того же документа и объединить результаты распознавания:
// PDF document whose first page is recognized at several scales
string imageFilePath = @"D:\TestImage.pdf";

// primary OCR engine, disposed by the using statement
using (var primaryEngine = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
{
    // slots for two additional OCR engines; they are filled inside the "try" block
    // so that the "finally" block can dispose whichever engines were created
    var extraEngines = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr[2];
    try
    {
        // create the additional OCR engines
        for (int slot = 0; slot < extraEngines.Length; slot++)
        {
            extraEngines[slot] = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr();
        }

        // engine manager built from the primary engine and the additional engines
        var engineManager = new Vintasoft.Imaging.Ocr.OcrEngineManager(primaryEngine, extraEngines);

        // open the PDF document
        using (var pdfDocument = new Vintasoft.Imaging.Pdf.PdfDocument(imageFilePath))
        {
            // OCR settings: German text will be recognized
            var settings = new Vintasoft.Imaging.Ocr.OcrEngineSettings(
                Vintasoft.Imaging.Ocr.OcrLanguage.German);

            // nothing to do for a document without pages
            if (pdfDocument.Pages.Count == 0)
            {
                return;
            }

            // only the first PDF page is processed
            Vintasoft.Imaging.Pdf.Tree.PdfPage pdfPage = pdfDocument.Pages[0];

            // scale factors applied to the page before each recognition pass
            float[] scales = new float[] { 0.5f, 1.5f };

            // one OCR result per scale factor
            var scaledOcrResults = new Vintasoft.Imaging.Ocr.Results.OcrPage[scales.Length];

            // recognition pass: render the page at each scale and recognize the rendered image
            for (int pass = 0; pass < scales.Length; pass++)
            {
                using (Vintasoft.Imaging.VintasoftImage renderedImage = pdfPage.Render(scales[pass]))
                {
                    scaledOcrResults[pass] = engineManager.Recognize(renderedImage, settings);
                }
            }

            // continue only if there is at least one OCR result
            if (scaledOcrResults.Length > 0)
            {
                // bring every OCR result back to the unscaled page size; this is necessary
                // because the page was scaled before text recognition
                for (int pass = 0; pass < scaledOcrResults.Length; pass++)
                {
                    // a result obtained at scale 1 needs no correction
                    if (scales[pass] != 1f)
                    {
                        // inverse of the scale that was used for rendering
                        float inverseFactor = 1f / scales[pass];
                        var downScale = new Vintasoft.Imaging.Scale(inverseFactor, inverseFactor);

                        // apply the inverse scale to the OCR result
                        var pageScaler = new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(scaledOcrResults[pass]);
                        pageScaler.ScaleOcrPage(downScale);
                    }
                }

                // the first OCR result becomes the final OCR result,
                // the regions of all other OCR results are added to it
                Vintasoft.Imaging.Ocr.Results.OcrPage finalOcrResult = scaledOcrResults[0];
                var mergeEditor = new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(finalOcrResult);
                for (int pass = 1; pass < scaledOcrResults.Length; pass++)
                {
                    mergeEditor.AddRegions(scaledOcrResults[pass]);
                }

                // validate the final OCR result
                mergeEditor.ValidateResults();

                // text of the final OCR result
                string ocrPageContent = finalOcrResult.GetText();

                // the text file is placed next to the PDF document and
                // has the same name with the ".txt" extension
                string targetDirectory = System.IO.Path.GetDirectoryName(imageFilePath);
                string targetFileName = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) + ".txt";
                string textFilePath = System.IO.Path.Combine(targetDirectory, targetFileName);

                // save the recognized text as UTF-8
                System.IO.File.WriteAllText(textFilePath, ocrPageContent, System.Text.Encoding.UTF8);
            }
        }
    }
    finally
    {
        // dispose every additional OCR engine that was created
        foreach (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr extraEngine in extraEngines)
        {
            if (extraEngine != null)
            {
                extraEngine.Dispose();
            }
        }
    }
}
' PDF document whose first page is recognized at several scales
Dim imageFilePath As String = "D:\TestImage.pdf"

' primary OCR engine, disposed by the Using statement
Using primaryEngine As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
    ' slots for two additional OCR engines; they are filled inside the "Try" block
    ' so that the "Finally" block can dispose whichever engines were created
    Dim extraEngines(1) As Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr
    Try
        ' create the additional OCR engines
        For slot As Integer = 0 To extraEngines.Length - 1
            extraEngines(slot) = New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
        Next

        ' engine manager built from the primary engine and the additional engines
        Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(primaryEngine, extraEngines)

        ' open the PDF document
        Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(imageFilePath)
            ' OCR settings: German text will be recognized
            Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.German)

            ' nothing to do for a document without pages
            If pdfDocument.Pages.Count = 0 Then
                Return
            End If

            ' only the first PDF page is processed
            Dim pdfPage As Vintasoft.Imaging.Pdf.Tree.PdfPage = pdfDocument.Pages(0)

            ' scale factors applied to the page before each recognition pass
            Dim scales As Single() = New Single() {0.5F, 1.5F}

            ' one OCR result per scale factor
            Dim scaledOcrResults(scales.Length - 1) As Vintasoft.Imaging.Ocr.Results.OcrPage

            ' recognition pass: render the page at each scale and recognize the rendered image
            For pass As Integer = 0 To scales.Length - 1
                Using renderedImage As Vintasoft.Imaging.VintasoftImage = pdfPage.Render(scales(pass))
                    scaledOcrResults(pass) = engineManager.Recognize(renderedImage, settings)
                End Using
            Next

            ' continue only if there is at least one OCR result
            If scaledOcrResults.Length > 0 Then
                ' bring every OCR result back to the unscaled page size; this is necessary
                ' because the page was scaled before text recognition
                For pass As Integer = 0 To scaledOcrResults.Length - 1
                    ' a result obtained at scale 1 needs no correction
                    If scales(pass) <> 1F Then
                        ' inverse of the scale that was used for rendering
                        Dim inverseFactor As Single = 1F / scales(pass)
                        Dim downScale As New Vintasoft.Imaging.Scale(inverseFactor, inverseFactor)

                        ' apply the inverse scale to the OCR result
                        Dim pageScaler As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(scaledOcrResults(pass))
                        pageScaler.ScaleOcrPage(downScale)
                    End If
                Next

                ' the first OCR result becomes the final OCR result,
                ' the regions of all other OCR results are added to it
                Dim finalOcrResult As Vintasoft.Imaging.Ocr.Results.OcrPage = scaledOcrResults(0)
                Dim mergeEditor As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(finalOcrResult)
                For pass As Integer = 1 To scaledOcrResults.Length - 1
                    mergeEditor.AddRegions(scaledOcrResults(pass))
                Next

                ' validate the final OCR result
                mergeEditor.ValidateResults()

                ' text of the final OCR result
                Dim ocrPageContent As String = finalOcrResult.GetText()

                ' the text file is placed next to the PDF document and
                ' has the same name with the ".txt" extension
                Dim targetDirectory As String = System.IO.Path.GetDirectoryName(imageFilePath)
                Dim targetFileName As String = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) & ".txt"
                Dim textFilePath As String = System.IO.Path.Combine(targetDirectory, targetFileName)

                ' save the recognized text as UTF-8
                System.IO.File.WriteAllText(textFilePath, ocrPageContent, System.Text.Encoding.UTF8)
            End If
        End Using
    Finally
        ' dispose every additional OCR engine that was created
        For Each extraEngine As Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr In extraEngines
            If extraEngine IsNot Nothing Then
                extraEngine.Dispose()
            End If
        Next
    End Try
End Using