Определение и распознавание документа, содержащего заполненную форму с текстом
В этом разделе
Если вы хотите определить и распознать документ, содержащий заполненную форму с текстом, вам необходимо выполнить следующие шаги:
- Указать OCR-движок, который должен использоваться для распознавания текста в текстовых полях, — это можно сделать с помощью свойства OcrFieldTemplate.OcrEngineManager.
- Определить шаблон для заполненной формы.
- Распознать значения полей в заполненной форме.
Вот пример кода на C# и VB.NET, который демонстрирует, как идентифицировать и распознать заполненную форму, содержащую текст:
/// <summary>
/// Recognizes a filled form in an image and writes the recognized OCR fields,
/// together with the words of each field, to the console.
/// </summary>
/// <param name="formRecognitionManager">The form recognition manager that performs the recognition.</param>
/// <param name="image">The image that contains the filled form.</param>
public static void RecognizeFormWithOcrFields(
    Vintasoft.Imaging.FormsProcessing.FormRecognitionManager formRecognitionManager,
    Vintasoft.Imaging.VintasoftImage image)
{
    // the OCR engine manager is a static property of the OCR field template:
    // set it up only if nobody has done it yet
    // (this setup can be moved to the start of your application)
    if (Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrFieldTemplate.OcrEngineManager == null)
    {
        // engine for ordinary text
        Vintasoft.Imaging.Ocr.OcrEngine textEngine = GetOcrEngine();
        // engine for handwritten digits
        Vintasoft.Imaging.Ocr.OcrEngine digitsEngine =
            new Vintasoft.Imaging.Ocr.ML.HandwrittenDigits.HandwrittenDigitsOcrEngine();

        Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrFieldTemplate.OcrEngineManager =
            new Vintasoft.Imaging.Ocr.OcrEngineManager(textEngine, digitsEngine);
    }

    // run the recognition of the filled form
    Vintasoft.Imaging.FormsProcessing.FormRecognitionResult result =
        formRecognitionManager.Recognize(image);

    // an unreliable image comparison result is treated as "no matching template"
    if (!result.TemplateMatchingResult.ImageCompareResult.IsReliable)
    {
        System.Console.WriteLine("Matching template is not found.");
        return;
    }

    Vintasoft.Imaging.FormsProcessing.FormRecognition.FormPage page = result.RecognizedPage;

    // nothing to report if the page has no form fields
    if (page.Items.Count == 0)
    {
        System.Console.WriteLine("No form fields were recognized.");
        return;
    }

    string countMessage = string.Format(
        "Recognized form field count: {0}",
        page.Items.Count);
    System.Console.WriteLine(countMessage);

    foreach (Vintasoft.Imaging.FormsProcessing.FormRecognition.FormField field in page.Items)
    {
        // only OCR fields are reported, any other field type is skipped
        if (!(field is Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField))
        {
            continue;
        }

        Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField ocrField =
            (Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField)field;

        // field summary; the field confidence is scaled by 100 before printing
        string fieldMessage = string.Format(
            " OCR field: name: {0}; value: {1}; confidence: {2:F1}%",
            ocrField.Name,
            ocrField.Value,
            ocrField.Confidence * 100);
        System.Console.WriteLine(fieldMessage);

        // words of the field, obtained with both thresholds set to 75
        // NOTE(review): the word confidence is printed without the "* 100" scaling used
        // for the field confidence - presumably the two values use different scales; confirm
        foreach (Vintasoft.Imaging.Ocr.Results.OcrObject word in ocrField.OcrResult.GetWords(75, 75))
        {
            string wordMessage = string.Format(
                " OCR word: {0}; confidence: {1:F1}%",
                word.ToString(),
                word.Confidence);
            System.Console.WriteLine(wordMessage);
        }
    }
}
/// <summary>
/// Creates the Tesseract OCR engine that is used for recognition of OCR fields.
/// </summary>
/// <remarks>
/// The project must reference Vintasoft.Imaging.Ocr.Tesseract.dll,
/// otherwise the Tesseract OCR engine cannot be created.
/// </remarks>
/// <returns>A new Tesseract OCR engine.</returns>
private static Vintasoft.Imaging.Ocr.OcrEngine GetOcrEngine()
{
    // directory with the Tesseract5.Vintasoft.xXX.dll files
    // NOTE: replace this value with the actual path to the Tesseract OCR dll files
    const string dllDirectory = @"C:\Program Files\VintaSoft\VintaSoft Imaging .NET\Bin\TesseractOCR\";

    // Vintasoft.Imaging.Ocr.Tesseract.dll is required for this type
    Vintasoft.Imaging.Ocr.OcrEngine engine =
        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(dllDirectory);
    return engine;
}
''' <summary>
''' Recognizes a filled form in an image and writes the recognized OCR fields,
''' together with the words of each field, to the console.
''' </summary>
''' <param name="formRecognitionManager">The form recognition manager that performs the recognition.</param>
''' <param name="image">The image that contains the filled form.</param>
Public Shared Sub RecognizeFormWithOcrFields(
        formRecognitionManager As Vintasoft.Imaging.FormsProcessing.FormRecognitionManager,
        image As Vintasoft.Imaging.VintasoftImage)
    ' the OCR engine manager is a shared property of the OCR field template:
    ' set it up only if nobody has done it yet
    ' (this setup can be moved to the start of your application)
    If Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrFieldTemplate.OcrEngineManager Is Nothing Then
        ' engine for ordinary text
        Dim textEngine As Vintasoft.Imaging.Ocr.OcrEngine = GetOcrEngine()
        ' engine for handwritten digits
        Dim digitsEngine As Vintasoft.Imaging.Ocr.OcrEngine = New Vintasoft.Imaging.Ocr.ML.HandwrittenDigits.HandwrittenDigitsOcrEngine()

        Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrFieldTemplate.OcrEngineManager = New Vintasoft.Imaging.Ocr.OcrEngineManager(textEngine, digitsEngine)
    End If

    ' run the recognition of the filled form
    Dim result As Vintasoft.Imaging.FormsProcessing.FormRecognitionResult = formRecognitionManager.Recognize(image)

    ' an unreliable image comparison result is treated as "no matching template"
    If Not result.TemplateMatchingResult.ImageCompareResult.IsReliable Then
        System.Console.WriteLine("Matching template is not found.")
        Return
    End If

    Dim page As Vintasoft.Imaging.FormsProcessing.FormRecognition.FormPage = result.RecognizedPage

    ' nothing to report if the page has no form fields
    If page.Items.Count = 0 Then
        System.Console.WriteLine("No form fields were recognized.")
        Return
    End If

    Dim countMessage As String = String.Format("Recognized form field count: {0}", page.Items.Count)
    System.Console.WriteLine(countMessage)

    For Each field As Vintasoft.Imaging.FormsProcessing.FormRecognition.FormField In page.Items
        ' only OCR fields are reported, any other field type is skipped
        If Not TypeOf field Is Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField Then
            Continue For
        End If

        Dim ocrField As Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField = DirectCast(field, Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField)

        ' field summary; the field confidence is scaled by 100 before printing
        Dim fieldMessage As String = String.Format(
            " OCR field: name: {0}; value: {1}; confidence: {2:F1}%",
            ocrField.Name,
            ocrField.Value,
            ocrField.Confidence * 100)
        System.Console.WriteLine(fieldMessage)

        ' words of the field, obtained with both thresholds set to 75
        ' NOTE(review): the word confidence is printed without the "* 100" scaling used
        ' for the field confidence - presumably the two values use different scales; confirm
        For Each word As Vintasoft.Imaging.Ocr.Results.OcrObject In ocrField.OcrResult.GetWords(75, 75)
            Dim wordMessage As String = String.Format(
                " OCR word: {0}; confidence: {1:F1}%",
                word.ToString(),
                word.Confidence)
            System.Console.WriteLine(wordMessage)
        Next
    Next
End Sub
''' <summary>
''' Creates the Tesseract OCR engine that is used for recognition of OCR fields.
''' </summary>
''' <remarks>
''' The project must reference Vintasoft.Imaging.Ocr.Tesseract.dll,
''' otherwise the Tesseract OCR engine cannot be created.
''' </remarks>
''' <returns>A new Tesseract OCR engine.</returns>
Private Shared Function GetOcrEngine() As Vintasoft.Imaging.Ocr.OcrEngine
    ' directory with the Tesseract5.Vintasoft.xXX.dll files
    ' NOTE: replace this value with the actual path to the Tesseract OCR dll files
    Const dllDirectory As String = "C:\Program Files\VintaSoft\VintaSoft Imaging .NET\Bin\TesseractOCR\"

    ' Vintasoft.Imaging.Ocr.Tesseract.dll is required for this type
    Dim engine As Vintasoft.Imaging.Ocr.OcrEngine = New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(dllDirectory)
    Return engine
End Function