OCR: Как проанализировать макет изображения документа с помощью Tesseract OCR в .NET

Распознавание текста в изображении документа состоит из двух этапов. На первом этапе анализируется макет изображения документа, т. е. определяется положение абзацев, текстовых строк, слов и символов в изображении документа. На втором этапе выполняется распознавание символов в изображении документа и группировка распознанных символов в слова, текстовые строки и абзацы.

VintaSoft Imaging .NET SDK с VintaSoft OCR .NET Plug-in позволяет распознавать текст в изображении документа с помощью метода TesseractOcr.Recognize. Этот метод выполняет оба этапа распознавания текста, т. е. анализирует макет и выполняет распознавание символов.

Иногда возникает необходимость проанализировать положение абзацев, текстовых строк, слов и символов в изображении документа без распознавания символов.
VintaSoft Imaging .NET SDK с VintaSoft OCR .NET Plug-in позволяет проанализировать макет изображения документа (определить положение абзацев, текстовых строк, слов и символов в изображении документа) с помощью метода TesseractOcr.AnalyzeLayout. Метод TesseractOcr.AnalyzeLayout работает быстрее, чем метод TesseractOcr.Recognize, поскольку он только анализирует макет изображения документа и не выполняет распознавание символов.

Вот код на C# и VB.NET, который демонстрирует, как проанализировать макет изображения документа с помощью движка Tesseract OCR (метод TesseractOcr.AnalyzeLayout):

/// <summary>
/// Analyzes page layout without tables detection using Tesseract OCR engine.
/// </summary>
/// <remarks>
/// For every image loaded from the file the method runs layout analysis only
/// (no character recognition), prints the number of regions, paragraphs,
/// text lines, words and symbols to the console and then waits for a key press.
/// Regions that are not text regions are included in the region count
/// but contribute nothing to the paragraph/line/word/symbol counts.
/// </remarks>
/// <param name="filename">The name of document image file.</param>
public static void AnalyzePageLayoutWithoutTablesDetectionUsingTesseractOCR(string filename)
{
    // create an image collection
    using (Vintasoft.Imaging.ImageCollection images =
        new Vintasoft.Imaging.ImageCollection())
    {
        // add images from file to the image collection
        images.Add(filename);

        System.Console.WriteLine("Create Tesseract OCR engine...");
        // create the Tesseract OCR engine
        using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
            new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
        {
            System.Console.WriteLine("Initialize OCR engine...");
            // init the Tesseract OCR engine for recognition of English characters
            tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English));

            // for each image in image collection
            foreach (Vintasoft.Imaging.VintasoftImage image in images)
            {
                System.Console.WriteLine("Recognize the image...");

                // set image for Tesseract OCR engine
                tesseractOcr.SetImage(image);

                // analyze page layout without tables detection and get result as OCR page
                Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage = tesseractOcr.AnalyzeLayout();

                // clear image in Tesseract OCR engine
                tesseractOcr.ClearImage();

                // calculate count of regions, paragraphs, lines, words, symbols

                int regionCount = ocrPage.Regions.Count;
                int paragraphCount = 0;
                int lineCount = 0;
                int wordCount = 0;
                int symbolCount = 0;

                foreach (Vintasoft.Imaging.Ocr.Results.OcrRegion region in ocrPage.Regions)
                {
                    Vintasoft.Imaging.Ocr.Results.OcrTextRegion textRegion =
                        region as Vintasoft.Imaging.Ocr.Results.OcrTextRegion;

                    // the "as" cast yields null for a region that is not a text region;
                    // such a region has no paragraphs to count, so skip it
                    // instead of failing with NullReferenceException
                    if (textRegion == null)
                        continue;

                    paragraphCount += textRegion.Paragraphs.Count;
                    foreach (Vintasoft.Imaging.Ocr.Results.OcrParagraph paragraph in textRegion.Paragraphs)
                    {
                        lineCount += paragraph.TextLines.Count;
                        foreach (Vintasoft.Imaging.Ocr.Results.OcrTextLine line in paragraph.TextLines)
                        {
                            wordCount += line.Words.Count;
                            foreach (Vintasoft.Imaging.Ocr.Results.OcrWord word in line.Words)
                            {
                                symbolCount += word.Symbols.Count;
                            }
                        }
                    }
                }

                // output information about count of regions, paragraphs, lines, words, symbols

                System.Console.WriteLine("Layout result:");
                System.Console.WriteLine("- Region count: {0}", regionCount);
                System.Console.WriteLine("- Paragraph count: {0}", paragraphCount);
                System.Console.WriteLine("- Line count: {0}", lineCount);
                System.Console.WriteLine("- Word count: {0}", wordCount);
                System.Console.WriteLine("- Symbol count: {0}", symbolCount);
                System.Console.WriteLine();
                // wait for a key press before processing the next image
                System.Console.ReadKey();
            }

            // shutdown the Tesseract OCR engine
            tesseractOcr.Shutdown();
        }

        // free images
        images.ClearAndDisposeItems();
    }
}

VB.NET

''' <summary>
''' Analyzes page layout without tables detection using Tesseract OCR engine.
''' </summary>
''' <remarks>
''' For every image loaded from the file the method runs layout analysis only
''' (no character recognition), prints the number of regions, paragraphs,
''' text lines, words and symbols to the console and then waits for a key press.
''' Regions that are not text regions are included in the region count
''' but contribute nothing to the paragraph/line/word/symbol counts.
''' </remarks>
''' <param name="filename">The name of document image file.</param>
Public Shared Sub AnalyzePageLayoutWithoutTablesDetectionUsingTesseractOCR(filename As String)
    ' create an image collection
    Using images As New Vintasoft.Imaging.ImageCollection()
        ' add images from file to the image collection
        images.Add(filename)

        System.Console.WriteLine("Create Tesseract OCR engine...")
        ' create the Tesseract OCR engine
        Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
            System.Console.WriteLine("Initialize OCR engine...")
            ' init the Tesseract OCR engine for recognition of English characters
            tesseractOcr.Init(New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English))

            ' for each image in image collection
            For Each image As Vintasoft.Imaging.VintasoftImage In images
                System.Console.WriteLine("Recognize the image...")

                ' set image for Tesseract OCR engine
                tesseractOcr.SetImage(image)

                ' analyze page layout without tables detection and get result as OCR page
                Dim ocrPage As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.AnalyzeLayout()

                ' clear image in Tesseract OCR engine
                tesseractOcr.ClearImage()

                ' calculate count of regions, paragraphs, lines, words, symbols

                Dim regionCount As Integer = ocrPage.Regions.Count
                Dim paragraphCount As Integer = 0
                Dim lineCount As Integer = 0
                Dim wordCount As Integer = 0
                Dim symbolCount As Integer = 0

                For Each region As Vintasoft.Imaging.Ocr.Results.OcrRegion In ocrPage.Regions
                    Dim textRegion As Vintasoft.Imaging.Ocr.Results.OcrTextRegion = TryCast(region, Vintasoft.Imaging.Ocr.Results.OcrTextRegion)

                    ' TryCast yields Nothing for a region that is not a text region;
                    ' such a region has no paragraphs to count, so skip it
                    ' instead of failing with NullReferenceException
                    If textRegion Is Nothing Then
                        Continue For
                    End If

                    paragraphCount += textRegion.Paragraphs.Count
                    For Each paragraph As Vintasoft.Imaging.Ocr.Results.OcrParagraph In textRegion.Paragraphs
                        lineCount += paragraph.TextLines.Count
                        For Each line As Vintasoft.Imaging.Ocr.Results.OcrTextLine In paragraph.TextLines
                            wordCount += line.Words.Count
                            For Each word As Vintasoft.Imaging.Ocr.Results.OcrWord In line.Words
                                symbolCount += word.Symbols.Count
                            Next
                        Next
                    Next
                Next

                ' output information about count of regions, paragraphs, lines, words, symbols

                System.Console.WriteLine("Layout result:")
                System.Console.WriteLine("- Region count: {0}", regionCount)
                System.Console.WriteLine("- Paragraph count: {0}", paragraphCount)
                System.Console.WriteLine("- Line count: {0}", lineCount)
                System.Console.WriteLine("- Word count: {0}", wordCount)
                System.Console.WriteLine("- Symbol count: {0}", symbolCount)
                System.Console.WriteLine()
                ' wait for a key press before processing the next image
                System.Console.ReadKey()
            Next

            ' shutdown the Tesseract OCR engine
            tesseractOcr.Shutdown()
        End Using

        ' free images
        images.ClearAndDisposeItems()
    End Using
End Sub

Отправить отзыв