Анализ макета документа с помощью Tesseract OCR в .NET
17 февраля 2022
/// <summary>
/// Analyzes page layout using Tesseract OCR engine.
/// </summary>
/// <remarks>
/// For every image loaded from <paramref name="filename"/> the method runs layout
/// analysis, counts the regions, paragraphs, text lines, words and symbols of the
/// resulting OCR page, prints the totals to the console and then waits for a key press.
/// Regions that are not text regions are included in the region count only.
/// </remarks>
/// <param name="filename">The name of document image file.</param>
public static void AnalyzePageLayoutUsingTesseractOCR(string filename)
{
    // create an image collection
    using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
    {
        try
        {
            // add images from file to the image collection
            images.Add(filename);

            System.Console.WriteLine("Create Tesseract OCR engine...");
            // create the Tesseract OCR engine
            using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
            {
                System.Console.WriteLine("Initialize OCR engine...");
                // init the Tesseract OCR engine for recognition of English characters
                tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(
                    Vintasoft.Imaging.Ocr.OcrLanguage.English));

                // for each image in image collection
                foreach (Vintasoft.Imaging.VintasoftImage image in images)
                {
                    System.Console.WriteLine("Recognize the image...");

                    // set image for Tesseract OCR engine
                    tesseractOcr.SetImage(image);

                    Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage;
                    try
                    {
                        // analyze page layout and get result as OCR page
                        ocrPage = tesseractOcr.AnalyzeLayout();
                    }
                    finally
                    {
                        // clear image in Tesseract OCR engine, even if the analysis failed
                        tesseractOcr.ClearImage();
                    }

                    // calculate count of regions, paragraphs, lines, words, symbols
                    int regionCount = ocrPage.Regions.Count;
                    int paragraphCount = 0;
                    int lineCount = 0;
                    int wordCount = 0;
                    int symbolCount = 0;
                    foreach (Vintasoft.Imaging.Ocr.Results.OcrRegion region in ocrPage.Regions)
                    {
                        Vintasoft.Imaging.Ocr.Results.OcrTextRegion textRegion =
                            region as Vintasoft.Imaging.Ocr.Results.OcrTextRegion;
                        // "as" yields null for a region that is not a text region;
                        // such a region has no paragraphs to count, so skip it
                        if (textRegion == null)
                        {
                            continue;
                        }

                        paragraphCount += textRegion.Paragraphs.Count;
                        foreach (Vintasoft.Imaging.Ocr.Results.OcrParagraph paragraph in textRegion.Paragraphs)
                        {
                            lineCount += paragraph.TextLines.Count;
                            foreach (Vintasoft.Imaging.Ocr.Results.OcrTextLine line in paragraph.TextLines)
                            {
                                wordCount += line.Words.Count;
                                foreach (Vintasoft.Imaging.Ocr.Results.OcrWord word in line.Words)
                                {
                                    symbolCount += word.Symbols.Count;
                                }
                            }
                        }
                    }

                    // output information about count of regions, paragraphs, lines, words, symbols
                    System.Console.WriteLine("Layout result:");
                    System.Console.WriteLine("- Region count: {0}", regionCount);
                    System.Console.WriteLine("- Paragraph count: {0}", paragraphCount);
                    System.Console.WriteLine("- Line count: {0}", lineCount);
                    System.Console.WriteLine("- Word count: {0}", wordCount);
                    System.Console.WriteLine("- Symbol count: {0}", symbolCount);
                    System.Console.WriteLine();

                    // wait for a key press before processing the next image
                    System.Console.ReadKey();
                }

                // shutdown the Tesseract OCR engine
                tesseractOcr.Shutdown();
            }
        }
        finally
        {
            // free images, also when loading or recognition throws
            images.ClearAndDisposeItems();
        }
    }
}