OCR: Как преобразовать PDF-документ, содержащий только изображения (image-only), в PDF-документ с возможностью поиска по тексту (searchable)?
В этом разделе
Ниже приведён код на C# и VB.NET, который демонстрирует, как преобразовать image-only PDF-документ в searchable PDF-документ:
/// <summary>
/// Converts an image-only PDF document to a searchable PDF document.
/// </summary>
/// <remarks>
/// The pages of the source document are added to an image collection, text on each
/// image is recognized with the Tesseract OCR engine, and every recognized page is
/// added to the destination PDF document with the image placed over the text.
/// Progress messages are written to the console.
/// </remarks>
/// <param name="ocrLanguage">The language used to initialize the OCR engine.</param>
/// <param name="imageOnlyPdfFilename">The filename of the source image-only PDF file.</param>
/// <param name="ocrResolution">
/// The resolution intended to be used for OCR of <paramref name="imageOnlyPdfFilename"/>.
/// NOTE(review): this parameter is not referenced by the method body - confirm whether
/// it should be applied to the rendering settings of the source pages.
/// </param>
/// <param name="searchablePdfFilename">The filename of the destination searchable PDF file.</param>
public static void ConvertImageOnlyPdfToSearchablePdf(
    Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
    string imageOnlyPdfFilename,
    Vintasoft.Imaging.Resolution ocrResolution,
    string searchablePdfFilename)
{
    // create an image collection
    using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
    {
        try
        {
            // add pages from the image-only PDF document into the image collection
            images.Add(imageOnlyPdfFilename);

            // create a searchable PDF document
            using (Vintasoft.Imaging.Pdf.PdfDocument document =
                new Vintasoft.Imaging.Pdf.PdfDocument(searchablePdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
            {
                System.Console.WriteLine("Create OCR engine...");
                // create the Tesseract OCR engine
                using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                {
                    System.Console.WriteLine("Initialize OCR engine...");
                    // init the Tesseract OCR engine
                    tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage));

                    // create a PDF document builder
                    Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                        new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document);
                    // specify that the best image compression must be calculated automatically
                    documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                    // specify that the image must be placed over the text
                    documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                    // for each image in the image collection
                    foreach (Vintasoft.Imaging.VintasoftImage image in images)
                    {
                        System.Console.WriteLine("Recognize text in image...");
                        // recognize text on the image
                        Vintasoft.Imaging.Ocr.Results.OcrPage page = tesseractOcr.Recognize(image);

                        System.Console.WriteLine("Add page to a PDF document...");
                        // add the recognized OCR page to the PDF document
                        documentBuilder.AddPage(image, page);
                    }

                    // shutdown the OCR engine
                    tesseractOcr.Shutdown();

                    System.Console.WriteLine("Save changes in PDF document...");
                    // save changes in the PDF document
                    document.SaveChanges();
                }
            }
        }
        finally
        {
            // clear and dispose the images in the image collection; done in "finally"
            // so the loaded images are released even if recognition or saving fails
            images.ClearAndDisposeItems();
        }
    }
}
''' <summary>
''' Converts an image-only PDF document to a searchable PDF document.
''' </summary>
''' <remarks>
''' The pages of the source document are added to an image collection, text on each
''' image is recognized with the Tesseract OCR engine, and every recognized page is
''' added to the destination PDF document with the image placed over the text.
''' Progress messages are written to the console.
''' </remarks>
''' <param name="ocrLanguage">The language used to initialize the OCR engine.</param>
''' <param name="imageOnlyPdfFilename">The filename of the source image-only PDF file.</param>
''' <param name="ocrResolution">
''' The resolution intended to be used for OCR of <paramref name="imageOnlyPdfFilename"/>.
''' NOTE(review): this parameter is not referenced by the method body - confirm whether
''' it should be applied to the rendering settings of the source pages.
''' </param>
''' <param name="searchablePdfFilename">The filename of the destination searchable PDF file.</param>
Public Shared Sub ConvertImageOnlyPdfToSearchablePdf(ocrLanguage As Vintasoft.Imaging.Ocr.OcrLanguage, imageOnlyPdfFilename As String, ocrResolution As Vintasoft.Imaging.Resolution, searchablePdfFilename As String)
    ' create an image collection
    Using images As New Vintasoft.Imaging.ImageCollection()
        Try
            ' add pages from the image-only PDF document into the image collection
            images.Add(imageOnlyPdfFilename)

            ' create a searchable PDF document
            Using document As New Vintasoft.Imaging.Pdf.PdfDocument(searchablePdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                System.Console.WriteLine("Create OCR engine...")
                ' create the Tesseract OCR engine
                Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                    System.Console.WriteLine("Initialize OCR engine...")
                    ' init the Tesseract OCR engine
                    tesseractOcr.Init(New Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage))

                    ' create a PDF document builder
                    Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document)
                    ' specify that the best image compression must be calculated automatically
                    documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                    ' specify that the image must be placed over the text
                    documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                    ' for each image in the image collection
                    For Each image As Vintasoft.Imaging.VintasoftImage In images
                        System.Console.WriteLine("Recognize text in image...")
                        ' recognize text on the image
                        Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize(image)

                        System.Console.WriteLine("Add page to a PDF document...")
                        ' add the recognized OCR page to the PDF document
                        documentBuilder.AddPage(image, page)
                    Next

                    ' shutdown the OCR engine
                    tesseractOcr.Shutdown()

                    System.Console.WriteLine("Save changes in PDF document...")
                    ' save changes in the PDF document
                    document.SaveChanges()
                End Using
            End Using
        Finally
            ' clear and dispose the images in the image collection; done in "Finally"
            ' so the loaded images are released even if recognition or saving fails
            images.ClearAndDisposeItems()
        End Try
    End Using
End Sub