В этом разделе
Представляет абстрактный базовый класс, определяющий алгоритм текстового поиска.
Объектная модель
Синтаксис
'Declaration
Public MustInherit Class TextSearchEngine
public abstract class TextSearchEngine
public __gc abstract class TextSearchEngine
public ref class TextSearchEngine abstract
Пример
Ниже приведён код на C# и VB.NET, который демонстрирует, как создать алгоритм текстового поиска, находящий цифры на странице PDF-документа.
''' <summary>
''' Scans each page of a PDF document for digit sequences and writes every match,
''' with its text and rectangle, to the console.
''' </summary>
''' <param name="document">PDF document whose pages are scanned for digits.</param>
Public Sub SearchDigitsInTextOfPdfDocumentUsingTextSearchEngine(document As Vintasoft.Imaging.Pdf.PdfDocument)
    System.Console.WriteLine("Searching the digits in text of PDF document.")

    For pageIndex As Integer = 0 To document.Pages.Count - 1
        Dim foundRegions As Vintasoft.Imaging.Text.TextRegion() = AdvancedDigitsSearchOnPdfPage(document.Pages(pageIndex))

        ' nothing to report for this page
        If foundRegions Is Nothing Then
            Continue For
        End If

        ' one output line per matched region
        For Each region As Vintasoft.Imaging.Text.TextRegion In foundRegions
            Dim line As String = String.Format("- Text={0}, Rectangle={1}", region.TextContent, region.Rectangle)
            System.Console.WriteLine(line)
        Next
    Next

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.")
End Sub
''' <summary>
''' Collects every digit sequence found in the text of a PDF page.
''' </summary>
''' <param name="page">PDF page whose text region is searched.</param>
''' <returns>An array with one text region per match; empty when nothing was found.</returns>
Public Function AdvancedDigitsSearchOnPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage) As Vintasoft.Imaging.Text.TextRegion()
    Dim matches As New System.Collections.Generic.List(Of Vintasoft.Imaging.Text.TextRegion)()
    Dim engine As New DigitsSearchEngine()
    ' position from which the next search starts
    ' NOTE(review): presumably FindText updates this argument to the position of the match - confirm
    Dim searchFrom As Integer = 0

    Do
        ' look for the next digit sequence
        Dim match As Vintasoft.Imaging.Text.TextRegion = page.TextRegion.FindText(engine, searchFrom, False)

        ' no more matches on this page
        If match Is Nothing Then
            Exit Do
        End If

        matches.Add(match)
        ' shift the start index past the text that was just found
        searchFrom += match.TextContent.Length
    Loop

    Return matches.ToArray()
End Function
''' <summary>
''' Text search engine that finds the first run of consecutive digits in a string.
''' </summary>
Private Class DigitsSearchEngine
    Inherits Vintasoft.Imaging.Text.TextSearchEngine

    ''' <summary>
    ''' Searches the first run of consecutive digits inside the analyzed range
    ''' [startIndex, startIndex + length) of the source string.
    ''' </summary>
    ''' <param name="sourceString">Source string (string of PDF page) where text must be searched.</param>
    ''' <param name="startIndex">The zero-based index, in the sourceString, of the first character of the analyzed range.</param>
    ''' <param name="length">The number of characters, in the sourceString, to analyze.</param>
    ''' <param name="rightToLeft">
    ''' True to scan the range from its last character towards its first one;
    ''' False to scan from its first character towards its last one.
    ''' </param>
    ''' <returns>
    ''' A Vintasoft.Imaging.Text.TextSearchResult created from the index of the first digit
    ''' of the run and the number of digits in the run; Nothing if the range contains no digit.
    ''' </returns>
    Public Overrides Function Find(sourceString As String, startIndex As Integer, length As Integer, rightToLeft As Boolean) As Vintasoft.Imaging.Text.TextSearchResult
        ' index of the first digit of the run; -1 while not found
        Dim startDigitIndex As Integer = -1
        ' index just past the last digit of the run; -1 while not found
        Dim endDigitIndex As Integer = -1
        ' bounds of the analyzed range: rangeStart is inclusive, rangeEnd is exclusive
        Dim rangeStart As Integer = startIndex
        Dim rangeEnd As Integer = startIndex + length

        If rightToLeft Then
            ' if searching text from the right to the left:
            ' stop at rangeStart (not at 0) so that only "length" characters are analyzed
            For index As Integer = rangeEnd - 1 To rangeStart Step -1
                If Char.IsDigit(sourceString(index)) AndAlso endDigitIndex = -1 Then
                    ' first digit met from the right is the end of the run
                    endDigitIndex = index + 1
                ElseIf Not Char.IsDigit(sourceString(index)) AndAlso endDigitIndex <> -1 Then
                    ' first non-digit after the run: the run starts right after it
                    startDigitIndex = index + 1
                    Exit For
                End If
            Next
            ' the run reaches the left edge of the analyzed range
            If endDigitIndex <> -1 AndAlso startDigitIndex = -1 Then
                startDigitIndex = rangeStart
            End If
        Else
            ' if searching text from the left to the right
            For index As Integer = rangeStart To rangeEnd - 1
                If Char.IsDigit(sourceString(index)) AndAlso startDigitIndex = -1 Then
                    ' first digit met from the left is the start of the run
                    startDigitIndex = index
                ElseIf Not Char.IsDigit(sourceString(index)) AndAlso startDigitIndex <> -1 Then
                    ' first non-digit after the run marks its end
                    endDigitIndex = index
                    Exit For
                End If
            Next
            ' the run reaches the right edge of the analyzed range
            If startDigitIndex <> -1 AndAlso endDigitIndex = -1 Then
                endDigitIndex = rangeEnd
            End If
        End If

        ' if digit is not found
        If startDigitIndex = -1 Then
            Return Nothing
        End If

        ' return the text search result
        Return New Vintasoft.Imaging.Text.TextSearchResult(startDigitIndex, endDigitIndex - startDigitIndex)
    End Function
End Class
/// <summary>
/// Scans each page of a PDF document for digit sequences and writes every match,
/// with its text and rectangle, to the console.
/// </summary>
/// <param name="document">PDF document whose pages are scanned for digits.</param>
public void SearchDigitsInTextOfPdfDocumentUsingTextSearchEngine(Vintasoft.Imaging.Pdf.PdfDocument document)
{
    System.Console.WriteLine("Searching the digits in text of PDF document.");

    for (int pageIndex = 0; pageIndex < document.Pages.Count; pageIndex++)
    {
        Vintasoft.Imaging.Text.TextRegion[] foundRegions =
            AdvancedDigitsSearchOnPdfPage(document.Pages[pageIndex]);

        // nothing to report for this page
        if (foundRegions == null)
        {
            continue;
        }

        // one output line per matched region
        foreach (Vintasoft.Imaging.Text.TextRegion region in foundRegions)
        {
            string line = string.Format("- Text={0}, Rectangle={1}",
                region.TextContent,
                region.Rectangle);
            System.Console.WriteLine(line);
        }
    }

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.");
}
/// <summary>
/// Collects every digit sequence found in the text of a PDF page.
/// </summary>
/// <param name="page">PDF page whose text region is searched.</param>
/// <returns>An array with one text region per match; empty when nothing was found.</returns>
public Vintasoft.Imaging.Text.TextRegion[] AdvancedDigitsSearchOnPdfPage(
    Vintasoft.Imaging.Pdf.Tree.PdfPage page)
{
    System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion> matches =
        new System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion>();
    DigitsSearchEngine engine = new DigitsSearchEngine();
    // position from which the next search starts; passed to FindText by reference
    int searchFrom = 0;

    while (true)
    {
        // look for the next digit sequence
        Vintasoft.Imaging.Text.TextRegion match =
            page.TextRegion.FindText(engine, ref searchFrom, false);

        // no more matches on this page
        if (match == null)
        {
            break;
        }

        matches.Add(match);
        // shift the start index past the text that was just found
        searchFrom += match.TextContent.Length;
    }

    return matches.ToArray();
}
/// <summary>
/// Text search engine that finds the first run of consecutive digits in a string.
/// </summary>
class DigitsSearchEngine : Vintasoft.Imaging.Text.TextSearchEngine
{
    /// <summary>
    /// Searches the first run of consecutive digits inside the analyzed range
    /// [startIndex, startIndex + length) of the source string.
    /// </summary>
    /// <param name="sourceString">Source string (string of PDF page) where text must be searched.</param>
    /// <param name="startIndex">The zero-based index, in the sourceString, of the first character of the analyzed range.</param>
    /// <param name="length">The number of characters, in the sourceString, to analyze.</param>
    /// <param name="rightToLeft">
    /// <c>true</c> to scan the range from its last character towards its first one;
    /// <c>false</c> to scan from its first character towards its last one.
    /// </param>
    /// <returns>
    /// A Vintasoft.Imaging.Text.TextSearchResult created from the index of the first digit
    /// of the run and the number of digits in the run; null if the range contains no digit.
    /// </returns>
    public override Vintasoft.Imaging.Text.TextSearchResult Find(
        string sourceString, int startIndex, int length, bool rightToLeft)
    {
        // index of the first digit of the run; -1 while not found
        int startDigitIndex = -1;
        // index just past the last digit of the run; -1 while not found
        int endDigitIndex = -1;
        // bounds of the analyzed range: rangeStart is inclusive, rangeEnd is exclusive
        int rangeStart = startIndex;
        int rangeEnd = startIndex + length;

        if (rightToLeft)
        {
            // if searching text from the right to the left:
            // stop at rangeStart (not at 0) so that only "length" characters are analyzed
            for (int index = rangeEnd - 1; index >= rangeStart; index--)
            {
                if (char.IsDigit(sourceString[index]) && endDigitIndex == -1)
                {
                    // first digit met from the right is the end of the run
                    endDigitIndex = index + 1;
                }
                else if (!char.IsDigit(sourceString[index]) && endDigitIndex != -1)
                {
                    // first non-digit after the run: the run starts right after it
                    startDigitIndex = index + 1;
                    break;
                }
            }
            // the run reaches the left edge of the analyzed range
            if (endDigitIndex != -1 && startDigitIndex == -1)
            {
                startDigitIndex = rangeStart;
            }
        }
        else
        {
            // if searching text from the left to the right
            for (int index = rangeStart; index < rangeEnd; index++)
            {
                if (char.IsDigit(sourceString[index]) && startDigitIndex == -1)
                {
                    // first digit met from the left is the start of the run
                    startDigitIndex = index;
                }
                else if (!char.IsDigit(sourceString[index]) && startDigitIndex != -1)
                {
                    // first non-digit after the run marks its end
                    endDigitIndex = index;
                    break;
                }
            }
            // the run reaches the right edge of the analyzed range
            if (startDigitIndex != -1 && endDigitIndex == -1)
            {
                endDigitIndex = rangeEnd;
            }
        }

        // if digit is not found
        if (startDigitIndex == -1)
        {
            return null;
        }

        // return the text search result
        return new Vintasoft.Imaging.Text.TextSearchResult(
            startDigitIndex, endDigitIndex - startDigitIndex);
    }
}
Иерархия наследования
System.Object
 Vintasoft.Imaging.Text.TextSearchEngine
Требования
Целевые платформы: .NET 8; .NET 7; .NET 6; .NET Framework 4.8, 4.7, 4.6, 4.5, 4.0, 3.5
Смотрите также