GetWords(TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate) Метод (TextRegion)
Возвращает слова этой текстовой области.
Parameters
- wordCharacterPredicate
- Предикат символа слова, определяющий допустимые символы в словах.
- wordDelimiterPredicate
- Предикат разделителя слов, определяющий разделители слов.
- whiteSpaceCharacterPredicate
- Предикат символа пробела, определяющий символы пробела.
Return Value
Массив объектов TextRegion, которые определяют слова.
Ниже приведён код на VB.NET и C#, который демонстрирует, как извлечь только числа из страницы PDF-документа.
''' <summary>
''' Extracts the words of a PDF page using the number-oriented predicates
''' of this sample and returns their text, one word per line.
''' </summary>
''' <param name="page">PDF page whose text region is searched.</param>
''' <returns>
''' The text content of every word found, each followed by a line terminator.
''' </returns>
Public Shared Function GetOnlyNumbersFromPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage) As String
    ' split the page text into words with the custom predicates
    Dim numberRegions As Vintasoft.Imaging.Text.TextRegion() = page.TextRegion.GetWords( _
        AddressOf WordCharacterPredicate, _
        AddressOf WordDelimiterPredicate, _
        AddressOf WhiteSpaceCharacterPredicate)

    ' collect the text of each word on its own line
    Dim output As New System.Text.StringBuilder()
    For regionIndex As Integer = 0 To numberRegions.Length - 1
        output.AppendLine(numberRegions(regionIndex).TextContent)
    Next

    Return output.ToString()
End Function
''' <summary>
''' Decides whether the symbol at the given index may be part of a word.
''' A symbol is accepted when it is a number character, or when it is a
''' punctuation character whose immediate neighbours in the line are both
''' number characters.
''' </summary>
''' <param name="lineSymbols">The symbols of the text line being processed.</param>
''' <param name="symbolIndex">Index, in <paramref name="lineSymbols"/>, of the symbol to test.</param>
''' <returns><c>True</c> if the symbol is accepted as a word character; otherwise, <c>False</c>.</returns>
Public Shared Function WordCharacterPredicate(lineSymbols As Vintasoft.Imaging.Text.TextRegionSymbol(), symbolIndex As Integer) As Boolean
    Dim current As Char = lineSymbols(symbolIndex).TextSymbol.Symbol

    ' number characters are always accepted
    If System.[Char].IsNumber(current) Then
        Return True
    End If

    ' anything that is neither a number nor punctuation is rejected
    If Not System.[Char].IsPunctuation(current) Then
        Return False
    End If

    ' punctuation is accepted only between two number characters;
    ' a missing neighbour (start or end of the line) counts as "not a number"
    Dim numberBefore As Boolean = symbolIndex > 0 AndAlso System.[Char].IsNumber(lineSymbols(symbolIndex - 1).TextSymbol.Symbol)
    Dim numberAfter As Boolean = symbolIndex < lineSymbols.Length - 1 AndAlso System.[Char].IsNumber(lineSymbols(symbolIndex + 1).TextSymbol.Symbol)

    Return numberBefore AndAlso numberAfter
End Function
''' <summary>
''' Word delimiter predicate that never reports a delimiter.
''' </summary>
''' <param name="lineSymbols">The symbols of the text line being processed (not inspected).</param>
''' <param name="symbolIndex">Index of the symbol to test (not inspected).</param>
''' <returns>Always <c>False</c>.</returns>
Public Shared Function WordDelimiterPredicate(lineSymbols As Vintasoft.Imaging.Text.TextRegionSymbol(), symbolIndex As Integer) As Boolean
    ' no symbol is treated as a word delimiter, so punctuation marks
    ' are not extracted as delimiters
    Return False
End Function
''' <summary>
''' Decides whether the symbol at the given index is a white-space character.
''' </summary>
''' <param name="lineSymbols">The symbols of the text line being processed.</param>
''' <param name="symbolIndex">Index, in <paramref name="lineSymbols"/>, of the symbol to test.</param>
''' <returns><c>True</c> if the symbol is white space; otherwise, <c>False</c>.</returns>
Public Shared Function WhiteSpaceCharacterPredicate(lineSymbols As Vintasoft.Imaging.Text.TextRegionSymbol(), symbolIndex As Integer) As Boolean
    ' the classification is delegated to the framework's white-space test
    Return System.[Char].IsWhiteSpace(lineSymbols(symbolIndex).TextSymbol.Symbol)
End Function
/// <summary>
/// Extracts the words of a PDF page using the number-oriented predicates
/// of this sample and returns their text, one word per line.
/// </summary>
/// <param name="page">PDF page whose text region is searched.</param>
/// <returns>
/// The text content of every word found, each followed by a line terminator.
/// </returns>
public static string GetOnlyNumbersFromPdfPage(Vintasoft.Imaging.Pdf.Tree.PdfPage page)
{
    // split the page text into words with the custom predicates
    Vintasoft.Imaging.Text.TextRegion[] numberRegions =
        page.TextRegion.GetWords(WordCharacterPredicate, WordDelimiterPredicate, WhiteSpaceCharacterPredicate);

    // collect the text of each word on its own line
    System.Text.StringBuilder output = new System.Text.StringBuilder();
    for (int regionIndex = 0; regionIndex < numberRegions.Length; regionIndex++)
    {
        output.AppendLine(numberRegions[regionIndex].TextContent);
    }

    return output.ToString();
}
/// <summary>
/// Decides whether the symbol at the given index may be part of a word.
/// A symbol is accepted when it is a number character, or when it is a
/// punctuation character whose immediate neighbours in the line are both
/// number characters.
/// </summary>
/// <param name="lineSymbols">The symbols of the text line being processed.</param>
/// <param name="symbolIndex">Index, in <paramref name="lineSymbols"/>, of the symbol to test.</param>
/// <returns><c>true</c> if the symbol is accepted as a word character; otherwise, <c>false</c>.</returns>
public static bool WordCharacterPredicate(
    Vintasoft.Imaging.Text.TextRegionSymbol[] lineSymbols,
    int symbolIndex)
{
    char current = lineSymbols[symbolIndex].TextSymbol.Symbol;

    // number characters are always accepted
    if (System.Char.IsNumber(current))
    {
        return true;
    }

    // anything that is neither a number nor punctuation is rejected
    if (!System.Char.IsPunctuation(current))
    {
        return false;
    }

    // punctuation is accepted only between two number characters;
    // a missing neighbour (start or end of the line) counts as "not a number"
    bool numberBefore = symbolIndex > 0
        && System.Char.IsNumber(lineSymbols[symbolIndex - 1].TextSymbol.Symbol);
    bool numberAfter = symbolIndex < lineSymbols.Length - 1
        && System.Char.IsNumber(lineSymbols[symbolIndex + 1].TextSymbol.Symbol);

    return numberBefore && numberAfter;
}
/// <summary>
/// Word delimiter predicate that never reports a delimiter.
/// </summary>
/// <param name="lineSymbols">The symbols of the text line being processed (not inspected).</param>
/// <param name="symbolIndex">Index of the symbol to test (not inspected).</param>
/// <returns>Always <c>false</c>.</returns>
public static bool WordDelimiterPredicate(
    Vintasoft.Imaging.Text.TextRegionSymbol[] lineSymbols,
    int symbolIndex)
{
    // no symbol is treated as a word delimiter, so punctuation marks
    // are not extracted as delimiters
    return false;
}
/// <summary>
/// Decides whether the symbol at the given index is a white-space character.
/// </summary>
/// <param name="lineSymbols">The symbols of the text line being processed.</param>
/// <param name="symbolIndex">Index, in <paramref name="lineSymbols"/>, of the symbol to test.</param>
/// <returns><c>true</c> if the symbol is white space; otherwise, <c>false</c>.</returns>
public static bool WhiteSpaceCharacterPredicate(
    Vintasoft.Imaging.Text.TextRegionSymbol[] lineSymbols,
    int symbolIndex)
{
    // the classification is delegated to the framework's white-space test
    return System.Char.IsWhiteSpace(lineSymbols[symbolIndex].TextSymbol.Symbol);
}
Целевые платформы: .NET 8; .NET 7; .NET 6; .NET Framework 4.8, 4.7, 4.6, 4.5, 4.0, 3.5