VintaSoft Imaging .NET SDK 14.0: Документация для .NET разработчика
Vintasoft.Imaging.Text Namespace / TextRegion Class / GetWords Methods / GetWords(TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate) Method
Синтаксис Пример Требования Смотрите также
В этом разделе
    GetWords(TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate,TextRegionLineSymbolPredicate) Метод (TextRegion)
    В этом разделе
    Возвращает слова этой текстовой области.
    Синтаксис
    'Declaration
    
    Public Overloads Function GetWords( _
       ByVal wordCharacterPredicate As TextRegionLineSymbolPredicate, _
       ByVal wordDelimiterPredicate As TextRegionLineSymbolPredicate, _
       ByVal whiteSpaceCharacterPredicate As TextRegionLineSymbolPredicate _
    ) As TextRegion()

    Параметры

    wordCharacterPredicate
    Предикат символа слова, определяющий допустимые символы в словах.
    wordDelimiterPredicate
    Предикат разделителя слов, определяющий разделители слов.
    whiteSpaceCharacterPredicate
    Предикат символа пробела, определяющий символы пробела.

    Возвращаемое значение

    Массив TextRegion, который определяет слова.
    Пример

    Ниже приведён код на C#/VB.NET, который демонстрирует, как извлечь из PDF-страницы только числа.

    
    ''' <summary>
    ''' Extracts the words of a PDF page that are accepted by the number-oriented
    ''' predicates of this sample and returns them as text, one word per line.
    ''' </summary>
    ''' <param name="page">PDF page whose text region is searched for words.</param>
    ''' <returns>
    ''' The text content of every extracted word, each followed by a line terminator.
    ''' </returns>
    Public Shared Function GetOnlyNumbersFromPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage) As String
        ' split the page text into words using the three predicates defined in this sample
        Dim numberWords As Vintasoft.Imaging.Text.TextRegion() = page.TextRegion.GetWords(AddressOf WordCharacterPredicate, AddressOf WordDelimiterPredicate, AddressOf WhiteSpaceCharacterPredicate)

        ' write the text of each word on its own line
        Dim output As New System.Text.StringBuilder()
        For wordIndex As Integer = 0 To numberWords.Length - 1
            output.AppendLine(numberWords(wordIndex).TextContent)
        Next

        Return output.ToString()
    End Function
    
    ''' <summary>
    ''' Decides whether the symbol at the given index is a valid word character
    ''' when only numbers are to be extracted.
    ''' </summary>
    ''' <param name="lineSymbols">The line symbols.</param>
    ''' <param name="symbolIndex">Index of the symbol to test within <paramref name="lineSymbols"/>.</param>
    ''' <returns>
    ''' <c>True</c> if the symbol is a number character, or a punctuation character
    ''' whose immediate left and right neighbours are both number characters;
    ''' otherwise <c>False</c>.
    ''' </returns>
    Public Shared Function WordCharacterPredicate(lineSymbols As Vintasoft.Imaging.Text.TextRegionSymbol(), symbolIndex As Integer) As Boolean
        Dim current As Char = lineSymbols(symbolIndex).TextSymbol.Symbol

        ' number characters are always accepted
        If System.[Char].IsNumber(current) Then
            Return True
        End If

        ' anything that is neither a number nor punctuation is rejected
        If Not System.[Char].IsPunctuation(current) Then
            Return False
        End If

        ' a neighbour that does not exist (start or end of the line) is treated as a space
        Dim leftNeighbour As Char = If(symbolIndex > 0, lineSymbols(symbolIndex - 1).TextSymbol.Symbol, " "c)
        Dim rightNeighbour As Char = If(symbolIndex < lineSymbols.Length - 1, lineSymbols(symbolIndex + 1).TextSymbol.Symbol, " "c)

        ' punctuation is kept only when it sits between two number characters
        Return System.[Char].IsNumber(leftNeighbour) AndAlso System.[Char].IsNumber(rightNeighbour)
    End Function
    
    ''' <summary>
    ''' The word delimiter predicate. This sample never reports a symbol as a
    ''' word delimiter.
    ''' </summary>
    ''' <param name="lineSymbols">The line symbols (not inspected).</param>
    ''' <param name="symbolIndex">Index of the symbol (not inspected).</param>
    ''' <returns>Always <c>False</c>.</returns>
    Public Shared Function WordDelimiterPredicate(lineSymbols As Vintasoft.Imaging.Text.TextRegionSymbol(), symbolIndex As Integer) As Boolean
        ' no symbol is treated as a word delimiter, so punctuation marks
        ' are not extracted as separate delimiter words
        Return False
    End Function
    
    ''' <summary>
    ''' Decides whether the symbol at the given index is a white space character.
    ''' </summary>
    ''' <param name="lineSymbols">The line symbols.</param>
    ''' <param name="symbolIndex">Index of the symbol to test within <paramref name="lineSymbols"/>.</param>
    ''' <returns>
    ''' <c>True</c> if <c>System.Char.IsWhiteSpace</c> classifies the symbol as
    ''' white space; otherwise <c>False</c>.
    ''' </returns>
    Public Shared Function WhiteSpaceCharacterPredicate(lineSymbols As Vintasoft.Imaging.Text.TextRegionSymbol(), symbolIndex As Integer) As Boolean
        ' classify the symbol's character directly
        Return System.[Char].IsWhiteSpace(lineSymbols(symbolIndex).TextSymbol.Symbol)
    End Function
    
    
    
    /// <summary>
    /// Extracts the words of a PDF page that are accepted by the number-oriented
    /// predicates of this sample and returns them as text, one word per line.
    /// </summary>
    /// <param name="page">PDF page whose text region is searched for words.</param>
    /// <returns>
    /// The text content of every extracted word, each followed by a line terminator.
    /// </returns>
    public static string GetOnlyNumbersFromPdfPage(Vintasoft.Imaging.Pdf.Tree.PdfPage page)
    {
        // split the page text into words using the three predicates defined in this sample
        Vintasoft.Imaging.Text.TextRegion[] numberWords = page.TextRegion.GetWords(
            WordCharacterPredicate,
            WordDelimiterPredicate,
            WhiteSpaceCharacterPredicate);

        // write the text of each word on its own line
        System.Text.StringBuilder output = new System.Text.StringBuilder();
        for (int wordIndex = 0; wordIndex < numberWords.Length; wordIndex++)
        {
            output.AppendLine(numberWords[wordIndex].TextContent);
        }

        return output.ToString();
    }
    
    /// <summary>
    /// Decides whether the symbol at the given index is a valid word character
    /// when only numbers are to be extracted.
    /// </summary>
    /// <param name="lineSymbols">The line symbols.</param>
    /// <param name="symbolIndex">Index of the symbol to test within <paramref name="lineSymbols"/>.</param>
    /// <returns>
    /// <c>true</c> if the symbol is a number character, or a punctuation character
    /// whose immediate left and right neighbours are both number characters;
    /// otherwise <c>false</c>.
    /// </returns>
    public static bool WordCharacterPredicate(
       Vintasoft.Imaging.Text.TextRegionSymbol[] lineSymbols,
       int symbolIndex)
    {
        char current = lineSymbols[symbolIndex].TextSymbol.Symbol;

        // number characters are always accepted
        if (System.Char.IsNumber(current))
        {
            return true;
        }

        // anything that is neither a number nor punctuation is rejected
        if (!System.Char.IsPunctuation(current))
        {
            return false;
        }

        // a neighbour that does not exist (start or end of the line) is treated as a space
        char leftNeighbour = symbolIndex > 0
            ? lineSymbols[symbolIndex - 1].TextSymbol.Symbol
            : ' ';
        char rightNeighbour = symbolIndex < lineSymbols.Length - 1
            ? lineSymbols[symbolIndex + 1].TextSymbol.Symbol
            : ' ';

        // punctuation is kept only when it sits between two number characters
        return System.Char.IsNumber(leftNeighbour) && System.Char.IsNumber(rightNeighbour);
    }
    
    /// <summary>
    /// The word delimiter predicate. This sample never reports a symbol as a
    /// word delimiter.
    /// </summary>
    /// <param name="lineSymbols">The line symbols (not inspected).</param>
    /// <param name="symbolIndex">Index of the symbol (not inspected).</param>
    /// <returns>Always <c>false</c>.</returns>
    public static bool WordDelimiterPredicate(
       Vintasoft.Imaging.Text.TextRegionSymbol[] lineSymbols,
       int symbolIndex)
    {
        // no symbol is treated as a word delimiter, so punctuation marks
        // are not extracted as separate delimiter words
        return false;
    }
    
    /// <summary>
    /// Decides whether the symbol at the given index is a white space character.
    /// </summary>
    /// <param name="lineSymbols">The line symbols.</param>
    /// <param name="symbolIndex">Index of the symbol to test within <paramref name="lineSymbols"/>.</param>
    /// <returns>
    /// <c>true</c> if <c>System.Char.IsWhiteSpace</c> classifies the symbol as
    /// white space; otherwise <c>false</c>.
    /// </returns>
    public static bool WhiteSpaceCharacterPredicate(
       Vintasoft.Imaging.Text.TextRegionSymbol[] lineSymbols,
       int symbolIndex)
    {
        // classify the symbol's character directly
        return System.Char.IsWhiteSpace(lineSymbols[symbolIndex].TextSymbol.Symbol);
    }
    
    

    Требования

    Целевые платформы: .NET 8; .NET 7; .NET 6; .NET Framework 4.8, 4.7, 4.6, 4.5, 4.0, 3.5

    Смотрите также