
    w;3h 1                         d Z ddlZddlZddlZddlmZ ddlmZ	 ddl
mZ ddlmZ ej                  Z ej                  e      Z G d de	j$                        Z G d de	j$                        Zy)	)PdfTextPagePdfTextSearcher    N)PdfiumError)PDFIUM_INFOc                   t     e Zd ZdZ fdZed        ZddZddZddZ	d Z
ddZd	 Zdd
Zd ZddZ xZS )r   z
    Text page helper class.
    
    Attributes:
        raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
        page (PdfPage): Reference to the page this textpage belongs to.
    c                 \    || _         || _        t        |   t        j
                         y N)rawpagesuper__init__pdfium_cFPDFText_ClosePage)selfr
   r   	__class__s      [/var/www/html/audio-gradio/venv/lib/python3.12/site-packages/pypdfium2/_helpers/textpage.pyr   zPdfTextPage.__init__   s$    	445    c                     | j                   S r	   )r   r   s    r   parentzPdfTextPage.parent!   s    yyr   c                     ||kD  ryt        j                  | |      }|dk(  r| j                  |dz   ||dz   |      S t        j                  | |      }|dk(  r| j                  ||dz
  ||dz         S ||||fS )Nr      )r   "FPDFText_GetTextIndexFromCharIndex_get_active_text_range)r   c_startc_end	l_passive	r_passivet_startt_ends          r   r   z"PdfTextPage._get_active_text_range&   s    U?==dGLb=..wqy%1iXX;;D%HB;..waIVWKXXy)33r   c                    ||fdk(  r)|s't        j                  d       | j                  |      S |dk(  r| j                         |z
  }| j	                  |||z   dz
        }|dk(  ry|\  }}}}	||z  }|||	z   z  }|dz   |z
  }
dt
        j                  cxk  rd	k  rn n|
d
z  }
|
dz  }
t        j                  |
d
z        }t        j                  |t        j                  t        j                              }t        j                  | |||      }|
|k\  sJ d|
 d|        |j                  d|dz
  d
z   j                  d|      S )a  
        Warning:
            .. versionchanged:: 4.28
               For various reasons, calling this method with default params now implicitly translates to :meth:`.get_text_bounded` (pass ``force_this=True`` to circumvent).
        
        Extract text from a given range.
        
        Parameters:
            index (int): Index of the first char to include.
            count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
            errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text in the range in question, or an empty string if no text was found.
        
        Note:
            * The returned text's length does not have to match *count*, even if it will for most PDFs.
              This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
              This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
              Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
            * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
        r   r   z]get_text_range() call with default params will be implicitly redirected to get_text_bounded()errorsr   r   r    i  i     zBuffer too small: z vs N	utf-16-le)warningswarnget_text_boundedcount_charsr   r   buildctypescreate_string_buffercastPOINTERc_ushortr   FPDFText_GetTextr
   decode)r   indexcountr%   
force_thisactive_ranger    r!   r   r   in_countbuffer
buffer_ptr	out_counts                 r   get_text_rangezPdfTextPage.get_text_range6   sd   2 5>W$ZMMyz(((77B;$$&.E 225%+a-H1 0<,	9Y&&7W$
 +##*d*MHA,,X\:[[)HI
--dE5*M	9$T(:8*D&TT$zz*IaK?+22;v2NNr   c                    | j                   j                         }||d   }||d   }||d   }||d   }| ||||f}t        j                  g |dd }|dk  ryt	        j
                  |dz        }	t	        j                  |	t	        j                  t        j                              }
t        j                  g ||
|  |	j                  j                  d|      S )	a  
        Extract text from given boundaries in PDF coordinates.
        If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
        
        Parameters:
            errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text on the page area in question, or an empty string if no text was found.
        Nr   r   r'      r&   r(   r$   )r   get_bboxr   FPDFText_GetBoundedTextr.   r/   r0   r1   r2   r
   r4   )r   leftbottomrighttopr%   bboxargsn_charsr:   r;   s              r   r+   zPdfTextPage.get_text_boundedq   s     yy!!#<7D>!WF=GE;q'CdC/22BDB$BBa<,,Wq[9[[)HI
((D$D
DGDzz  V <<r   c                 P    t        j                  |       }|dk(  rt        d      |S )zV
        Returns:
            int: The number of characters on the text page.
        r   zFailed to get character count.)r   FPDFText_CountCharsr   )r   rH   s     r   r,   zPdfTextPage.count_chars   s,    
 ..t4b=>??r   c                 T    t        j                  | ||      }|dk(  rt        d      |S )a  
        Parameters:
            index (int): Start character index.
            count (int): Character count to consider (defaults to -1 for all remaining).
        Returns:
            int: The number of text rectangles in the given character range.
        r   zFailed to count rectangles.)r   FPDFText_CountRectsr   )r   r5   r6   n_rectss       r   count_rectszPdfTextPage.count_rects   s0     ..tUEBb=;<<r   c                 D    t        j                  | ||||      }|dk  ry|S )a  
        Get the index of a character by position.
        
        Parameters:
            x (float): Horizontal position (in PDF canvas units).
            y (float): Vertical position.
            x_tol (float): Horizontal tolerance.
            y_tol (float): Vertical tolerance.
        Returns:
            int | None: The index of the character at or nearby the point (x, y).
            May be None if there is no character or an error occurred.
        r   N)r   FPDFText_GetCharIndexAtPos)r   xyx_toly_tolr5   s         r   	get_indexzPdfTextPage.get_index   s+     33D!QuM19r   c                    |r_t        j                         }t        j                  | ||      }|j                  |j                  |j
                  |j                  f\  }}}}nxt               t               t               t               f\  }}}}t        j                  | |||||      }|j                  |j                  |j                  |j                  f\  }}}}|st        d      ||||fS )a  
        Get the bounding box of a single character.
        
        Parameters:
            index (int):
                Index of the character to work with, in the page's character array.
            loose (bool):
                Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zFailed to get charbox.)r   FS_RECTFFPDFText_GetLooseCharBoxrB   rC   rD   rE   c_doubleFPDFText_GetCharBoxvaluer   )	r   r5   looserectoklbrts	            r   get_charboxzPdfTextPage.get_charbox   s     $$&D224EBDKKTXXEJAq!Q!XZXZGJAq!Q--dE1aAFB!''177AGG;JAq!Q677!Qzr   c                    t               t               t               t               f\  }}}}t        j                  | |||||      }|st        d      |j                  |j                  |j                  |j                  fS )al  
        Get the bounding box of a text rectangle at the given index.
        Note that :meth:`.count_rects` must be called once with default parameters
        before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).
        
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zzFailed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.))rY   r   FPDFText_GetRectr   r[   )r   r5   r_   r`   ra   rb   r^   s          r   get_rectzPdfTextPage.get_rect   ss     ZXZC
1a&&tUAq!Q?  [  \  \!''17733r   c                    t        |      dk(  rt        d      d}|r|t        j                  z  }|r|t        j                  z  }|r|t        j
                  z  }|dz   j                  d      }t        j                  |t        j                  t        j                              }t        j                  | |||      }	t        |	|       }
| j                  |
       |
S )au  
        Locate text on the page.
        
        Parameters:
            text (str):
                The string to search for.
            index (int):
                Character index at which to start searching.
            match_case (bool):
                If True, the search will be case-specific (upper and lower letters treated as different characters).
            match_whole_word (bool):
                If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
            consecutive (bool):
                If False (the default), :meth:`.search` will skip past the current match to look for the next match.
                If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
        Returns:
            PdfTextSearcher: A helper object to search text.
        r   z#Text length must be greater than 0. r(   )len
ValueErrorr   FPDF_MATCHCASEFPDF_MATCHWHOLEWORDFPDF_CONSECUTIVEencoder.   r0   r1   r2   FPDFText_FindStartr   _add_kid)r   textr5   
match_casematch_whole_wordconsecutiveflagsenc_textenc_text_ptrraw_searchersearchers              r   searchzPdfTextPage.search   s    ( t9>BCCX,,,EX111EX...E6M))+6{{8V^^FOO-LM224ueT"<6hr   )r   r   )r   r   ignoreF)NNNNr{   r#   )F)r   FFF)__name__
__module____qualname____doc__r   propertyr   r   r=   r+   r,   rN   rU   rc   rf   rz   __classcell__r   s   @r   r   r      sQ    6
  4 8Ov=@&84 $r   r   c                   D     e Zd ZdZ fdZed        Zd Zd Zd Z	 xZ
S )r   z
    Text searcher helper class.
    
    Attributes:
        raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
        textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
    c                 \    || _         || _        t        |   t        j
                         y r	   )r
   textpager   r   r   FPDFText_FindClose)r   r
   r   r   s      r   r   zPdfTextSearcher.__init__  s$     445r   c                     | j                   S r	   )r   r   s    r   r   zPdfTextSearcher.parent  s    }}r   c                 t     ||       }|sy t        j                  |       }t        j                  |       }||fS r	   )r   FPDFText_GetSchResultIndexFPDFText_GetSchCount)r   	find_funcr^   r5   r6   s        r   _get_occurrencezPdfTextSearcher._get_occurrence#  s;    t_33D9--d3e|r   c                 @    | j                  t        j                        S )z
        Returns:
            (int, int): Start character index and count of the next occurrence,
            or None if the last occurrence was passed.
        )r   r   FPDFText_FindNextr   s    r   get_nextzPdfTextSearcher.get_next+       ##H$>$>??r   c                 @    | j                  t        j                        S )z
        Returns:
            (int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
            or None if the last occurrence was passed.
        )r   r   FPDFText_FindPrevr   s    r   get_prevzPdfTextSearcher.get_prev3  r   r   )r|   r}   r~   r   r   r   r   r   r   r   r   r   s   @r   r   r     s2    6
  @@r   r   )__all__r.   loggingr)   pypdfium2.rawr
   r   pypdfium2.internalinternalpdfium_ipypdfium2._helpers.miscr   pypdfium2.versionr   rY   	getLoggerr|   loggerAutoCloseabler   r    r   r   <module>r      se    -      % / )??			8	$z8)) zz)@x-- )@r   