
    Āi{C                       d Z ddlmZ ddlZddlZddlZddlmZm	Z	 erddl
mZ  ej                  d      Z ej                  d      Zej                  j!                  d      Zej                  j%                  e      Zej(                  j+                  e       eej,                  d	<    ej                  d
      e_         ej                  d      e_         ej                  d      e_        ej2                  e_         ej                  dej6                        e_         ej                  dej6                        e_         ej                  d      Z G d dej>                        Z y)a  
This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
A copy is imported rather than the module being directly imported as this ensures that the user can import
and  use the unmodified library for their own needs.
    )annotationsN)TYPE_CHECKINGSequence)Markdownz--!?>z-?>zhtml.parser
htmlparserz<[a-zA-Z]|</>z\?>z&([a-zA-Z][-.a-zA-Z0-9]*);a  
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
a  
  [a-zA-Z][^`\t\n\r\f />]*           # tag name
  [\t\n\r\f /]*                     # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:=                            # value indicator
      (?:'[^']*'                    # LITA-enclosed value
        |"[^"]*"                    # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*     # bare value
       )
     )?
    [\t\n\r\f /]*                   # possibly followed by a space
   )*
   >?
z^([ ]*\n){2}c                      e Zd ZU dZd fdZ fdZ fdZedd       ZddZ	d dZ
d!dZd"d	Zd#d
Zd$dZd"dZd%dZd%dZd#dZd& fdZd#dZd#dZd#dZd' fdZd(dZd' fdZd)d* fdZdZded<   d+dZd'dZ xZS ),HTMLExtractorz
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    c                    d|vrd|d<   t        dg      | _        dg| _        d| _        d| _        t        |   |i | || _        y )Nconvert_charrefsFhrr   )set
empty_tagslineno_start_cacheoverride_comment_updateoverride_comment_startsuper__init__md)selfr   argskwargs	__class__s       W/var/www/html/content-pipeline/venv/lib/python3.12/site-packages/markdown/htmlparser.pyr   zHTMLExtractor.__init__h   sZ    V+).F%& tf+#$#',$&'# 	$)&)    c                    d| _         d| _        g | _        g | _        g | _        dg| _        d| _        d| _        t        | %          y)z1Reset this instance.  Loses all unprocessed data.Fr   N)
inrawintailstack_cachecleandocr   r   r   r   resetr   r   s    r   r!   zHTMLExtractor.resetx   sI    
 "
!##%#$#&'#',$r   c                   t         |           t        | j                        rb| j                  r;| j
                  s/| j                  t        j                  | j                               n| j                  | j                         t        | j                        r_| j                  j                  | j                  j                  j                  dj                  | j                                     g | _	        yy)zHandle any buffered data. N)r   closelenrawdatar   
cdata_elemhandle_datar   unescaper   r    appendr   	htmlStashstorejoinr"   s    r   r%   zHTMLExtractor.close   s    t|| $$T__  !4!4T\\!BC  .t{{MM  !2!2!8!89M!NODK r   c                h   t        t        | j                        dz
  | j                  dz
        D ]e  }| j                  |   }| j                  j                  d|      }|dk(  rt        | j                        }| j                  j                  |dz          g | j                  | j                  dz
     S )zHReturns char index in `self.rawdata` for the start of the current line.    
)ranger&   r   linenor'   findr+   )r   iilast_line_start_poslf_poss       r   line_offsetzHTMLExtractor.line_offset   s     D334Q6AF 	5B"&"9"9""=\\&&t-@AF|T\\*##**6!84	5 &&t{{1}55r   c                    | j                   dk(  ry| j                   dkD  ry| j                  | j                  | j                  | j                   z    j                         dk(  S )z
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        r   T   Fr$   )offsetr'   r9   stripr   s    r   at_line_startzHTMLExtractor.at_line_start   sV     ;;!;;?||D,,T-=-=-KLRRTXZZZr   c                    | j                   | j                  z   }t        j                  j	                  | j
                  |      }|r| j
                  ||j                          S dj                  |      S )z
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        z</{}>)r9   r<   r   	endendtagsearchr'   endformat)r   tagstartms       r   get_endtag_textzHTMLExtractor.get_endtag_text   s_       4;;.  ''e<<<aeeg.. >>#&&r   c                *   || j                   v r| j                  ||       y | j                  j                  |      rJ| j                  s| j                         r.| j                  s"d| _        | j                  j                  d       | j                         }| j                  r7| j                  j                  |       | j                  j                  |       y | j                  j                  |       || j                  v r| j                          y y )NTr1   )r   handle_startendtagr   is_block_levelr   r?   r   r    r+   get_starttag_textr   r   CDATA_CONTENT_ELEMENTSclear_cdata_mode)r   rE   attrstexts       r   handle_starttagzHTMLExtractor.handle_starttag   s    $//!##C/77!!#&DKKD<N<N<PY]YcYcDJMM  &%%'::JJc"KKt$MM  &d111%%' 2r   c                :   | j                  |      }| j                  rb| j                  j                  |       || j                  v r7| j                  r+| j                  j                         |k(  rn| j                  r+t        | j                        dk(  rt        j                  | j                  | j                  | j                  z   t        |      z   d        r| j                  j                  d       nd| _        d| _        | j                  j                  | j                  j                  j!                  dj#                  | j                                     | j                  j                  d       g | _        y y | j                  j                  |       y )Nr   r1   TFr$   

)rH   r   r   r+   r   popr&   blank_line_rematchr'   r9   r<   r   r    r   r,   r-   r.   )r   rE   rP   s      r   handle_endtagzHTMLExtractor.handle_endtag   s,   ##C(::KKt$djj jjzz~~'3. jj 4::!# &&t||D4D4Dt{{4RUXY]U^4^4_'`aKK&&t, #'DK"
$$TWW%6%6%<%<RWWT[[=Q%RS$$V,  $ MM  &r   c                    | j                   rd|v rd| _         | j                  r| j                  j                  |       y | j                  j                  |       y )Nr1   F)r   r   r   r+   r    r   datas     r   r)   zHTMLExtractor.handle_data   sA    ;;44<DK::KKt$MM  &r   c                   | j                   s| j                  r| j                  j                  |       y| j	                         r	|rt
        j                  | j                  | j                  | j                  z   t        |      z   d       r|dz  }nd| _        | j                  r| j                  d   nd}|j                  d      s,|j                  d      r| j                  j                  d       | j                  j                  | j                  j                  j                  |             | j                  j                  d       y| j                  j                  |       y)z Handle empty tags (`<data>`). Nr1   Tr2   r$   rS   )r   r   r   r+   r?   rU   rV   r'   r9   r<   r&   r    endswithr   r,   r-   )r   rZ   is_blockitems       r   handle_empty_tagzHTMLExtractor.handle_empty_tag   s    ::KKt$!h""4<<0@0@4;;0NQTUYQZ0Z0[#\] #(,4==$2D==(T]]4-@$$T*MM  !2!2!8!8!>?MM  (MM  &r   c                x    | j                  | j                         | j                  j                  |             y )Nr]   )r_   rL   r   rK   )r   rE   rO   s      r   rJ   z HTMLExtractor.handle_startendtag  s.    d446AWAWX[A\]r   c                H    | j                  dj                  |      d       y )Nz&#{};Fra   r_   rD   r   names     r   handle_charrefzHTMLExtractor.handle_charref  s    gnnT2UCr   c                H    | j                  dj                  |      d       y )Nz&{};Fra   rc   rd   s     r   handle_entityrefzHTMLExtractor.handle_entityref  s    fmmD1EBr   c                    t        | j                        t        |      z
  }|dz
  }| j                  || dk(  r | j                  d       || _        d| _        y | j                  dj                  |      d       y )N   z</<Tz	<!--{}-->ra   )r&   r'   r)   r   r   r_   rD   )r   rZ   jis       r   handle_commentzHTMLExtractor.handle_comment  st    D	)E<<!$S!*+D'+/D(k006Fr   c                    | j                   r"d| _         | j                  }| j                  dz   }t        |   ||      S )NFr0   )r   r   r   	updatepos)r   rm   rl   r   s      r   rp   zHTMLExtractor.updatepos   sA    ''+0D(++A++a/Aw A&&r   c                H    | j                  dj                  |      d       y )Nz<!{}>Tra   rc   rY   s     r   handle_declzHTMLExtractor.handle_decl'  s    gnnT2TBr   c                H    | j                  dj                  |      d       y )Nz<?{}?>Tra   rc   rY   s     r   	handle_pizHTMLExtractor.handle_pi*  s    hood3dCr   c                t    |j                  d      rdnd}| j                  dj                  ||      d       y )NzCDATA[z]]>z]>z<![{}{}Tra   )
startswithr_   rD   )r   rZ   rC   s      r   unknown_declzHTMLExtractor.unknown_decl-  s4    x0edi..tS9DIr   c                    | j                         s| j                  rt        |   |      S | j	                  d       |dz   S )Nz<?rj   )r?   r   r   parse_pir)   )r   rm   r   s     r   ry   zHTMLExtractor.parse_pi1  s>    4;;7#A&& 	1ur   c                    | j                   }|j                  d|      sJ d       t        j                  ||dz         }|s| j	                  d       |dz   S |r'|j                         }| j                  ||dz   |        |j                         S )Nz<!--z"unexpected call to parse_comment()   rk   r0   )r'   rv   commentcloserB   r)   rF   rn   rC   )r   rm   reportr'   rV   rl   s         r   parse_commentzHTMLExtractor.parse_comment;  s    ,,!!&!,R.RR,##GQqS1S!q5LA!Q0yy{r   c                V   | j                         s| j                  rw| j                  ||dz    dk(  rS| j                  ||dz    dk(  s>| j                  |      }|dk(  r&| j	                  | j                  ||dz           |dz   S |S t
        |   |      S | j	                  d       |dz   S )	Nr;   z<![	   z	<![CDATA[r2   r0   z<!rj   )r?   r   r'   parse_bogus_commentr)   r   parse_html_declaration)r   rm   resultr   s      r   r   z$HTMLExtractor.parse_html_declarationG  s    4;;||Aac"e+DLL1Q34G;4V 11!4R<$$T\\!AE%:;q5L71!44 	1ur   c                t    t         |   ||      }|dk(  ry| j                  | j                  || d       |S )Nr2   Fra   )r   r   r_   r'   )r   rm   r}   posr   s       r   r   z!HTMLExtractor.parse_bogus_commentW  sC     g)!V4"9dll1S1EB
r   Nz
str | None_HTMLExtractor__starttag_textc                    | j                   S )z)Return full source of start tag: `<...>`.)r   r>   s    r   rL   zHTMLExtractor.get_starttag_textf  s    ###r   c                   | j                   ||dz    dk(  r&| j                  | j                   ||dz           |dz   S d | _        | j                  |      }|dk  r&| j                  | j                   ||dz           |dz   S | j                   }||| | _        g }t        j
                  j                  ||dz         }|sJ d       |j                         }|j                  d      j                         x| _
        }||k  rt        j                  j                  ||      }|sn|j                  ddd      \  }	}
}|
sd }n,|d d dcxk(  r|dd  k(  sn |d d d	cxk(  r|dd  k(  rn n|dd }|rt        j                  |      }|j                  |	j                         |f       |j                         }||k  r||| j                         }|d
vr| j                         \  }}d| j                  v rP|| j                  j!                  d      z   }t#        | j                        | j                  j%                  d      z
  }n|t#        | j                        z   }| j                  |||        |S |j'                  d      r| j)                  ||       |S || j*                  v r| j-                  |       | j/                  ||       |S )Nr;   z</>r   r0   z#unexpected call to parse_starttag()rj   'r2   ")>/>r1   r   )r'   r)   r   check_for_whole_start_tagr   tagfind_tolerantrV   rC   grouplowerlasttagattrfind_tolerantr*   r+   r=   getposcountr&   rfindr\   rJ   rM   set_cdata_moderQ   )r   rm   endposr'   rO   rV   krE   rG   attrnamerest	attrvaluerC   r4   r<   s                  r   parse_starttagzHTMLExtractor.parse_starttagj  s   <<!a% E)T\\!AE23q5L#//2A:T\\!AE23q5L,,&q0 ++11'1Q3?;;;uIIK"[[^1133s&j,,227A>A()1a(8%HdI 	2A$8)BC.82A#7237%aO	&//	:	LL(..*I67A &j a%%'k!![[]NFFt+++$"6"6"<"<T"BBT112//55d;<  #d&:&:";;WQv./M<<##C/  d111##C(  e,r   )r   r   )returnint)r   bool)rE   strr   r   )rE   r   rO   zSequence[tuple[str, str]])rE   r   )rZ   r   )rZ   r   r]   r   )re   r   )rm   r   rl   r   r   r   )rm   r   r   r   )T)r   )rm   r   r}   r   r   r   )r   r   ) __name__
__module____qualname____doc__r   r!   r%   propertyr9   r?   rH   rQ   rW   r)   r_   rJ   rf   rh   rn   rp   rr   rt   rw   ry   r~   r   r   r   __annotations__rL   r   __classcell__)r   s   @r   r	   r	   _   s      
6 
6['(*'6''.^DC	G'CDJ
  #'OZ&$6r   r	   )!r   
__future__r   reimportlib.util	importlibsystypingr   r   markdownr   compiler|   commentabruptcloseutil	find_specspecmodule_from_specr   loaderexec_modulemodulesstarttagopenpiclose	entityref
incompleteVERBOSElocatestarttagend_tolerantlocatetagendrU   
HTMLParserr	    r   r   <module>r      sF  ( # 	  
 *! rzz(#RZZ'  ~~.^^,,T2
   
 #&L  %"**_5
   RZZ'
 !rzz"?@
  #,,
 (2

 4 ZZ)
 % %"** & ZZ
 " 

?+AJ)) Ar   