
    i%                     B   d Z ddlZddlZddlmZ ddlZddlZddlmZ  ej                  e
      Zg dZg dg ddd	gdd	gd
dgd
gd
dgdZdZddddZdZdedefdZd dedee   defdZdedefdZd dedee   defdZd dedee   defdZdedefdZdedefdZy)!z1
Markdown to HTML conversion with CTA injection.
    N)Optional)TocExtension) h1h2h3h4h5h6pbrhrulollistrongembiua
blockquotecodepretabletheadtbodytrthtdimgfigure
figcaptiondivspan)hreftitletargetrel)srcaltr&   widthheightcolspanrowspanclassid)r   r    r   r   r#   r$   *u   
<div class="content-cta">
    <h3>Interessiert an ANPR-Lösungen?</h3>
    <p>Erfahren Sie, wie moderne Kennzeichenerkennung Ihr Parkraummanagement revolutionieren kann.</p>
    <a href="/kontakt/" class="btn">Mehr erfahren →</a>
</div>
u  
<div class="content-cta">
    <h3>ANPR-Parkraummanagement für 0 €?</h3>
    <p>Parketry übernimmt Kameras, Installation, Betrieb und Wartung — komplett kostenfrei für Grundstückseigentümer.</p>
    <a href="/kontakt/" class="btn">Kostenfreies Angebot anfragen →</a>
</div>
u'  
<div class="content-cta">
    <h3>Alle Kosten, null Risiko</h3>
    <p>Während klassische Anbieter fünfstellige Investitionen verlangen, übernimmt Parketry sämtliche Kosten — von der Kamera bis zur Wartung.</p>
    <a href="/kontakt/" class="btn">Unverbindlich vergleichen →</a>
</div>
u  
<div class="content-cta">
    <h3>Parkprobleme lösen — ohne Investitionsrisiko</h3>
    <p>Parketry liefert die komplette ANPR-Lösung kostenfrei. Sie stellen nur Strom und Internet bereit.</p>
    <a href="/kontakt/" class="btn">Jetzt Lösung anfragen →</a>
</div>
)kosten	vergleichproblemeu   
<div class="content-cta">
    <h3>Bereit für den nächsten Schritt?</h3>
    <p>Lassen Sie sich unverbindlich beraten. Unsere Experten analysieren Ihre Situation und zeigen konkrete Lösungswege auf.</p>
    <a href="/kontakt/" class="btn">Jetzt Beratungstermin anfragen →</a>
</div>
markdown_textreturnc                     t        j                  dd|       } t        j                  ddt	        d      g      }|j                  |       }t        j                  |t        t        d	      }t        |      }|S )
z_
    Convert Markdown to sanitized HTML.

    Uses Python-Markdown with common extensions.
    z\[INTERNER LINK:[^\]]*\] extrasmartyF)	permalink)
extensionsT)tags
attributesstrip)resubmarkdownMarkdownr   convertbleachcleanALLOWED_TAGSALLOWED_ATTRIBUTES_process_external_links)r5   mdhtmls      8/var/www/html/content-pipeline/modules/html_converter.pyconvert_markdown_to_htmlrM   Q   sy     FF6MJM 
		5)

B ::m$D <<%	D #4(DK    content_clusterc                 L    t        |       } t        |       }t        ||      }|S )a'  
    Convert Markdown to HTML and inject CTAs.

    Injects two CTAs:
    - Middle CTA: After ~50% of H2 headings (cluster-specific for kosten/vergleich/probleme)
    - Bottom CTA: Before FAQ section (direct conversion)

    Strips the first H1 heading since WordPress uses the title field.
    )rO   )strip_first_h1rM   inject_ctas)r5   rO   rK   s      rL   convert_with_ctarS   t   s)     #=1M#M2Dt_=DKrN   c                    | j                  d      }d}t        |      D ],  \  }}|j                  d      s|j                  d      r*|} n || S ||dz   d }|re|d   j                         r|d   j                         dv r=|j	                  d       |r*|d   j                         s'|d   j                         dv r=dj                  |      S )z
    Remove the first H1 heading and any preamble before it.

    WordPress displays the title separately, so the H1 in content
    creates duplication. Also removes LLM preamble like
    "Hier ist der Artikel..." that appears before the H1.
    
Nz# z##    r   )z---z***___)split	enumerate
startswithr?   popjoin)r5   linesh1_indexr   lineresult_liness         rL   rQ   rQ      s     %E HU# 4??4 )?H
  A'L Q 5 5 7<?;P;P;RVk;k Q 5 5 7<?;P;P;RVk;k 99\""rN   rK   c                    t        t        j                  d|             }t        j	                  |t
              }d}t        |      dk\  r_t        |      dz  }||   j                         }| d| |z   dz   | |d z   } t        |      dz   }t        j                  d|dz    d	| d
       ddg}d}|D ]g  }	t        j                  |	| t        j                        }
|
s+|
j                         }| d| t        z   dz   | |d z   } t        j                  d       d} n |s!| dz   t        z   } t        j                  d       | S )a  
    Inject two CTAs: middle of content + before FAQ.

    - Middle CTA: Inserted after ~50% of H2s (only if article has 4+ H2s).
      Uses cluster-specific CTA for kosten/vergleich/probleme, generic otherwise.
    - Bottom CTA: Inserted before FAQ section, or appended at end
    	<h2[^>]*>r         N

zMiddle CTA injected before H2 #rV   z (cluster: ):   (<h2[^>]*>.*?(?:FAQ|Häufig\s+gestellte\s+Fragen).*?</h2>)2(<h2[^>]*>.*?(?:Fragen\s+und\s+Antworten).*?</h2>)Fz&Bottom CTA injected before FAQ sectionTz)Bottom CTA appended at end (no FAQ found))listr@   finditerCLUSTER_CTA_MIDDLEget
CTA_MIDDLElenstartloggerdebugsearch
IGNORECASE
CTA_BOTTOM)rK   rO   
h2_matches
middle_ctaoffset
middle_idx
middle_posfaq_patterns	faq_foundpatternmatch	faq_starts               rL   rR   rR      s]    bkk,56J $''DJ F :!_)

+113
KZ :-6jk9JJZ1$6zA~6FkRaQbbcde 	F=L
 I 		'47I
#j069D<LLDLLABI f}z)@AKrN   cta_htmlc                    |t         }ddg}|D ]c  }t        j                  || t        j                        }|s+|j	                         }| d| |z   dz   | |d z   } t
        j                  d       | c S  t        t        j                  d|             }|r;|d   j	                         }| d| |z   dz   | |d z   } t
        j                  d       | S | dz   |z   } t
        j                  d	       | S )
z
    Legacy function - inject single CTA before FAQ section.

    Kept for backwards compatibility. New code should use inject_ctas().
    Nrg   rh   re   zCTA injected before FAQ sectionrb   z*CTA injected before last H2 (no FAQ found)z3No suitable position for CTA found, appended at end)
rt   r@   rr   rs   ro   rp   rq   ri   rj   warning)rK   r   rz   r|   r}   r~   ru   last_h2_poss           rL   
inject_ctar      s     	F=L
   		'47I
#h.7$yz:JJDLL:;K bkk,56J n**,L[!H,v5[\8JJAB K f}x'LMKrN   c                 :    d }d}t        j                  |||       S )zN
    Add target="_blank" and rel="noopener noreferrer" to external links.
    c                     | j                  d      }| j                  d      }|r'|j                  d      sd|vr|j                  dd      }|S )Nr   rV   )/#zmailto:ztarget=>z+ target="_blank" rel="noopener noreferrer">)grouprZ   replace)r}   tagr%   s      rL   replace_linkz-_process_external_links.<locals>.replace_link   sK    kk!n{{1~ (=>#kk#'TU
rN   z#<a\s+href=["\']([^"\']*)["\'][^>]*>)r@   rA   )rK   r   r|   s      rL   rI   rI      s!    
	 5G66'<..rN   textc                    t        j                  dd|       } t        j                  dd|       } t        j                  dd| t         j                        } t        j                  dd|       } t        j                  dd|       } t        j                  d	d|       } t        j                  d
d|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } | j                         S )z\
    Strip markdown formatting for plain text output.
    Useful for meta descriptions.
    z```[\s\S]*?```r8   z`[^`]+`z
^#{1,6}\s+)flagsz\*\*([^*]+)\*\*z\1z\*([^*]+)\*z__([^_]+)__z	_([^_]+)_z\[([^\]]+)\]\([^)]+\)z!\[([^\]]*)\]\([^)]+\)z\s+ )r@   rA   	MULTILINEr?   )r   s    rL   strip_markdownr     s     66#R.D66*b$'D66-T>D66$eT2D66.%.D66.%.D66,t,D66*E48D66+UD9D66&#t$D::<rN   )N)__doc__loggingr@   typingr   rE   rB   markdown.extensions.tocr   	getLogger__name__rp   rG   rH   rm   rk   rt   strrM   rS   rQ   rR   r   rI   r    rN   rL   <module>r      s9    	    0			8	$	 
,5i
 i
 T?I
4 
 2
 C  C  FC (3- SV "## ## #<-c -HSM -S -`#S #HSM #S #L/# /# /(  rN   