"""
Markdown to HTML conversion with CTA injection.
"""

import logging
import re
from typing import Optional

import bleach
import markdown
from markdown.extensions.toc import TocExtension

logger = logging.getLogger(__name__)

# Sanitization whitelist: these are the `tags` / `attributes` arguments that
# convert_markdown_to_html() hands to bleach.clean().
ALLOWED_TAGS = [
    # Headings
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    # Paragraphs and separators
    "p",
    "br",
    "hr",
    # Lists
    "ul",
    "ol",
    "li",
    # Inline emphasis
    "strong",
    "em",
    "b",
    "i",
    "u",
    # Links, quotes and code
    "a",
    "blockquote",
    "code",
    "pre",
    # Tables
    "table",
    "thead",
    "tbody",
    "tr",
    "th",
    "td",
    # Images and figures
    "img",
    "figure",
    "figcaption",
    # Generic containers
    "div",
    "span",
]

# Per-tag attribute whitelist; the "*" entry lists attributes accepted on
# every allowed tag.
ALLOWED_ATTRIBUTES = {
    "a": [
        "href",
        "title",
        "target",
        "rel",
    ],
    "img": [
        "src",
        "alt",
        "title",
        "width",
        "height",
    ],
    "th": [
        "colspan",
        "rowspan",
    ],
    "td": [
        "colspan",
        "rowspan",
    ],
    "div": [
        "class",
        "id",
    ],
    "span": [
        "class",
    ],
    "*": [
        "class",
        "id",
    ],
}

# Default middle-of-article CTA (uses the theme's .content-cta styling).
# inject_ctas() falls back to this snippet when the article's content cluster
# has no entry in CLUSTER_CTA_MIDDLE.
CTA_MIDDLE = """
<div class="content-cta">
    <h3>Interessiert an ANPR-Lösungen?</h3>
    <p>Erfahren Sie, wie moderne Kennzeichenerkennung Ihr Parkraummanagement revolutionieren kann.</p>
    <a href="/kontakt/" class="btn">Mehr erfahren →</a>
</div>
"""

# Cluster-specific middle CTAs — reinforce the free-model message where it
# matters. Keys are the content-cluster names that inject_ctas() receives via
# its `content_cluster` argument; values replace CTA_MIDDLE for that cluster.
CLUSTER_CTA_MIDDLE = {
    "kosten": """
<div class="content-cta">
    <h3>ANPR-Parkraummanagement für 0 €?</h3>
    <p>Parketry übernimmt Kameras, Installation, Betrieb und Wartung — komplett kostenfrei für Grundstückseigentümer.</p>
    <a href="/kontakt/" class="btn">Kostenfreies Angebot anfragen →</a>
</div>
""",
    "vergleich": """
<div class="content-cta">
    <h3>Alle Kosten, null Risiko</h3>
    <p>Während klassische Anbieter fünfstellige Investitionen verlangen, übernimmt Parketry sämtliche Kosten — von der Kamera bis zur Wartung.</p>
    <a href="/kontakt/" class="btn">Unverbindlich vergleichen →</a>
</div>
""",
    "probleme": """
<div class="content-cta">
    <h3>Parkprobleme lösen — ohne Investitionsrisiko</h3>
    <p>Parketry liefert die komplette ANPR-Lösung kostenfrei. Sie stellen nur Strom und Internet bereit.</p>
    <a href="/kontakt/" class="btn">Jetzt Lösung anfragen →</a>
</div>
""",
}

# Bottom CTA - direct conversion (uses the theme's .content-cta styling).
# inject_ctas() places it before the FAQ heading or appends it to the end of
# the article; inject_cta() uses it as the default snippet.
CTA_BOTTOM = """
<div class="content-cta">
    <h3>Bereit für den nächsten Schritt?</h3>
    <p>Lassen Sie sich unverbindlich beraten. Unsere Experten analysieren Ihre Situation und zeigen konkrete Lösungswege auf.</p>
    <a href="/kontakt/" class="btn">Jetzt Beratungstermin anfragen →</a>
</div>
"""


def convert_markdown_to_html(markdown_text: str) -> str:
    """
    Render Markdown as sanitized HTML.

    Pipeline:
    1. Drop ``[INTERNER LINK: ...]`` placeholders (editorial notes left by
       the LLM, not article content).
    2. Render with Python-Markdown using the "extra", "smarty" and TOC
       extensions.
    3. Sanitize the result with bleach against ALLOWED_TAGS /
       ALLOWED_ATTRIBUTES; disallowed tags are stripped.
    4. Post-process anchors via _process_external_links().
    """
    source_without_placeholders = re.sub(
        r'\[INTERNER LINK:[^\]]*\]', '', markdown_text
    )

    renderer = markdown.Markdown(
        extensions=[
            "extra",  # Tables, fenced code, footnotes
            "smarty",  # Smart quotes
            TocExtension(permalink=False),
        ]
    )
    rendered = renderer.convert(source_without_placeholders)

    # Sanitize before touching links so the link pass only sees clean markup.
    sanitized = bleach.clean(
        rendered,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        strip=True,
    )

    return _process_external_links(sanitized)


def convert_with_cta(markdown_text: str, content_cluster: Optional[str] = None) -> str:
    """
    Convert a Markdown article to HTML with CTAs injected.

    Steps, in order:
    - strip_first_h1(): drop the leading H1, because WordPress renders the
      title from its own title field.
    - convert_markdown_to_html(): render and sanitize.
    - inject_ctas(): add the middle CTA (cluster-specific for
      kosten/vergleich/probleme) and the bottom CTA.

    ``content_cluster`` is forwarded unchanged to inject_ctas().
    """
    body_markdown = strip_first_h1(markdown_text)
    body_html = convert_markdown_to_html(body_markdown)
    return inject_ctas(body_html, content_cluster=content_cluster)


def strip_first_h1(markdown_text: str) -> str:
    """
    Drop the first H1 heading together with everything that precedes it.

    WordPress shows the post title on its own, so an H1 inside the content
    would duplicate it. Cutting everything before the H1 also removes LLM
    preamble such as "Hier ist der Artikel...".

    After the cut, leading blank lines and horizontal-rule lines
    (``---``, ``***``, ``___``) are skipped as well. If no H1 exists, the
    input is returned unchanged.

    The scan is purely line-based: any line that begins with ``"# "`` counts
    as the H1, including one inside a fenced code block.
    """
    lines = markdown_text.split("\n")

    # A line starting with "# " can never start with "## ", so one prefix
    # test is enough to tell an H1 from deeper heading levels.
    h1_index = next(
        (idx for idx, text in enumerate(lines) if text.startswith("# ")),
        None,
    )
    if h1_index is None:
        return markdown_text

    # Advance past the H1 and any blank / horizontal-rule lines after it.
    rule_markers = ("---", "***", "___")
    start = h1_index + 1
    while start < len(lines):
        stripped = lines[start].strip()
        if stripped and stripped not in rule_markers:
            break
        start += 1

    return "\n".join(lines[start:])


def inject_ctas(html: str, content_cluster: Optional[str] = None) -> str:
    """
    Inject two CTAs: one in the middle of the content, one before the FAQ.

    - Middle CTA: only for articles with at least four ``<h2>`` headings.
      It is inserted directly before the H2 at index ``len(h2s) // 2``.
      Uses the cluster-specific snippet from CLUSTER_CTA_MIDDLE when
      ``content_cluster`` has an entry there, CTA_MIDDLE otherwise.
    - Bottom CTA: CTA_BOTTOM is inserted directly before the first ``<h2>``
      whose text matches an FAQ pattern (case-insensitive). If no such
      heading exists it is appended to the end.

    Args:
        html: Rendered article HTML.
        content_cluster: Cluster name used to pick the middle CTA; ``None``
            or an unknown name selects the default.

    Returns:
        The HTML with the CTA snippets inserted.

    NOTE(review): if the H2 chosen for the middle CTA is itself the FAQ
    heading, both CTAs end up directly after each other. Confirm whether
    that is acceptable.
    """
    h2_matches = list(re.finditer(r"<h2[^>]*>", html))

    # Select cluster-specific or default middle CTA
    middle_cta = CLUSTER_CTA_MIDDLE.get(content_cluster, CTA_MIDDLE)

    # 1. Middle CTA (only if the article is long enough).
    if len(h2_matches) >= 4:
        middle_idx = len(h2_matches) // 2
        middle_pos = h2_matches[middle_idx].start()
        html = html[:middle_pos] + middle_cta + "\n\n" + html[middle_pos:]
        logger.debug(
            "Middle CTA injected before H2 #%d (cluster: %s)",
            middle_idx + 1,
            content_cluster,
        )

    # 2. Bottom CTA. The FAQ search runs on the already-modified HTML, so
    #    no position bookkeeping for the middle insertion is needed.
    faq_patterns = [
        r"(<h2[^>]*>.*?(?:FAQ|Häufig\s+gestellte\s+Fragen).*?</h2>)",
        r"(<h2[^>]*>.*?(?:Fragen\s+und\s+Antworten).*?</h2>)",
    ]

    for pattern in faq_patterns:
        match = re.search(pattern, html, re.IGNORECASE)
        if match:
            faq_start = match.start()
            html = html[:faq_start] + CTA_BOTTOM + "\n\n" + html[faq_start:]
            logger.debug("Bottom CTA injected before FAQ section")
            break
    else:
        # Fallback when no pattern matched: append at end
        html = html + "\n\n" + CTA_BOTTOM
        logger.debug("Bottom CTA appended at end (no FAQ found)")

    return html


def inject_cta(html: str, cta_html: Optional[str] = None) -> str:
    """
    Legacy single-CTA injection, kept for backwards compatibility.

    New code should use inject_ctas(). Placement, in order of preference:
    1. directly before the first ``<h2>`` matching an FAQ pattern
       (case-insensitive),
    2. directly before the last ``<h2>`` in the document,
    3. appended to the end when there is no ``<h2>`` at all.

    ``cta_html`` defaults to CTA_BOTTOM when it is ``None``.
    """
    snippet = CTA_BOTTOM if cta_html is None else cta_html

    def insert_before(position: int) -> str:
        # Splice the snippet plus a blank line in front of `position`.
        return html[:position] + snippet + "\n\n" + html[position:]

    faq_heading_patterns = (
        r"(<h2[^>]*>.*?(?:FAQ|Häufig\s+gestellte\s+Fragen).*?</h2>)",
        r"(<h2[^>]*>.*?(?:Fragen\s+und\s+Antworten).*?</h2>)",
    )
    for faq_regex in faq_heading_patterns:
        faq_heading = re.search(faq_regex, html, re.IGNORECASE)
        if faq_heading is not None:
            result = insert_before(faq_heading.start())
            logger.debug("CTA injected before FAQ section")
            return result

    # No FAQ heading: fall back to the last H2, then to the end of the text.
    h2_tags = list(re.finditer(r"<h2[^>]*>", html))
    if not h2_tags:
        result = html + "\n\n" + snippet
        logger.warning("No suitable position for CTA found, appended at end")
        return result

    result = insert_before(h2_tags[-1].start())
    logger.debug("CTA injected before last H2 (no FAQ found)")
    return result


def _process_external_links(html: str) -> str:
    """
    Add target="_blank" and rel="noopener noreferrer" to external links.
    """
    # Match anchor tags with href
    def replace_link(match):
        tag = match.group(0)
        href = match.group(1)

        # Check if external link (not starting with / or #)
        if href and not href.startswith(("/", "#", "mailto:")):
            # Check if target already exists
            if 'target=' not in tag:
                tag = tag.replace(">", ' target="_blank" rel="noopener noreferrer">')
        return tag

    pattern = r'<a\s+href=["\']([^"\']*)["\'][^>]*>'
    return re.sub(pattern, replace_link, html)


def strip_markdown(text: str) -> str:
    """
    Strip Markdown formatting for plain-text output.

    Useful for meta descriptions. Transformations, in order:
    - fenced code blocks and inline code spans are removed entirely,
    - heading markers (``#`` to ``######``) at line starts are removed,
    - bold / italic markers are removed, keeping the inner text,
    - images are replaced by their alt text,
    - links are replaced by their link text,
    - all whitespace runs collapse to a single space and the result is
      trimmed.

    Args:
        text: Markdown source.

    Returns:
        The text without the Markdown constructs listed above.
    """
    # Remove code blocks
    text = re.sub(r"```[\s\S]*?```", "", text)
    # Remove inline code
    text = re.sub(r"`[^`]+`", "", text)
    # Remove headers
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
    # Remove bold/italic
    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
    text = re.sub(r"\*([^*]+)\*", r"\1", text)
    text = re.sub(r"__([^_]+)__", r"\1", text)
    text = re.sub(r"_([^_]+)_", r"\1", text)
    # Remove images. This must run before the link pass: the link pattern
    # also matches the "[alt](src)" part of an image and would otherwise
    # leave the leading "!" behind.
    text = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"\1", text)
    # Remove links
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    # Clean up whitespace
    text = re.sub(r"\s+", " ", text)
    return text.strip()