"""
Internal link injection for SEO optimization.
Finds related published articles and injects markdown links.
Uses LLM-based anchor text suggestion via Gemini Flash.
"""

import json
import logging
import re
import time
import uuid
from pathlib import Path
from typing import Optional

from database.models import PublishedContentRepository

logger = logging.getLogger(__name__)

# Dynamic link count settings
MAX_LINKS = 8          # Upper ceiling
MIN_LINKS = 3          # Floor
LINKS_PER_WORDS = 450  # One link per N words

# LLM settings — model fallback chain (best to most stable)
LLM_MODELS = ["gemini-3-flash-preview", "gemini-2.5-flash"]
LLM_MAX_RETRIES = 3
LLM_INITIAL_DELAY = 5  # seconds, doubles each retry (exponential backoff)
MAX_CONTENT_CHARS = 15000  # Trim article content sent to LLM (covers ~2500 word articles)

# Pattern to match existing markdown links: [text](url)
# NOTE(review): the URL part stops at the first ')' — links whose URL itself
# contains a ')' (e.g. some Wikipedia URLs) are only partially matched; confirm acceptable.
LINK_PATTERN = re.compile(r"\[([^\]]+)\]\([^\)]+\)")

# Pattern to match markdown headings: # Heading or ## Heading etc.
HEADING_PATTERN = re.compile(r"^(#{1,6}\s+.*)$", re.MULTILINE)


def _calculate_max_links(word_count: int) -> int:
    """Derive the link budget for an article of *word_count* words.

    One link per LINKS_PER_WORDS words, clamped to [MIN_LINKS, MAX_LINKS].
    """
    proposed = word_count // LINKS_PER_WORDS
    if proposed < MIN_LINKS:
        return MIN_LINKS
    if proposed > MAX_LINKS:
        return MAX_LINKS
    return proposed


def _load_prompt(filename: str) -> str:
    """Load prompt template from prompts directory."""
    prompt_path = Path(__file__).parent.parent / "prompts" / filename
    if not prompt_path.exists():
        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
    return prompt_path.read_text()


def _suggest_anchors_with_llm(
    content: str,
    targets: list[dict],
    max_links: int,
) -> list[dict]:
    """
    Use Gemini Flash to find natural anchor text phrases in article content.

    Tries each model in LLM_MODELS in order; rate-limit errors (429 /
    RESOURCE_EXHAUSTED) are retried with exponential backoff before falling
    back to the next model. Every failure mode degrades to an empty result
    instead of raising, so callers can proceed without links.

    Args:
        content: Article markdown content
        targets: List of link target dicts (slug, title, summary)
        max_links: Maximum links to suggest

    Returns:
        List of validated suggestions: [{"anchor_text": ..., "target_slug": ...}]
    """
    try:
        from google import genai  # noqa: F401 — verifies the SDK is importable
        from google.genai import types
        from modules.article_generator import get_gemini_client

        client = get_gemini_client()
    except Exception as e:
        logger.warning(f"Could not initialize Gemini client for link suggestion: {e}")
        return []

    # Build target descriptions for prompt
    target_lines = []
    target_slugs = set()
    for t in targets:
        summary = t.get("summary", "") or ""
        target_lines.append(f"- Slug: {t['slug']} | Titel: {t['title']} | Thema: {summary[:150]}")
        target_slugs.add(t["slug"])

    targets_text = "\n".join(target_lines)

    # Trim content to avoid excessive token usage
    trimmed_content = content[:MAX_CONTENT_CHARS]
    if len(content) > MAX_CONTENT_CHARS:
        trimmed_content += "\n[... Artikel gekürzt ...]"

    try:
        prompt_template = _load_prompt("internal_links.txt")
        prompt = prompt_template.format(
            max_links=max_links,
            targets=targets_text,
            article_content=trimmed_content,
        )
    except Exception as e:
        logger.warning(f"Could not load internal links prompt: {e}")
        return []

    # Call LLM with model fallback and exponential backoff retries
    response = None
    for model in LLM_MODELS:
        delay = LLM_INITIAL_DELAY
        for attempt in range(LLM_MAX_RETRIES):
            try:
                response = client.models.generate_content(
                    model=model,
                    contents=prompt,
                    config=types.GenerateContentConfig(
                        temperature=0.3,
                    ),
                )
                break  # Success — leave the retry loop
            except Exception as e:
                rate_limited = "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e)
                if rate_limited and attempt < LLM_MAX_RETRIES - 1:
                    logger.warning(f"Rate limited on {model} (attempt {attempt + 1}/{LLM_MAX_RETRIES}), waiting {delay}s")
                    time.sleep(delay)
                    delay *= 2  # Exponential backoff
                    continue
                if rate_limited:
                    logger.warning(f"Model {model} exhausted retries, falling back")
                else:
                    logger.warning(f"Model {model} failed: {e}")
                break  # Non-retryable error or retries exhausted — try next model
        if response is not None:
            break  # This model produced a response — no fallback needed

    if not response or not response.text:
        logger.warning("All LLM models failed for anchor suggestion")
        return []

    # Parse JSON response
    raw_text = response.text.strip()
    # Strip markdown code fences if present
    if raw_text.startswith("```"):
        raw_text = re.sub(r"^```(?:json)?\s*\n?", "", raw_text)
        raw_text = re.sub(r"\n?```\s*$", "", raw_text)

    try:
        suggestions = json.loads(raw_text)
    except json.JSONDecodeError:
        logger.warning(f"LLM returned invalid JSON for anchors: {raw_text[:200]}")
        return []

    if not isinstance(suggestions, list):
        logger.warning("LLM response is not a JSON array")
        return []

    # Validate: each suggestion must have anchor_text that exists verbatim in the content
    # and target_slug that matches one of our targets
    validated = []
    for s in suggestions:
        if not isinstance(s, dict):
            continue
        anchor = s.get("anchor_text")
        slug = s.get("target_slug")
        # Guard against malformed items: a non-string field would previously
        # raise AttributeError on .strip() and abort the whole suggestion pass.
        if not isinstance(anchor, str) or not isinstance(slug, str):
            continue
        anchor = anchor.strip()
        slug = slug.strip()

        if not anchor or not slug:
            continue
        if slug not in target_slugs:
            logger.debug(f"LLM suggested unknown slug '{slug}', skipping")
            continue
        anchor_words = len(anchor.split())
        if anchor_words < 2 or anchor_words > 6:
            logger.debug(f"LLM anchor '{anchor}' has wrong word count, skipping")
            continue

        # Verify the phrase actually exists in the article (case-insensitive word boundary)
        pattern = rf"\b{re.escape(anchor)}\b"
        if not re.search(pattern, content, re.IGNORECASE):
            logger.debug(f"LLM anchor '{anchor}' not found verbatim in article, skipping")
            continue

        validated.append({"anchor_text": anchor, "target_slug": slug})

    logger.info(f"LLM suggested {len(suggestions)} anchors, validated {len(validated)}")
    return validated


def _inject_link_at_position(
    content: str,
    anchor_text: str,
    url: str,
    placeholders: dict[str, str],
) -> tuple[str, bool]:
    """
    Find and replace anchor_text with a markdown link in content.

    Searches for the anchor text using word-boundary matching, preferring matches
    in the latter half of the article for better distribution.

    Returns:
        Tuple of (modified content, success boolean)
    """
    pattern = rf"\b({re.escape(anchor_text)})\b"
    matches = list(re.finditer(pattern, content, re.IGNORECASE))

    if not matches:
        return content, False

    # Prefer match in latter 2/3 of article for better distribution
    content_len = len(content)
    latter_matches = [m for m in matches if m.start() > content_len // 3]
    match = latter_matches[0] if latter_matches else matches[0]

    # Create link and protect with placeholder
    link = f"[{match.group(1)}]({url})"
    placeholder = f"__PROTECTED_{uuid.uuid4().hex[:8]}__"
    placeholders[placeholder] = link

    content = (
        content[: match.start()]
        + placeholder
        + content[match.end() :]
    )
    return content, True


def extract_linkable_keywords(
    content: str,
    topic_keywords: str,
    title: str = "",
) -> str:
    """
    Extract keywords suitable for internal linking from article content.

    Starts from the comma-separated topic keywords and enriches them with
    up to three 2-3 word phrases taken from the title. Phrases consisting
    mostly of German stopwords and duplicates of existing keywords are
    rejected.

    Args:
        content: Article markdown content
        topic_keywords: Original topic keywords (comma-separated)
        title: Article title (for extracting multi-word phrases)

    Returns:
        Enhanced comma-separated keywords including extracted terms
    """
    keywords = []
    seen = set()
    for raw in topic_keywords.split(","):
        kw = raw.strip()
        if kw:
            keywords.append(kw)
            seen.add(kw.lower())

    if title:
        stopwords = {"und", "der", "die", "das", "für", "im", "in", "von", "mit", "zu", "am", "auf", "ein", "eine", "den", "dem", "des"}
        phrases_added = 0
        # Split on punctuation so phrases never span clause boundaries.
        for segment in re.split(r"[?!:–—|/]", title):
            words = segment.strip().split()
            # Prefer longer phrases: slide 3-word windows before 2-word ones.
            for size in (3, 2):
                for start in range(len(words) - size + 1):
                    if phrases_added >= 3:
                        break
                    candidate = " ".join(words[start : start + size])
                    lowered = candidate.lower()
                    meaningful = [w for w in lowered.split() if w not in stopwords]
                    # Allow at most one stopword per phrase; skip duplicates.
                    if len(meaningful) >= max(1, size - 1) and lowered not in seen:
                        keywords.append(candidate)
                        seen.add(lowered)
                        phrases_added += 1

    return ", ".join(keywords)


def _protect_existing_links(content: str) -> tuple[str, dict[str, str]]:
    """
    Swap existing markdown links and headings for unique placeholder tokens.

    Protected spans cannot be matched by later anchor injection; nesting a
    link inside an existing link or a heading is bad for SEO. Use the
    returned mapping to undo the substitution afterwards.

    Returns:
        Tuple of (masked content, placeholder -> original text mapping)
    """
    mapping: dict[str, str] = {}

    def _stash(match: re.Match) -> str:
        token = f"__PROTECTED_{uuid.uuid4().hex[:8]}__"
        mapping[token] = match.group(0)
        return token

    # Mask existing links first, then headings.
    masked = LINK_PATTERN.sub(_stash, content)
    masked = HEADING_PATTERN.sub(_stash, masked)

    return masked, mapping


def _restore_links(content: str, placeholders: dict[str, str]) -> str:
    """Restore original links from placeholders."""
    for placeholder, original in placeholders.items():
        content = content.replace(placeholder, original)
    return content


def inject_internal_links(
    content: str,
    topic_keywords: str,
    content_cluster: Optional[str] = None,
    exclude_wp_id: Optional[int] = None,
    word_count: int = 0,
) -> tuple[str, int]:
    """
    Inject internal links to related published articles.

    Workflow: fetch candidate target articles from the repository, mask
    existing links and headings, ask Gemini Flash for natural multi-word
    anchor phrases, inject at most one link per target article and per
    anchor phrase, then unmask the protected spans.

    Args:
        content: Markdown content to inject links into
        topic_keywords: Comma-separated keywords for the current topic
        content_cluster: Content cluster for relevance filtering
        exclude_wp_id: WordPress post ID to exclude (current article)
        word_count: Article word count for dynamic link calculation

    Returns:
        Tuple of (modified content, number of links injected)
    """
    if not [k for k in topic_keywords.split(",") if k.strip()]:
        logger.debug("No keywords provided for link injection")
        return content, 0

    targets = PublishedContentRepository.get_link_targets(
        cluster=content_cluster,
        exclude_wp_id=exclude_wp_id,
    )
    if not targets:
        logger.debug("No link targets found")
        return content, 0

    link_budget = _calculate_max_links(word_count) if word_count > 0 else MIN_LINKS

    # Mask existing links and headings so injection cannot touch them.
    working, placeholders = _protect_existing_links(content)

    by_slug = {t["slug"]: t for t in targets}
    linked_post_ids = set()
    spent_anchors = set()
    injected = 0

    # --- LLM-based anchor suggestion ---
    for suggestion in _suggest_anchors_with_llm(content, targets, link_budget):
        if injected >= link_budget:
            break

        slug = suggestion["target_slug"]
        anchor = suggestion["anchor_text"]
        target = by_slug.get(slug)

        # At most one link per target article and per anchor phrase.
        if not target or target["wp_post_id"] in linked_post_ids:
            continue
        if anchor.lower() in spent_anchors:
            continue

        url = f"/blog/{slug}/"
        working, placed = _inject_link_at_position(working, anchor, url, placeholders)

        if placed:
            injected += 1
            linked_post_ids.add(target["wp_post_id"])
            spent_anchors.add(anchor.lower())
            logger.debug(f"Linked '{anchor}' -> {url}")

    # Unmask protected spans.
    result = _restore_links(working, placeholders)

    logger.info(f"Injected {injected} internal links")
    return result, injected
