"""
Content validation for generated articles.
Checks word count, error patterns, and blacklisted content.
"""

import logging
import re
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


@dataclass
class ValidationResult:
    """Outcome of validating one article.

    ``warnings`` may be omitted or passed as ``None``; in both cases it is
    normalised to a fresh empty list, so instances never share a list and
    callers can always iterate or append without a ``None`` check.
    """

    # True when the content passed every blocking check.
    valid: bool
    # Human-readable explanation of the failure; None when nothing was reported.
    reason: Optional[str] = None
    # Number of words counted in the content.
    word_count: int = 0
    # Non-blocking findings. Annotated Optional because None is the accepted
    # "not provided" value; it is replaced by [] in __post_init__.
    warnings: Optional[list[str]] = None

    def __post_init__(self):
        """Replace a missing ``warnings`` value with a new empty list."""
        if self.warnings is None:
            self.warnings = []


# Patterns indicating AI-generated placeholder or error text.
# All patterns are case-insensitive. Phrase patterns are anchored with \b so
# they only match whole words: without the anchors "as an ai" fires on
# "as an airport", "als ki" on "als Kind", "ich kann nicht" on
# "natürlich kann nicht" and "unfortunately,? i" on "Unfortunately, it".
ERROR_PATTERNS = [
    # Assistant self-references and refusals (English / German)
    r"(?i)\bas an ai\b",
    r"(?i)\bals ki\b",
    r"(?i)\bi cannot\b",
    r"(?i)\bich kann nicht\b",
    r"(?i)\bi don't have access\b",
    r"(?i)\bi'm unable to\b",
    r"(?i)\bi am unable to\b",
    r"(?i)\bplease note that i\b",
    r"(?i)it's important to note",
    r"(?i)\bes ist wichtig zu beachten, dass ich\b",
    r"(?i)\bleider kann ich\b",
    r"(?i)\bunfortunately,? i\b",
    # Unfilled template placeholders
    r"(?i)\[insert",
    r"(?i)\[placeholder",
    r"(?i)\[TODO",
    r"(?i)\[EXAMPLE",
    r"(?i)\[your ",
    # Filler text
    r"(?i)lorem ipsum",
    r"(?i)xxx+",
    r"(?i)\btbd\b",
]

# Competitor mentions - HARD BLOCK (these leak SEO to competitors)
# validate() rejects the article on the first pattern that matches.
# Every pattern is case-insensitive ((?i)) and wrapped in \b word boundaries,
# so a brand name is matched as a whole word rather than as a fragment of a
# longer word.
BLACKLIST_PATTERNS = [
    r"(?i)\bapcoa\b",
    r"(?i)\bcontipark\b",
    r"(?i)\bq-park\b",
    r"(?i)\bparkraum24\b",
    # Listed both as one word and as two whitespace-separated words
    r"(?i)\bfairparken\b",
    r"(?i)\bfair\s+parken\b",
    r"(?i)\beasypark\b",
    r"(?i)\bpaybyphone\b",
    r"(?i)\bparkster\b",
    # Listed both as one word and with a single literal space
    r"(?i)\bparknow\b",
    r"(?i)\bpark now\b",
    # Optional whitespace around the connector ("&", "and", "und")
    r"(?i)\bscheidt\s*&\s*bachmann\b",
    r"(?i)\bpark\s*(?:&|and|und)\s*control\b",
]

# Minimum and maximum word counts, as applied by validate():
# - fewer than MIN_WORD_COUNT - WORD_COUNT_TOLERANCE words rejects the article;
# - more than MAX_WORD_COUNT + WORD_COUNT_TOLERANCE words only adds a warning.
MIN_WORD_COUNT = 1200  # LLMs consistently underdeliver on word count
MAX_WORD_COUNT = 4000
WORD_COUNT_TOLERANCE = 200  # Allow variance


def _match_context(content: str, match: re.Match, radius: int) -> str:
    """Return the text within *radius* characters of *match* as a single line."""
    start = max(0, match.start() - radius)
    end = min(len(content), match.end() + radius)
    # Flatten newlines so the snippet stays readable inside a one-line reason.
    return content[start:end].replace("\n", " ")


def validate(content: str) -> ValidationResult:
    """
    Validate article content against quality requirements.

    Blocking checks, in order (the first failure is returned immediately):
      1. word count below MIN_WORD_COUNT - WORD_COUNT_TOLERANCE
      2. any ERROR_PATTERNS match (placeholder / assistant boilerplate)
      3. any BLACKLIST_PATTERNS match (competitor mention)
      4. no H2 heading at all

    Non-blocking findings are collected as warnings: word count above
    MAX_WORD_COUNT + WORD_COUNT_TOLERANCE, missing H1, missing FAQ section,
    fewer than three H2 headings. Warnings are logged and attached to the
    result only when the content passes.

    Args:
        content: Article text in markdown.

    Returns:
        ValidationResult with valid=True if content passes all checks,
        otherwise valid=False and a reason describing the first failure.
    """
    warnings: list[str] = []

    # Check word count
    word_count = _count_words(content)

    if word_count < MIN_WORD_COUNT - WORD_COUNT_TOLERANCE:
        return ValidationResult(
            valid=False,
            reason=f"Word count too low: {word_count} (minimum: {MIN_WORD_COUNT})",
            word_count=word_count,
        )

    if word_count > MAX_WORD_COUNT + WORD_COUNT_TOLERANCE:
        warnings.append(
            f"Word count high: {word_count} (target max: {MAX_WORD_COUNT})"
        )

    # Check for error patterns (one search per pattern; the match is reused)
    for pattern in ERROR_PATTERNS:
        match = re.search(pattern, content)
        if match:
            context = _match_context(content, match, 20)
            return ValidationResult(
                valid=False,
                reason=f"Error pattern detected: '{match.group()}' in context: '...{context}...'",
                word_count=word_count,
            )

    # Check for blacklisted competitor mentions - HARD BLOCK
    for pattern in BLACKLIST_PATTERNS:
        match = re.search(pattern, content)
        if match:
            context = _match_context(content, match, 30)
            return ValidationResult(
                valid=False,
                reason=f"Competitor mention blocked: '{match.group()}' in '...{context}...'",
                word_count=word_count,
            )

    # Check for required structure
    if not re.search(r"^#\s+", content, re.MULTILINE):
        warnings.append("No H1 heading found")

    # Count H2 headings once; zero is a hard failure, a low count is a warning.
    h2_count = len(re.findall(r"^##\s+", content, re.MULTILINE))
    if h2_count == 0:
        return ValidationResult(
            valid=False,
            reason="No H2 headings found - article lacks structure",
            word_count=word_count,
        )

    # Check for FAQ section
    if not re.search(r"(?i)(faq|häufig\s+gestellte\s+fragen)", content):
        warnings.append("No FAQ section detected")

    # Check for minimum heading count
    if h2_count < 3:
        warnings.append(f"Only {h2_count} H2 headings found (recommend 4+)")

    # Log warnings (lazy %-formatting: the message is built only if emitted)
    for warning in warnings:
        logger.warning("Content warning: %s", warning)

    return ValidationResult(
        valid=True,
        word_count=word_count,
        warnings=warnings,
    )


def _count_words(text: str) -> int:
    """Count words in markdown text, excluding code, URLs and bare symbols.

    Fenced and inline code and bare URLs are dropped, links are reduced to
    their label, and markdown formatting characters are treated as spaces.
    A whitespace-separated token counts as a word only if it contains at
    least one letter or digit, so list bullets ("-"), dashes, "&", table
    pipes and horizontal rules do not inflate the count.

    Args:
        text: Markdown source.

    Returns:
        Number of words found.
    """
    # Remove code blocks
    text = re.sub(r"```[\s\S]*?```", "", text)
    # Remove inline code
    text = re.sub(r"`[^`]+`", "", text)
    # Reduce [label](target) links to their label. This must run before URL
    # stripping: the URL pattern is greedy on non-space characters and would
    # otherwise swallow the closing ")" plus any text glued to it.
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    # Remove remaining bare URLs
    text = re.sub(r"https?://\S+", "", text)
    # Remove markdown formatting characters
    text = re.sub(r"[#*_\[\]()]", " ", text)
    # Count tokens that contain at least one alphanumeric character
    return sum(
        1 for token in text.split() if any(ch.isalnum() for ch in token)
    )


def quick_check(content: str) -> tuple[bool, str]:
    """Run the full validation and condense it to a ``(valid, reason)`` pair.

    The second element is the validation reason, or "OK" when no reason
    was reported.
    """
    outcome = validate(content)
    message = outcome.reason if outcome.reason else "OK"
    return (outcome.valid, message)