#!/usr/bin/env python3
"""
Re-optimize internal links for all published articles.
Strips all existing internal /blog/ links and re-injects links using LLM-based anchor text.
Updates both database and live WordPress posts.
"""

import logging
import re
import sys
import time
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

import requests
from config.settings import get_settings
from database.connection import execute_query, get_cursor
from modules import html_converter, internal_link_injector

# Configure root logging once for this script: INFO level, with the timestamp,
# logger name and level prefixed to every message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)

# Markdown links whose target starts with /blog/, e.g. [text](/blog/slug/).
# Group 1 captures the anchor text, group 2 the link target.
INTERNAL_LINK_PATTERN = re.compile(r"\[([^\]]+)\]\((/blog/[^\)]+)\)")


def strip_internal_links(markdown: str) -> str:
    """Return ``markdown`` with each internal /blog/ link replaced by its anchor text."""

    def _anchor_only(match: "re.Match") -> str:
        # Keep just the text between the square brackets.
        return match.group(1)

    return INTERNAL_LINK_PATTERN.sub(_anchor_only, markdown)


def update_wp_post_content(wp_post_id: int, content_html: str) -> bool:
    """Send new HTML content for a WordPress post through the REST API.

    Returns True when the request completes without an HTTP error status,
    and False (after logging the error) when the request or status check raises.
    """
    settings = get_settings()
    base_url = settings.wp_url.rstrip("/")
    endpoint = f"{base_url}/wp-json/wp/v2/posts/{wp_post_id}"
    payload = {"content": content_html}

    try:
        requests.post(
            endpoint,
            auth=settings.wp_auth,
            json=payload,
            timeout=60,
        ).raise_for_status()
    except Exception as e:
        logger.error(f"Failed to update WP post {wp_post_id}: {e}")
        return False
    return True


def reoptimize_article(article: dict, dry_run: bool = False) -> dict:
    """
    Re-optimize internal links for a single article.

    Strips the existing internal /blog/ links from the article's markdown and
    asks ``internal_link_injector.inject_internal_links`` to inject new ones.
    Unless ``dry_run`` is set, the new markdown and its HTML rendering are then
    written to ``parketry_articles`` and the HTML is pushed to the WordPress post.

    Args:
        article: Row providing the keys ``wp_post_id``, ``id``, ``title``,
            ``word_count``, ``target_keywords``, ``content_cluster`` and
            ``content_markdown``.
        dry_run: When True, link statistics are computed (the injector is
            still called) but nothing is written to the database or WordPress.

    Returns:
        dict with the keys ``article_id``, ``wp_post_id``, ``title`` (cut to
        60 characters), ``word_count``, ``old_links``, ``new_links``,
        ``multi_word``, ``old_anchors``, ``new_anchors``, ``db_updated`` and
        ``wp_updated``.
    """
    wp_post_id = article["wp_post_id"]
    article_id = article["id"]
    title = article["title"]
    word_count = article["word_count"]
    keywords = article["target_keywords"]
    cluster = article["content_cluster"]
    content = article["content_markdown"]

    # Anchors of the links that are about to be removed (for reporting).
    old_anchors = [anchor for anchor, _ in INTERNAL_LINK_PATTERN.findall(content)]

    # Strip existing internal links
    stripped = strip_internal_links(content)

    # Re-inject with new LLM-based system
    new_content, new_link_count = internal_link_injector.inject_internal_links(
        content=stripped,
        topic_keywords=keywords,
        content_cluster=cluster,
        exclude_wp_id=wp_post_id,
        word_count=word_count,
    )

    # Anchors actually present in the re-injected markdown; an anchor counts
    # as multi-word when it splits into two or more whitespace-separated words.
    new_anchors = [anchor for anchor, _ in INTERNAL_LINK_PATTERN.findall(new_content)]
    multi_word_count = sum(1 for a in new_anchors if len(a.split()) >= 2)

    result = {
        "article_id": article_id,
        "wp_post_id": wp_post_id,
        "title": title[:60],
        "word_count": word_count,
        "old_links": len(old_anchors),
        "new_links": new_link_count,
        "multi_word": multi_word_count,
        "old_anchors": old_anchors,
        "new_anchors": new_anchors,
        "db_updated": False,
        "wp_updated": False,
    }

    if dry_run:
        return result

    # Update database
    new_html = html_converter.convert_with_cta(new_content)
    executed = False
    try:
        with get_cursor() as cursor:
            cursor.execute(
                "UPDATE parketry_articles SET content_markdown = %s, content_html = %s WHERE id = %s",
                (new_content, new_html, article_id),
            )
            executed = True
    except Exception as e:
        logger.error(f"DB update failed for article {article_id}: {e}")
    else:
        # Report success only once the cursor context has exited cleanly.
        # NOTE(review): get_cursor() presumably commits on exit; a failure
        # there must not be reported as a successful update - confirm.
        result["db_updated"] = executed

    # Update WordPress post (attempted even if the DB write failed, as before)
    result["wp_updated"] = update_wp_post_content(wp_post_id, new_html)

    return result


def update_published_content_keywords(article: dict) -> None:
    """Refresh ``main_keywords`` in parketry_published_content for one article.

    The new value comes from ``internal_link_injector.extract_linkable_keywords``,
    called with the article's markdown, its target keywords and its title.
    Errors raised while running the UPDATE are logged rather than propagated.
    """
    extract = internal_link_injector.extract_linkable_keywords
    main_keywords = extract(
        article["content_markdown"],
        article["target_keywords"],
        title=article["title"],
    )
    sql = "UPDATE parketry_published_content SET main_keywords = %s WHERE wp_post_id = %s"
    try:
        with get_cursor() as cur:
            cur.execute(sql, (main_keywords, article["wp_post_id"]))
    except Exception as e:
        logger.error(f"Failed to update keywords for wp_post_id={article['wp_post_id']}: {e}")


def _log_link_summary(result: dict) -> None:
    """Log the old/new link counts and each new anchor for one article result."""
    logger.info(
        f"  Links: {result['old_links']} -> {result['new_links']} "
        f"(multi-word: {result['multi_word']})"
    )
    for a in result["new_anchors"]:
        # MW = multi-word anchor, SW = single-word anchor.
        marker = "MW" if len(a.split()) >= 2 else "SW"
        logger.info(f"    [{marker}] {a}")


def main():
    """Re-optimize internal links for every published article and print a summary.

    Pass ``--dry-run`` on the command line to compute statistics without
    writing to the database or WordPress.

    An unexpected exception while handling one article is logged with its
    traceback and the batch continues with the next article, so one failure
    does not discard the rest of the run or the summary.

    Returns:
        Process exit code: 0 when every article was handled without an
        unexpected exception (or there was nothing to do), 1 otherwise.
    """
    dry_run = "--dry-run" in sys.argv
    if dry_run:
        logger.info("DRY RUN MODE - no changes will be made")

    # Get all published articles
    rows = execute_query("""
        SELECT a.id, a.title, a.word_count, a.wp_post_id, a.content_markdown,
               t.target_keywords, t.content_cluster
        FROM parketry_articles a
        JOIN parketry_content_topics t ON a.topic_id = t.id
        WHERE a.wp_post_id IS NOT NULL AND a.wp_post_id > 0
        ORDER BY a.published_at ASC
    """)

    if not rows:
        logger.info("No published articles found")
        return 0

    total = len(rows)
    logger.info(f"Found {total} published articles to re-optimize")

    results = []
    errors = 0  # unexpected exceptions caught while processing articles
    for i, article in enumerate(rows, 1):
        logger.info(f"\n[{i}/{total}] Processing: {article['title'][:60]}...")

        try:
            result = reoptimize_article(article, dry_run=dry_run)
        except Exception:
            errors += 1
            logger.exception("[%d/%d] Re-optimization failed; skipping article", i, total)
            result = None

        if result is not None:
            results.append(result)

            # Update published_content keywords (enriched with title phrases)
            if not dry_run:
                try:
                    update_published_content_keywords(article)
                except Exception:
                    errors += 1
                    logger.exception("[%d/%d] Keyword refresh failed", i, total)

            _log_link_summary(result)

        # Rate limit for LLM calls
        if i < total:
            time.sleep(8)

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    total_old = sum(r["old_links"] for r in results)
    total_new = sum(r["new_links"] for r in results)
    total_multi = sum(r["multi_word"] for r in results)
    print(f"Articles processed: {len(results)}")
    print(f"Total links: {total_old} -> {total_new}")
    print(f"Multi-word anchors: {total_multi}/{total_new} ({total_multi/max(total_new,1)*100:.0f}%)")
    if not dry_run:
        db_ok = sum(1 for r in results if r["db_updated"])
        wp_ok = sum(1 for r in results if r["wp_updated"])
        print(f"DB updated: {db_ok}/{len(results)}")
        print(f"WP updated: {wp_ok}/{len(results)}")
    if errors:
        print(f"Unhandled errors: {errors}")

    # An uncaught exception previously ended the run with a non-zero status;
    # keep signalling failure to the caller now that the batch continues.
    return 1 if errors else 0


# Script entry point: run main() and use its return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
