
    Gіi2                        d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ  ej                  e      ZdZdZdZd	d
gZdZdZdZ ej,                  d      Z ej,                  dej0                        ZdedefdZdedefdZdedee   dedee   fdZ dedededeeef   de!ee"f   f
dZ#	 d$dedededefdZ$dede!eeeef   f   fdZ%dedeeef   defd Z&	 	 	 d%deded!e	e   d"e	e   dede!eef   fd#Z'y)&z
Internal link injection for SEO optimization.
Finds related published articles and injects markdown links.
Uses LLM-based anchor text suggestion via Gemini Flash.
    N)Path)Optional)PublishedContentRepository      i  zgemini-3-flash-previewzgemini-2.5-flash   i:  z\[([^\]]+)\]\([^\)]+\)z^(#{1,6}\s+.*)$
word_countreturnc                 L    t        t        t        t        | t        z              S )z5Calculate dynamic link count based on article length.)max	MIN_LINKSmin	MAX_LINKSLINKS_PER_WORDS)r	   s    @/var/www/html/content-pipeline/modules/internal_link_injector.py_calculate_max_linksr   %   s    y#i)FGHH    filenamec                     t        t              j                  j                  dz  | z  }|j                         st	        d|       |j                         S )z,Load prompt template from prompts directory.promptszPrompt file not found: )r   __file__parentexistsFileNotFoundError	read_text)r   prompt_paths     r   _load_promptr   *   sO    x.''..:XEK"9+ GHH  ""r   contenttargets	max_linksc                 d	   	 ddl m} ddlm} ddlm}  |       }g }t               }	|D ]O  }
|
j                  dd      xs d}|j                  d	|
d
    d|
d    d|dd         |	j                  |
d
          Q dj                  |      }| dt         }t        |       t        kD  r|dz  }	 t!        d      }|j#                  |||      }d}t$        D ]Q  }t&        }t)        t*              D ]2  }	 |j,                  j/                  |||j1                  d            } n N|Q n |r|j8                  st        j                  d#       g S |j8                  j;                         }|j=                  d$      r.t?        j@                  d%d|      }t?        j@                  d&d|      }	 tC        jD                  |      }tI        |tJ              st        j                  d)       g S g }| jM                         }|D ]6  }tI        |tN              s|j                  d*d      j;                         }|j                  d+d      j;                         }|r|sZ||	vrt        jQ                  d,| d-       xt        |jS                               dk  st        |jS                               d.kD  rt        jQ                  d/| d0       d1t?        jT                  |       d1}t?        jV                  || t>        jX                        st        jQ                  d/| d2       #|j                  ||d3       9 t        j[                  d4t        |       d5t        |              |S # t        $ r$}t        j                  d|        g cY d}~S d}~ww xY w# t        $ r$}t        j                  d|        g cY d}~S d}~ww xY w# t        $ r}dt3        |      v sdt3        |      v ro|t*        dz
  k  rIt        j                  d| d|dz    dt*         d| d	       t5        j6                  |       |dz  }Y d}~Lt        j                  d | d!       nt        j                  d | d"|        Y d}~ Rd}~ww xY w# tB        jF                  $ r  t        j                  d'|dd(         g cY S w xY w)6aU  
    Use Gemini Flash to find natural anchor text phrases in article content.

    Args:
        content: Article markdown content
        targets: List of link target dicts (slug, title, summary)
        max_links: Maximum links to suggest

    Returns:
        List of validated suggestions: [{"anchor_text": ..., "target_slug": ...}]
    r   )genai)types)get_gemini_clientz8Could not initialize Gemini client for link suggestion: Nsummary z- Slug: slugz
 | Titel: titlez
 | Thema:    
u   
[... Artikel gekürzt ...]zinternal_links.txt)r    r   article_contentz&Could not load internal links prompt: g333333?)temperature)modelcontentsconfig429RESOURCE_EXHAUSTED   zRate limited on z
 (attempt /z), waiting s   zModel z  exhausted retries, falling backz	 failed: z+All LLM models failed for anchor suggestionz```z^```(?:json)?\s*\n?z
\n?```\s*$z'LLM returned invalid JSON for anchors:    z LLM response is not a JSON arrayanchor_texttarget_slugzLLM suggested unknown slug 'z', skipping   zLLM anchor 'z ' has wrong word count, skippingz\bz)' not found verbatim in article, skipping)r7   r8   zLLM suggested z anchors, validated ).googler"   google.genair#   modules.article_generatorr$   	ExceptionloggerwarningsetgetappendaddjoinMAX_CONTENT_CHARSlenr   format
LLM_MODELSLLM_INITIAL_DELAYrangeLLM_MAX_RETRIESmodelsgenerate_contentGenerateContentConfigstrtimesleeptextstrip
startswithresubjsonloadsJSONDecodeError
isinstancelistlowerdictdebugsplitescapesearch
IGNORECASEinfo)r   r   r    r"   r#   r$   clientetarget_linestarget_slugstr%   targets_texttrimmed_contentprompt_templatepromptresponser-   delayattemptraw_textsuggestions	validatedcontent_lowerr4   anchorr'   patterns                               r   _suggest_anchors_with_llmrv   2   s     &?"$ L5L $%%	2&,"hqykAgJ<zRYZ^[^R_Q`ab6#$
 99\*L 001O
7|''99	&';< '' + ( 
 H !_- 	G!==99# 66$' 7  :  	, 58 8==DE	 }}""$H5!660"h?66-X6jj*
 k4(9:	 IMMOM G!T"}b)//1uu]B'--/T|#LL7v[IJv||~"c&,,.&9A&=LL<x/OPQ 		&)*"-yy'2==9LL<x/XYZEF+G. KK.[!1 22Fs9~FVWX[  QRSQTUV	4  ?sCD	$  
CF?&:c!f&D1!44)9%
7UV;-WXYhXiituzt{{|'}~

5)
 NNVE72R#STNNVE7)A3#?@
:  @$3@PQR	sl   M  -N 0.O 'Q<  	N)NNN	N=N82N=8N= 	Q9	A)Q485Q44Q9<0R/.R/r7   urlplaceholdersc                     dt        j                  |       d}t        t        j                  || t         j                              }|s| dfS t        |       }|D cg c]  }|j                         |dz  kD  s| }}|r|d   n|d   }	d|	j                  d       d| d	}
d
t        j                         j                  dd  d}|
||<   | d|	j                          |z   | |	j                         d z   } | dfS c c}w )a  
    Find and replace anchor_text with a markdown link in content.

    Searches for the anchor text using word-boundary matching, preferring matches
    in the latter half of the article for better distribution.

    Returns:
        Tuple of (modified content, success boolean)
    z\b(z)\bFr   r   [r2   z]()__PROTECTED_Nr   __T)rU   r`   r[   finditerrb   rF   startgroupuuiduuid4hexend)r   r7   rw   rx   ru   matchescontent_lenmlatter_matchesmatchlinkplaceholders               r   _inject_link_at_positionr      s    RYY{+,C0G2;;w?@G~ g,K!(IAAGGIq8H,HaINI!/N1WQZE u{{1~bQ'D !1!1"1!5 6b9K $L 	%++- 
	
%))+-
 	! 
 D= Js   C;9C;topic_keywordsr(   c           	      2   |j                  d      D cg c]#  }|j                         s|j                         % }}|D ch c]  }|j                          }}|rt        j                   d|      }h d}d}|D ]  }	|	j                         j                         }
dD ]  }t	        t        |
      |z
  dz         D ]  }|dk\  r 'dj                  |
|||z          }|j                         j                         }|D cg c]	  }||vs| }}t        |      t        d|dz
        k\  so|j                         |vs|j                  |       |j                  |j                                |dz  }   d	j                  |      S c c}w c c}w c c}w )
a  
    Extract keywords suitable for internal linking from article content.

    Combines topic keywords with multi-word phrases from the title.

    Args:
        content: Article markdown content
        topic_keywords: Original topic keywords (comma-separated)
        title: Article title (for extracting multi-word phrases)

    Returns:
        Enhanced comma-separated keywords including extracted terms
    ,u   [?!:–—|/]>      füramiminzuaufdasdemdenderdesdieeinmitundvoneiner   )r   r5   r2   r    z, )
r_   rS   r\   rU   rJ   rF   rD   r   rB   rC   )r   r   r(   kkeywordskeywords_lowersegments	stopwordsadded_from_titlesegmentsegment_wordsniphrasephrase_wordswnon_stops                    r   extract_linkable_keywordsr      s   & $2#7#7#<Ja		JHJ)12Aaggi2N2 88,e4 J	 	.G#MMO113M 
.s=1A59: 	.A'1, XXmAA&>?F#)<<>#7#7#9L+7Na1I;MNHN8}Aq1u5&,,.P^:^ /&**6<<>:(A-(	.
.	. 99X/ K2   Os   F
F
F?	F
	F
c                 r    i fd}t         j                  ||       }t        j                  ||      }|fS )zOReplace existing markdown links and headings with placeholders to protect them.c                 z    dt        j                         j                  d d  d}| j                  d      |<   |S )Nr|   r   r}   r   )r   r   r   r   )r   r   rx   s     r   replace_matchz._protect_existing_links.<locals>.replace_match  s=    $TZZ\%5%5bq%9$:"=$)KKN[!r   )LINK_PATTERNrV   HEADING_PATTERN)r   r   	protectedrx   s      @r   _protect_existing_linksr     s>    L   8I##M9=Il""r   c                 Z    |j                         D ]  \  }}| j                  ||      }  | S )z)Restore original links from placeholders.)itemsreplace)r   rx   r   originals       r   _restore_linksr     s4    !-!3!3!5 9X//+x89Nr   content_clusterexclude_wp_idc                    |j                  d      D cg c]1  }|j                         s|j                         j                         3 }}|st        j	                  d       | dfS t        j                  ||      }|st        j	                  d       | dfS |dkD  rt        |      nt        }t        |       \  }	}
d}t               }t               }|	}t        | ||      }|D ci c]  }|d   |
 }}|D ]  }||k\  r n|d   }|d   }|j                  |      }|r|d	   |v r/|j                         |v rBd
| d}t        ||||
      \  }}|s\|dz  }|j                  |d	          |j                  |j                                t        j	                  d| d|         t        ||
      }t        j!                  d| d       ||fS c c}w c c}w )aS  
    Inject internal links to related published articles.

    Uses Gemini Flash to suggest natural multi-word anchor phrases that
    semantically match the target articles.

    Args:
        content: Markdown content to inject links into
        topic_keywords: Comma-separated keywords for the current topic
        content_cluster: Content cluster for relevance filtering
        exclude_wp_id: WordPress post ID to exclude (current article)
        word_count: Article word count for dynamic link calculation

    Returns:
        Tuple of (modified content, number of links injected)
    r   z'No keywords provided for link injectionr   )clusterr   zNo link targets foundr'   r8   r7   
wp_post_idz/blog/r3   r2   zLinked 'z' -> z	Injected z internal links)r_   rS   r\   r>   r^   r   get_link_targetsr   r   r   r@   rv   rA   r   rC   r   rc   )r   r   r   r   r	   r   r   r   r    protected_contentrx   links_injected
linked_idsused_anchorsmodified_contentllm_suggestionsrh   slug_to_target
suggestionr'   r7   targetrw   successs                           r   inject_internal_linksr   $  s   . ,:+?+?+DRa		!RHR>?z(99#G
 ,-z4>N$Z0	I '>g&F#|NJ5L( 0)LO -44qail4N4% =
Y&-( /##D)-;,.tfA$<k3%
!' aNNN6,/0[..01LL8K=cU;<-=2 &&6E
KK)N+?;<^++u S8 5s   G G&G)r&   )NNr   )(__doc__rW   loggingrU   rP   r   pathlibr   typingr   database.modelsr   	getLogger__name__r>   r   r   r   rH   rK   rI   rE   compiler   	MULTILINEr   intr   rO   r   r[   r]   rv   tupleboolr   r   r   r   r    r   r   <module>r      s     	     6			8	$ 		 '(:;
   rzz34 "**/>IS IS I
#3 #3 #CC$ZC C 
$Z	CL$$$ 
$ sCx.	$
 39$T *** * 		*Z#S #U3S#X3F-G #"C tCH~ #  &*#'Q,Q,Q, c]Q, C=	Q,
 Q, 38_Q,r   