
    tĀi                     b   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZ  ej(                  e      Z G d
 dee      Ze G d d             Zdej4                  fdZde
e   fdZ	 	 ddedede
e   de
e   def
dZdedefdZ dedefdZ!dedefdZ"y)zB
Topic selection and semantic deduplication for content pipeline.
    N)	dataclass)Enum)Path)Optional)genai)get_settings)PublishedContentRepositoryTopicTopicRepositoryTopicStatusc                       e Zd ZdZdZdZy)DupeVerdictuniquepartial_overlap	duplicateN)__name__
__module____qualname__UNIQUEPARTIAL_OVERLAP	DUPLICATE     8/var/www/html/content-pipeline/modules/topic_selector.pyr   r      s    F'OIr   r   c                   <    e Zd ZU eed<   eed<   eed<   ee   ed<   y)
DupeResultverdictreasonoverlap_scoresimilar_articlesN)r   r   r   r   __annotations__strfloatlistr   r   r   r   r      s    K3ir   r   returnc                  V    t               } t        j                  | j                        S )zInitialize Gemini client.)api_key)r   r   Clientgemini_api_key)settingss    r   get_gemini_clientr+   '   s    ~H<< 7 788r   c                  *    t        j                         S )z%Fetch highest-priority pending topic.)r   get_next_pendingr   r   r   get_next_topicr.   -   s    ++--r   topic_idstatusr   
article_idc                 2    t        j                  | |||      S )zUpdate topic status.)r   update_status)r/   r0   r   r1   s       r   mark_statusr4   2   s     ((66:NNr   topicc                    t        j                         }|s2t        j                  d       t	        t
        j                  ddg       S dj                  t        |      D cg c]4  \  }}d|dz    d|j                   d	|j                   d
|j                   6 c}}      }t        d      j                  | j                  | j                  | j                  xs d| j                   |      }	 t#               }|j$                  j'                  d|      }t)        |j*                        }t        j                  d| j                  dd  d|j,                  j.                   d|j0                  dd       |S c c}}w # t2        $ rK}	t        j5                  d|	        t	        t
        j                  dt7        |	       dg       cY d}	~	S d}	~	ww xY w)a  
    Check if topic overlaps with published content using Gemini.

    Returns a DupeResult with verdict:
    - UNIQUE: No significant overlap, proceed with generation
    - PARTIAL_OVERLAP: Some overlap, but unique angle possible
    - DUPLICATE: Too similar, skip this topic
    z%No published content to check againstzNo existing content in database        r   r   r   r    z

zARTIKEL    z	:
Titel: z
Zusammenfassung: z
Keywords: zsemantic_dedupe.txt )topic_titletopic_keywordssecondary_keywordssearch_intentexisting_contentzgemini-2.0-flash)modelcontentszDedupe check for 'N2   z...': z (overlap: z.0%)zSemantic dedupe failed: zDedupe check failed: )r	   get_allloggerinfor   r   r   join	enumeratetitlesummarymain_keywords_load_promptformattarget_keywordsr=   r>   r+   modelsgenerate_content_parse_dedupe_responsetextr   valuer   	Exceptionerrorr"   )
r5   	publishedipr?   promptclientresponseresultes
             r   semantic_dedupe_checkr^   <   s    +224I;<&&4	
 	
 {{ "),	
1 qse:aggY.A!))LYZYhYhXij	
 /077KK,, 339r))) 8 F
"$==11$ 2 
 (6 Sb!1 2&9M9M8N O--c2!5	
 3	
4  
/s34&&*3q6(3	
 	

s&   "9E8
/BE> >	GA GGGfilenamec                     t        t              j                  j                  dz  | z  }|j                         st	        d|       |j                         S )z,Load prompt template from prompts directory.promptszPrompt file not found: )r   __file__parentexistsFileNotFoundError	read_text)r_   prompt_paths     r   rL   rL   w   sO    x.''..:XEK"9+ GHH  ""r   response_textc           
         	 | j                         }d|v r'|j                  d      d   j                  d      d   }n*d|v r&|j                  d      d   j                  d      d   }t        j                  |      }|j	                  dd      j                         }|dk(  rt        j                  }n&|dk(  rt        j                  }nt        j                  }t        ||j	                  d	d
      t        |j	                  dd            |j	                  dg             S # t        j                  t        f$ r?}t        j                  d|        t        t        j                  ddg       cY d}~S d}~ww xY w)z&Parse Gemini response into DupeResult.z```jsonr9   z```r   r   r   r   r   r   r:   r   r7   r    r8   z!Failed to parse dedupe response: zFailed to parse responseN)stripsplitjsonloadsgetlowerr   r   r   r   r   r#   JSONDecodeErrorKeyErrorrE   warning)rh   rR   dataverdict_strr   r]   s         r   rQ   rQ      sS    
""$::i(+11%8;Dd]::e$Q'--e4Q7Dzz$hhy(399;+%!++G--!11G!((G88Hb)# >?!XX&8"=	
 	
   (+ 
:1#>?&&-	
 	

s   DD   E894E3-E83E8)NN)#__doc__rl   loggingdataclassesr   enumr   pathlibr   typingr   googler   config.settingsr   database.modelsr	   r
   r   r   	getLoggerr   rE   r"   r   r   r(   r+   r.   intboolr4   r^   rL   rQ   r   r   r   <module>r      s     !     (  
		8	$#t       95<< 9. . ! $	OOO SMO 	O
 
O8
 8
: 8
v#3 #3 #"
# "
* "
r   