
    i%                         d Z ddlZddlZddlmZ ddlmZ  ej                  e      Z	e G d d             Z
g dZg dZd	Zd
ZdZdede
fdZdedefdZdedeeef   fdZy)zh
Content validation for generated articles.
Checks word count, error patterns, and blacklisted content.
    N)	dataclass)Optionalc                   T    e Zd ZU eed<   dZee   ed<   dZe	ed<   dZ
ee   ed<   d Zy)ValidationResultvalidNreasonr   
word_countwarningsc                 ,    | j                   g | _         y y )N)r
   )selfs    ;/var/www/html/content-pipeline/modules/content_validator.py__post_init__zValidationResult.__post_init__   s    == DM !    )__name__
__module____qualname__bool__annotations__r   r   strr	   intr
   listr    r   r   r   r      s2    K FHSM JHd3ir   r   )z(?i)as an aiz
(?i)als kiz(?i)i cannotz(?i)ich kann nichtz(?i)i don't have accessz(?i)i'm unable toz(?i)i am unable toz(?i)please note that iz(?i)it's important to notez((?i)es ist wichtig zu beachten, dass ichz(?i)leider kann ichz(?i)unfortunately,? iz(?i)\[insertz(?i)\[placeholderz
(?i)\[TODOz(?i)\[EXAMPLEz(?i)\[your z(?i)lorem ipsumz(?i)xxx+z(?i)tbd)z(?i)\bapcoa\bz(?i)\bcontipark\bz(?i)\bq-park\bz(?i)\bparkraum24\bz(?i)\bfairparken\bz(?i)\bfair\s+parken\bz(?i)\beasypark\bz(?i)\bpaybyphone\bz(?i)\bparkster\bz(?i)\bparknow\bz(?i)\bpark now\bz(?i)\bscheidt\s*&\s*bachmann\bz&(?i)\bpark\s*(?:&|and|und)\s*control\bi  i     contentreturnc           	      *   g }t        |       }|t        t        z
  k  rt        dd| dt         d|      S |t        t        z   kD  r|j                  d| dt         d       t        D ]  }t        j                  ||       st        j                  ||       }| t        d|j                         d	z
        |j                         d	z    }t        dd
|j                          d| d|      c S  t        D ]  }t        j                  ||       }|st        d|j                         dz
        }t        t        |       |j                         dz         }| || j!                  dd      }t        dd|j                          d| d|      c S  t        j                  d| t        j"                        s|j                  d       t        j                  d| t        j"                        st        dd|      S t        j                  d|       s|j                  d       t        t        j$                  d| t        j"                              }|dk  r|j                  d| d       |D ]  }	t&        j)                  d|	         t        d||      S )z
    Validate article content against quality requirements.

    Returns ValidationResult with valid=True if content passes all checks.
    FzWord count too low: z (minimum: ))r   r   r	   zWord count high: z (target max: r      zError pattern detected: 'z' in context: '...z...'   
 zCompetitor mention blocked: 'z	' in '...z^#\s+zNo H1 heading foundz^##\s+z.No H2 headings found - article lacks structureu&   (?i)(faq|häufig\s+gestellte\s+fragen)zNo FAQ section detected   zOnly z! H2 headings found (recommend 4+)zContent warning: T)r   r	   r
   )_count_wordsMIN_WORD_COUNTWORD_COUNT_TOLERANCEr   MAX_WORD_COUNTappendERROR_PATTERNSresearchmaxstartendgroupBLACKLIST_PATTERNSminlenreplace	MULTILINEfindallloggerwarning)
r   r
   r	   patternmatchcontextr,   r-   h2_countr6   s
             r   validater;   I   s}    H g&JN%999)*[@PPQR!
 	
 N%999
|>.9IK	

 " 99Wg&IIgw/Ec!U[[]R%78599;;KLG#25;;=/AST[S\\`a% 	 & 		'7+5;;=2-.Ec'lEIIK"$45CeC(00s;G#6u{{}oYwiW[\%  99Xw5-.99Y6C!
 	
 99>H12 2::i",,?@H!|%z)JKL  6*7)456  r   textc                 `   t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } | j                         D cg c]  }t        |      d	kD  s| }}t        |      S c c}w )
z4Count words in markdown text, excluding code blocks.z```[\s\S]*?``` z`[^`]+`zhttps?://\S+z\[([^\]]+)\]\([^)]+\)z\1z[#*_\[\]()]r!   r   )r)   subsplitr1   )r<   wwordss      r   r#   r#      s     66#R.D66*b$'D66/2t,D66*E48D66.#t,D

31A
Q3E3u: 4s   B+B+c                 P    t        |       }|j                  |j                  xs dfS )z0Quick validation check, returns (valid, reason).OK)r;   r   r   )r   results     r   quick_checkrF      s$    gF<<.$..r   )__doc__loggingr)   dataclassesr   typingr   	getLoggerr   r5   r   r(   r/   r$   r&   r%   r   r;   r   r#   tupler   rF   r   r   r   <module>rM      s   
  	 ! 			8	$   0 "  Lc L. L^s s "/ /tSy!1 /r   