#!/usr/bin/env python3
"""
Enhanced placeholder scanner for HTML template files.
- Scans for ###PLACEHOLDER### values
- Checks for matched ###IF_xx### and ###END_IF_xx### pairs
- Identifies placeholders that appear only in non-English files
"""

import os
import re
import argparse
from pathlib import Path
from collections import defaultdict, Counter
import sys

# Configuration
# Template folders and the language sub-folders expected inside each of them.
DIRECTORIES = ["cc", "ww"]
LANGUAGES = ["en", "es", "fr", "de", "it", "pt"]

# Placeholders that are legitimately missing from the English templates
# (product 200307 is for non-English users) and must not be reported.
VALID_NON_ENGLISH_PLACEHOLDERS = {
    "PRICEYR_200307",
    "PRICEMONTH_200307",
    "FIRSTPRICEYR_200307",
}

def extract_placeholders(content):
    """Return the unique ###NAME### placeholder names in content, sorted.

    A name starts with an upper-case letter or underscore and continues with
    upper-case letters, digits or underscores. The surrounding ``###``
    delimiters are not part of the returned names.
    """
    token_re = re.compile(r'###([A-Z_][A-Z0-9_]*)###')
    unique_names = {token.group(1) for token in token_re.finditer(content)}
    return sorted(unique_names)

def extract_if_blocks(content):
    """Check that ###IF_xx### and ###END_IF_xx### markers are properly paired.

    Markers are processed in the order they occur in the text, using a stack
    of currently open IF blocks, so blocks must be closed in reverse order of
    opening.

    Args:
        content: Template text to validate.

    Returns:
        A list of error messages (empty when everything matches):

        * ``Unmatched END_IF_x at line N`` - END_IF with no IF open;
        * ``Mismatched END_IF_x at line N, expected END_IF_y`` - END_IF that
          does not close the innermost open IF (the open IF stays open);
        * ``Unclosed IF_x (opened at line N)`` - IF never closed.

        Line numbers are 1-based.
    """
    if_pattern = r'###(IF_[A-Z0-9_]+)###'
    end_if_pattern = r'###(END_IF_[A-Z0-9_]+)###'

    # Gather both marker kinds and order them by their offset in the text.
    markers = [(m.start(), 'IF', m.group(1)) for m in re.finditer(if_pattern, content)]
    markers += [(m.start(), 'END_IF', m.group(1)) for m in re.finditer(end_if_pattern, content)]
    markers.sort(key=lambda marker: marker[0])

    errors = []
    open_blocks = []  # stack of (if_name, line_num) for IFs not yet closed

    # Markers are visited in increasing offset, so the line number can be
    # advanced incrementally instead of re-counting from the start each time.
    line_num = 1
    counted_up_to = 0

    for pos, kind, name in markers:
        line_num += content.count('\n', counted_up_to, pos)
        counted_up_to = pos

        if kind == 'IF':
            open_blocks.append((name, line_num))
            continue

        # Strip only the leading "END_" so that names which themselves
        # contain "END_IF_" further along are not altered.
        expected_if = name[len('END_'):]
        if not open_blocks:
            errors.append(f"Unmatched {name} at line {line_num}")
        elif open_blocks[-1][0] != expected_if:
            errors.append(f"Mismatched {name} at line {line_num}, expected END_{open_blocks[-1][0]}")
        else:
            open_blocks.pop()

    # Anything still on the stack was never closed.
    for unclosed, opened_at in open_blocks:
        errors.append(f"Unclosed {unclosed} (opened at line {opened_at})")

    return errors

def scan_file(file_path):
    """Scan a single HTML file for placeholders and IF/END_IF blocks.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A ``(placeholders, if_errors)`` tuple: the sorted unique placeholder
        names and the list of IF/END_IF error messages. When the file cannot
        be opened or decoded, the error is printed and
        ``([], ["File read error: ..."])`` is returned instead, so one bad
        file does not abort a larger scan.
    """
    # Only the read itself is guarded: a failure while analysing the content
    # would be a programming error and must not be reported as a read error.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading {file_path}: {e}")
        return [], [f"File read error: {e}"]

    placeholders = extract_placeholders(content)
    if_errors = extract_if_blocks(content)
    return placeholders, if_errors

def scan_directory(dir_path, lang):
    """Scan every ``*.html`` file directly inside dir_path.

    Sub-directories are not descended into. Files are processed in sorted
    path order so that the result (and any report built from it) is the same
    on every run.

    Args:
        dir_path: Directory to scan. A path that does not exist yields an
            empty result.
        lang: Language code recorded for every file found.

    Returns:
        A dict mapping each file's path (with backslashes replaced by
        forward slashes) to a dict with the keys ``'placeholders'``,
        ``'if_errors'`` and ``'language'``.
    """
    results = {}
    if not os.path.exists(dir_path):
        return results

    # Path.glob() makes no ordering promise; sort for deterministic output.
    for file_path in sorted(Path(dir_path).glob('*.html')):
        # Use forward slashes on every platform so keys look the same everywhere.
        normalized_path = str(file_path).replace('\\', '/')
        placeholders, if_errors = scan_file(file_path)

        results[normalized_path] = {
            'placeholders': placeholders,
            'if_errors': if_errors,
            'language': lang
        }

    return results

def _print_banner(title):
    """Print a section title framed by two 80-character '=' rules."""
    print(f"\n{'=' * 80}")
    print(title)
    print(f"{'=' * 80}\n")


def main():
    """Parse the command line, scan the templates and print a report.

    With ``--file PATH`` only that file is checked: its IF/END_IF errors are
    printed, plus the list of placeholders when ``--verbose`` is given.

    Otherwise every ``<directory>/<language>`` combination from DIRECTORIES
    and LANGUAGES is scanned and the report contains:

    * the IF/END_IF validation errors of each file;
    * placeholders that never occur in an English ('en') template and are
      not listed in VALID_NON_ENGLISH_PLACEHOLDERS;
    * with ``--verbose`` only: the 20 most widely used placeholders and the
      placeholders of each English template.

    Returns:
        None. NOTE(review): the process therefore exits with status 0 even
        when errors were reported - confirm callers do not need a failing
        exit code.
    """
    parser = argparse.ArgumentParser(description='Scan HTML template files for placeholders and IF/END_IF blocks')
    parser.add_argument('--verbose', action='store_true', help='Show all placeholder analysis (not just errors)')
    parser.add_argument('--file', type=str, help='Scan only a specific file')
    args = parser.parse_args()

    if args.file:
        # Single-file mode: no cross-language comparison is possible.
        if not os.path.exists(args.file):
            print(f"Error: File {args.file} not found")
            return

        print(f"Scanning single file: {args.file}\n")
        placeholders, if_errors = scan_file(args.file)

        if if_errors:
            print("IF/END_IF BLOCK ERRORS:")
            print("-" * 40)
            for error in if_errors:
                print(f"  ❌ {error}")
        else:
            print("✅ All IF/END_IF blocks are properly matched!")

        if args.verbose:
            print(f"\nPlaceholders found ({len(placeholders)}):")
            print("-" * 40)
            for placeholder in placeholders:
                print(f"  ###{placeholder}###")

        return

    print("Scanning HTML template files for placeholders and IF/END_IF blocks...\n")

    all_placeholders = defaultdict(list)     # placeholder -> files that use it
    english_names = set()                    # placeholders seen in any 'en' file
    english_placeholders = defaultdict(set)  # English file name -> its placeholders
    if_errors_found = {}                     # file path -> IF/END_IF error messages

    # Scan all directories and languages
    for directory in DIRECTORIES:
        for lang in LANGUAGES:
            scan_dir = os.path.join(directory, lang)
            # NOTE(review): progress lines are shown only in non-verbose
            # mode - confirm this is intended.
            if not args.verbose:
                print(f"Scanning: {scan_dir}")

            for file_path, data in scan_directory(scan_dir, lang).items():
                for placeholder in data['placeholders']:
                    all_placeholders[placeholder].append(file_path)

                # Decide "English" from the language being scanned rather
                # than from how the path happens to be spelled.
                if lang == 'en':
                    english_names.update(data['placeholders'])
                    # Same-named files from different directories are merged.
                    filename = os.path.basename(file_path)
                    english_placeholders[filename].update(data['placeholders'])

                if data['if_errors']:
                    if_errors_found[file_path] = data['if_errors']

    _print_banner("IF/END_IF BLOCK VALIDATION")

    if if_errors_found:
        print("ERRORS FOUND:")
        print("-" * 60)
        for file_path, errors in if_errors_found.items():
            print(f"\n{file_path}:")
            for error in errors:
                print(f"  ❌ {error}")
    else:
        print("✅ All IF/END_IF blocks are properly matched!")

    # Placeholders used somewhere, but in no English template, are suspicious
    # unless they are explicitly whitelisted.
    non_english_only = {
        placeholder: files
        for placeholder, files in all_placeholders.items()
        if placeholder not in english_names
        and placeholder not in VALID_NON_ENGLISH_PLACEHOLDERS
    }

    if args.verbose:
        _print_banner("SUSPICIOUS PLACEHOLDERS (only in non-English files)")
        if not non_english_only:
            print("✅ No suspicious placeholders found!")
        else:
            for placeholder in sorted(non_english_only):
                print(f"###{placeholder}###")
                print("  Found in:")
                for file in non_english_only[placeholder]:
                    print(f"    - {file}")
                print()
    elif non_english_only:
        print("ERRORS: Suspicious placeholders found (use --verbose to see details)")
        for placeholder in sorted(non_english_only):
            print(f"  ❌ ###{placeholder}### (only in non-English files)")

    # The remaining sections are informational and shown only on request.
    if not args.verbose:
        return

    _print_banner("PLACEHOLDER FREQUENCY ANALYSIS")

    # Most widely used first; ties keep the order of first appearance.
    by_usage = sorted(all_placeholders.items(), key=lambda item: len(item[1]), reverse=True)

    print("Most common placeholders:")
    print("-" * 40)
    for placeholder, files in by_usage[:20]:
        print(f"###{placeholder}###".ljust(32) + f"{len(files)} files")

    _print_banner("ENGLISH TEMPLATE REFERENCE")

    print("Placeholders used in English templates:")
    print("-" * 50)
    for filename in sorted(english_placeholders):
        placeholders = sorted(english_placeholders[filename])
        if placeholders:
            print(f"\n{filename}:")
            for placeholder in placeholders:
                print(f"  ###{placeholder}###")

# Run the scanner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
