import os
import sys
from pypdf import PdfReader
import pandas as pd
import arabic_reshaper
from bidi.algorithm import get_display
from datetime import datetime

def process_pdf(file_path):
    """Extract per-page text from a PDF, prepared for RTL display.

    Each page's extracted text is reshaped with ``arabic_reshaper`` and then
    reordered with ``bidi``'s ``get_display``.

    Args:
        file_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        A dict with keys ``type`` (always ``"pdf"``), ``pages`` (page count),
        ``content`` (one ``{"page", "text", "lines"}`` dict per page, pages
        numbered from 1) and ``processed_at`` (ISO-formatted local timestamp
        taken before the pages are extracted).
    """
    pdf = PdfReader(file_path)
    page_count = len(pdf.pages)
    stamped_at = datetime.now().isoformat()

    def _describe(number, pdf_page):
        # Reshape first, then reorder, so the stored text is in display order.
        visual = get_display(arabic_reshaper.reshape(pdf_page.extract_text()))
        # One more line than there are newline characters.
        return {
            "page": number,
            "text": visual,
            "lines": visual.count('\n') + 1
        }

    pages = [
        _describe(number, pdf_page)
        for number, pdf_page in enumerate(pdf.pages, start=1)
    ]

    return {
        "type": "pdf",
        "pages": page_count,
        "content": pages,
        "processed_at": stamped_at
    }

def process_excel(file_path):
    """Summarise an Excel file with its text prepared for RTL display.

    Column names and the rows returned by ``df.head()`` are converted to
    strings, reshaped with ``arabic_reshaper`` and reordered with ``bidi``'s
    ``get_display``. Missing cells (NaN/NaT) are rendered as empty strings
    rather than the literal text ``"nan"``.

    Args:
        file_path: Path of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        A dict with keys ``type`` (always ``"excel"``), ``sheets`` (always 1),
        ``rows`` (row count of the loaded sheet), ``columns`` (display-ready
        column names), ``sample_data`` (list of display-ready rows) and
        ``processed_at`` (ISO-formatted local timestamp).
    """
    # read_excel is called without sheet_name, so only pandas' default sheet
    # is loaded; that is why "sheets" is reported as 1 below.
    df = pd.read_excel(file_path)
    result = {
        "type": "excel",
        "sheets": 1,
        "rows": len(df),
        "columns": [],
        "sample_data": [],
        "processed_at": datetime.now().isoformat()
    }

    def to_display(value):
        # Reshape first, then reorder, so the text is in display order.
        return get_display(arabic_reshaper.reshape(value))

    # Process column names
    result["columns"] = [to_display(str(col)) for col in df.columns]

    # Process first few rows; blank cells would otherwise show up as "nan".
    result["sample_data"] = [
        ["" if pd.isna(cell) else to_display(str(cell)) for cell in row]
        for _, row in df.head().iterrows()
    ]

    return result

def process_word(file_path):
    """Return basic file metadata for a Word document.

    The document itself is not opened; only its size on disk is read.

    Args:
        file_path: Path of the Word file.

    Returns:
        A dict with keys ``type`` (always ``"word"``), ``path`` (the given
        path), ``size`` (bytes, from ``os.path.getsize``) and
        ``processed_at`` (ISO-formatted local timestamp).
    """
    size_in_bytes = os.path.getsize(file_path)
    stamped_at = datetime.now().isoformat()
    metadata = {
        "type": "word",
        "path": file_path,
        "size": size_in_bytes,
        "processed_at": stamped_at,
    }
    return metadata

def test_document_processing(
    test_dir=r"e:\projects\metaboard_backend\src\test\raw_file",
):
    """Run every supported document in ``test_dir`` through its processor.

    Files are dispatched by extension: ``.pdf`` to ``process_pdf``,
    ``.xlsx``/``.xls`` to ``process_excel`` and ``.doc``/``.docx`` to
    ``process_word``. Anything else is skipped. A failure on one file is
    reported and does not stop the remaining files from being processed.

    Args:
        test_dir: Directory containing the documents. Defaults to the
            original hard-coded location so existing no-argument calls keep
            working.

    Returns:
        ``False`` if the directory does not exist or listing it fails;
        ``True`` otherwise, even when individual files could not be
        processed.
    """
    results = []

    try:
        if not os.path.exists(test_dir):
            print(f"Directory not found: {test_dir}")
            return False

        print(f"Processing documents in: {test_dir}")
        print("-" * 50)

        # os.listdir returns entries in arbitrary order; sort so the report
        # is reproducible between runs.
        for filename in sorted(os.listdir(test_dir)):
            file_path = os.path.join(test_dir, filename)
            lowered = filename.lower()
            print(f"\nProcessing: {filename}")

            try:
                if lowered.endswith('.pdf'):
                    result = process_pdf(file_path)
                elif lowered.endswith(('.xlsx', '.xls')):
                    result = process_excel(file_path)
                elif lowered.endswith(('.doc', '.docx')):
                    result = process_word(file_path)
                else:
                    print(f"Skipping unsupported file: {filename}")
                    continue

                result["filename"] = filename
                results.append(result)
                print(f"Successfully processed {filename}")
                print(f"Type: {result['type']}")

                if result['type'] == 'pdf':
                    print(f"Pages: {result['pages']}")
                    # A PDF without pages has an empty content list; indexing
                    # it would raise and report an error for a file that was
                    # already counted as processed.
                    if result['content']:
                        print("First page preview (first 100 chars):")
                        print(result['content'][0]['text'][:100])
                elif result['type'] == 'excel':
                    print(f"Columns: {len(result['columns'])}")
                    print("Column names:", result['columns'])
                elif result['type'] == 'word':
                    print(f"File size: {result['size']} bytes")

            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                print(f"Error processing {filename}: {str(e)}")

        print(f"\nProcessed {len(results)} files successfully")
        return True

    except Exception as e:
        print(f"Error: {str(e)}")
        return False

if __name__ == "__main__":
    # Script entry point: run the document check and report the outcome.
    success = test_document_processing()
    verdict = 'succeeded' if success else 'failed'
    print(f"\nTest {verdict}")