
```py
import pandas as pd
import json
from pymongo import MongoClient
import spacy
from bs4 import BeautifulSoup
import tabula
import re
import textract
from hazm import *  # Persian NLP library
import arabic_reshaper
from bidi.algorithm import get_display
class DataProcessor:
    """Structure Persian and English documents and store results in MongoDB.

    Persian text goes through Hazm (normalizer, POS tagger, chunker,
    lemmatizer); English text goes through a spaCy pipeline.

    NOTE(review): ``process_batch`` calls ``self.extract_text`` and
    ``self.extract_tables``, which are not defined in this class as shown;
    they have to be supplied elsewhere (e.g. a subclass or mixin).
    """

    def __init__(self):
        """Load the NLP resources and open the MongoDB connection."""
        # Load both English and Persian NLP models
        self.nlp_en = spacy.load('en_core_web_sm')
        self.normalizer = Normalizer()  # Hazm Persian normalizer
        # The stemmer is created here but not used by any method below.
        self.stemmer = Stemmer()  # Hazm Persian stemmer
        self.lemmatizer = Lemmatizer()  # Hazm Persian lemmatizer
        # Model paths are relative to the current working directory.
        self.tagger = POSTagger(model='resources/postagger.model')  # Hazm POS tagger
        self.chunker = Chunker(model='resources/chunker.model')  # Hazm chunker
        self.client = MongoClient('mongodb://localhost:27017/')
        self.db = self.client.vr_expo

    def detect_language(self, text):
        """Return ``'fa'`` or ``'en'`` depending on which script dominates.

        Counts characters in the U+0600..U+06FF range against ASCII letters.
        Ties (including empty text) resolve to ``'en'``.
        """
        persian_chars = len(re.findall('[\u0600-\u06FF]', text))
        english_chars = len(re.findall('[a-zA-Z]', text))
        return 'fa' if persian_chars > english_chars else 'en'

    def preprocess_persian(self, text):
        """Run the Persian pipeline over *text* and return every stage.

        Returns:
            A dict with the keys ``normalized``, ``words``, ``pos_tags``
            (the tagger output), ``phrases`` (the raw chunker output),
            ``lemmas`` and ``display_text`` (reshaped, bidi-reordered text).
        """
        # Normalize text
        normalized = self.normalizer.normalize(text)

        # Word tokenization
        words = word_tokenize(normalized)

        # POS tagging
        tagged = self.tagger.tag(words)

        # Chunking for phrases
        chunks = self.chunker.parse(tagged)

        # Get lemmas (dictionary form of words)
        lemmas = [self.lemmatizer.lemmatize(word) for word, _ in tagged]

        # Text reshaping for proper display
        reshaped_text = arabic_reshaper.reshape(normalized)
        bidi_text = get_display(reshaped_text)

        return {
            "normalized": normalized,
            "words": words,
            "pos_tags": tagged,
            "phrases": chunks,
            "lemmas": lemmas,
            "display_text": bidi_text
        }

    @staticmethod
    def _chunk_phrases(chunk_tree):
        """Return the text of each chunked phrase found in *chunk_tree*.

        *chunk_tree* is iterated once. Children that expose ``leaves()``
        are treated as phrases whose leaves are ``(word, tag)`` pairs;
        the words are joined with single spaces. Children without
        ``leaves()`` (tokens that belong to no chunk) are skipped.
        """
        phrases = []
        for node in chunk_tree:
            # Unchunked tokens are bare (word, tag) tuples, not subtrees.
            if not hasattr(node, "leaves"):
                continue
            # Leaves carry their POS tag; keep only the word for the text.
            phrases.append(' '.join(word for word, _ in node.leaves()))
        return phrases

    def structure_text(self, text):
        """Convert unstructured text to a structured dict based on language.

        The result always has ``language``, ``entities``, ``key_phrases``,
        ``sentences`` and ``metadata``. Persian text additionally gets
        ``normalized_text``, ``tokens``, ``pos_tags`` (tags only),
        ``lemmas`` and ``display_text``; English text fills ``entities``,
        ``key_phrases`` and ``sentences`` from the spaCy document.
        """
        lang = self.detect_language(text)

        structured = {
            "language": lang,
            "entities": [],
            "key_phrases": [],
            "sentences": [],
            "metadata": {}
        }
        if lang == 'fa':
            # Process Persian text
            processed = self.preprocess_persian(text)
            structured.update({
                "normalized_text": processed["normalized"],
                "tokens": processed["words"],
                "pos_tags": [tag for _, tag in processed["pos_tags"]],
                "key_phrases": self._chunk_phrases(processed["phrases"]),
                "lemmas": processed["lemmas"],
                "display_text": processed["display_text"]
            })
        else:
            # Process English text using spaCy
            doc = self.nlp_en(text)
            structured.update({
                "entities": [{"text": ent.text, "label": ent.label_} for ent in doc.ents],
                "key_phrases": [chunk.text for chunk in doc.noun_chunks],
                "sentences": [{"text": sent.text} for sent in doc.sents]
            })
        return structured

    def process_batch(self, batch_id):
        """Process every raw file of a batch and persist the combined result.

        Reads the ``rawfiles`` documents whose ``batchId`` equals
        *batch_id*, structures each file's text, aggregates the per-language
        results, inserts the aggregate into ``processeddata`` and returns it.

        Each raw-file document is read for ``path``, ``mimeType``, ``_id``,
        ``originalName`` and ``metadata``; a missing key raises ``KeyError``.
        """
        raw_files = self.db.rawfiles.find({"batchId": batch_id})

        batch_data = {
            "batchId": batch_id,
            "processed_files": [],
            "combined_structure": {
                "persian_content": {
                    "normalized_texts": [],
                    "key_phrases": [],
                    "lemmas": []
                },
                "english_content": {
                    "entities": [],
                    "key_phrases": []
                },
                "tables": [],
                "metadata": {}
            }
        }
        for file in raw_files:
            # NOTE(review): extract_text / extract_tables are not defined in
            # this class as shown; confirm where they come from.
            text = self.extract_text(file['path'], file['mimeType'])
            structured = self.structure_text(text)
            tables = self.extract_tables(file['path'])

            file_data = {
                "fileId": str(file['_id']),
                "originalName": file['originalName'],
                "language": structured["language"],
                "structured_content": structured,
                "tables": tables,
                "metadata": file['metadata']
            }

            # Add to batch data based on language
            if structured["language"] == 'fa':
                batch_data["combined_structure"]["persian_content"]["normalized_texts"].append(
                    structured.get("normalized_text", "")
                )
                batch_data["combined_structure"]["persian_content"]["key_phrases"].extend(
                    structured.get("key_phrases", [])
                )
                batch_data["combined_structure"]["persian_content"]["lemmas"].extend(
                    structured.get("lemmas", [])
                )
            else:
                batch_data["combined_structure"]["english_content"]["entities"].extend(
                    structured.get("entities", [])
                )
                batch_data["combined_structure"]["english_content"]["key_phrases"].extend(
                    structured.get("key_phrases", [])
                )

            batch_data["processed_files"].append(file_data)
            batch_data["combined_structure"]["tables"].extend(tables)
        # Store processed data
        self.db.processeddata.insert_one(batch_data)
        return batch_data
def transform_for_openai(batch_data):
    """Reshape a processed batch into a language-aware payload for OpenAI.

    The returned dict carries the batch id plus a ``content`` section split
    into ``persian``, ``english`` and ``structured_data`` (tables and a
    per-file summary). Lists are passed through by reference, not copied.
    """
    batch_id = batch_data["batchId"]
    combined = batch_data["combined_structure"]

    persian_source = combined["persian_content"]
    persian = {
        key: persian_source[key]
        for key in ("normalized_texts", "key_phrases", "lemmas")
    }

    english_source = combined["english_content"]
    english = {
        key: english_source[key]
        for key in ("entities", "key_phrases")
    }

    tables = combined["tables"]

    # One compact summary per processed file.
    file_summaries = []
    for entry in batch_data["processed_files"]:
        file_summaries.append({
            "name": entry["originalName"],
            "language": entry["language"],
            "content": entry["structured_content"],
        })

    structured_data = {"tables": tables, "files": file_summaries}
    content = {
        "persian": persian,
        "english": english,
        "structured_data": structured_data,
    }
    return {"batchId": batch_id, "content": content}
```

The updated pipeline now handles Farsi text by:

- Using the Hazm library for Persian NLP processing
- Implementing proper text normalization for Persian
- Reshaping and reordering bidirectional text with arabic_reshaper and python-bidi
- Using separate processing paths for Persian and English content
- Applying language-specific text structuring
- Maintaining a proper display format for Persian text

You'll need to install additional dependencies:

```bash
pip install hazm arabic-reshaper python-bidi
```

This ensures your Persian content is properly:

- Normalized (combining different forms of letters)
- Tokenized (breaking into words)
- Tagged (identifying parts of speech)
- Chunked (identifying phrases)
- Lemmatized (finding dictionary forms)
- Displayed (handling right-to-left text)

The provided script is quite comprehensive and includes functionality for both English and Persian text processing, leveraging libraries such as spaCy for English and Hazm for Persian NLP tasks. It also handles the extraction and structuring of data from various file types and stores the processed data in MongoDB.

To adapt this script to your specific needs based on the provided files (PDFs, DOCX, XLSX), here are the steps you can follow:

### 1. **Data Extraction**

Ensure that all relevant data from PDFs, DOCX, and XLSX files are extracted properly. The script already uses `tabula` for extracting tables from PDFs and `textract` for general text extraction. You might need to fine-tune these parts depending on the structure of your PDFs.

```python
def extract_text(self, path, mime_type):
    """Extract plain text and tables from a PDF, DOCX or XLSX file.

    Args:
        path: Location of the file on disk.
        mime_type: MIME type that selects the extraction strategy.

    Returns:
        A ``(text, tables)`` tuple. ``tables`` holds one entry per table;
        each entry is a list of row dicts keyed by the column label
        converted to ``str``. DOCX files yield an empty table list.

    Raises:
        ValueError: If *mime_type* is not one of the supported types.
    """
    def as_records(frame):
        # Tables end up in a MongoDB document, which needs string keys and
        # plain containers: a raw DataFrame cannot be stored, and the
        # default to_dict() layout nests rows under integer index keys.
        return frame.rename(columns=str).to_dict(orient='records')

    if mime_type == 'application/pdf':
        # Use tabula for tables and textract for text
        text = textract.process(path, encoding='utf-8').decode('utf-8')
        frames = tabula.read_pdf(path, pages='all', multiple_tables=True)
        return text, [as_records(frame) for frame in frames]
    elif mime_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        text = textract.process(path, encoding='utf-8').decode('utf-8')
        return text, []
    elif mime_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
        # Only the sheet that read_excel loads by default is used.
        df = pd.read_excel(path)
        text = df.to_string()
        return text, [as_records(df)]
    else:
        raise ValueError(f"Unsupported MIME type: {mime_type}")
```

### 2. **Preprocessing and Structuring**

Enhance the preprocessing and structuring functions to handle the specific content from your files. For instance, ensure that Persian text normalization and tokenization work correctly with your dataset.

```python
def preprocess_persian(self, text):
    """Run the Persian NLP stages over *text* and return all of them.

    The stages are normalization, tokenization, POS tagging, chunking,
    lemmatization and a reshaped, bidi-reordered rendering of the
    normalized text.
    """
    clean_text = self.normalizer.normalize(text)
    tokens = word_tokenize(clean_text)
    tagged_tokens = self.tagger.tag(tokens)
    phrase_tree = self.chunker.parse(tagged_tokens)

    # Lemmatize the token of each (token, tag) pair; the tag is not needed.
    lemma_list = []
    for token, _tag in tagged_tokens:
        lemma_list.append(self.lemmatizer.lemmatize(token))

    # Reshape first, then reorder for display.
    display = get_display(arabic_reshaper.reshape(clean_text))

    result = {}
    result["normalized"] = clean_text
    result["words"] = tokens
    result["pos_tags"] = tagged_tokens
    result["phrases"] = phrase_tree
    result["lemmas"] = lemma_list
    result["display_text"] = display
    return result
```

### 3. **Batch Processing**

Ensure that batch processing integrates well with your existing MongoDB setup. Adjust the `process_batch` function to accommodate your file types and structure the data accordingly.

```python
def process_batch(self, batch_id):
    """Process every raw file of a batch and persist the combined result.

    Looks up the ``rawfiles`` documents for *batch_id*, extracts and
    structures each file, aggregates the per-language results and tables,
    inserts the aggregate into ``processeddata`` and returns it.
    """
    cursor = self.db.rawfiles.find({"batchId": batch_id})

    # Named handles on the aggregate's nested containers; the same objects
    # are embedded in batch_data below.
    persian_bucket = {"normalized_texts": [], "key_phrases": [], "lemmas": []}
    english_bucket = {"entities": [], "key_phrases": []}
    all_tables = []
    processed = []

    batch_data = {
        "batchId": batch_id,
        "processed_files": processed,
        "combined_structure": {
            "persian_content": persian_bucket,
            "english_content": english_bucket,
            "tables": all_tables,
            "metadata": {},
        },
    }

    for record in cursor:
        text, tables = self.extract_text(record['path'], record['mimeType'])
        structured = self.structure_text(text)
        language = structured["language"]

        entry = {
            "fileId": str(record['_id']),
            "originalName": record['originalName'],
            "language": language,
            "structured_content": structured,
            "tables": tables,
            "metadata": record['metadata'],
        }

        if language == 'fa':
            persian_bucket["normalized_texts"].append(
                structured.get("normalized_text", "")
            )
            for field in ("key_phrases", "lemmas"):
                persian_bucket[field].extend(structured.get(field, []))
        else:
            for field in ("entities", "key_phrases"):
                english_bucket[field].extend(structured.get(field, []))

        processed.append(entry)
        all_tables.extend(tables)

    self.db.processeddata.insert_one(batch_data)
    return batch_data
```

### 4. **Transform for OpenAI**

Modify the transformation function to fit the format expected by your OpenAI assistant.

```python
def transform_for_openai(batch_data):
    """Build the OpenAI-facing payload from a processed batch.

    The payload keeps the batch id and groups the aggregated content into
    ``persian``, ``english`` and ``structured_data`` sections. Aggregated
    lists are referenced, not copied.
    """
    payload = {"batchId": batch_data["batchId"]}
    merged = batch_data["combined_structure"]

    content = {}
    content["persian"] = {
        field: merged["persian_content"][field]
        for field in ("normalized_texts", "key_phrases", "lemmas")
    }
    content["english"] = {
        field: merged["english_content"][field]
        for field in ("entities", "key_phrases")
    }

    def summarize(file_entry):
        # Only the name, language and structured content are forwarded.
        return {
            "name": file_entry["originalName"],
            "language": file_entry["language"],
            "content": file_entry["structured_content"],
        }

    tables = merged["tables"]
    content["structured_data"] = {
        "tables": tables,
        "files": list(map(summarize, batch_data["processed_files"])),
    }

    payload["content"] = content
    return payload
```

### 5. **Integration with MongoDB**

Ensure that your MongoDB connection is correctly set up using PyMongo [[1]]. If you're using a cloud-based MongoDB service like MongoDB Atlas, adjust the connection URI accordingly [[7]].

```python
from pymongo import MongoClient

# Connect to the MongoDB server addressed by this URI.
client = MongoClient(host='mongodb://localhost:27017/')
# Select the database by name; item access is equivalent to attribute access.
db = client['vr_expo']
```

### Conclusion

This approach ensures that your data is efficiently extracted, preprocessed, structured, and stored in a manner suitable for integration with an OpenAI assistant. Make sure to install all necessary dependencies and test each part of the pipeline independently to identify and resolve any issues early on [[3]].
