mahmoudsaber0 committed on
Commit 6101878 · verified · Parent(s): 97cd85d

Update app.py

Files changed (1)
  1. app.py +623 -337
app.py CHANGED
@@ -4,43 +4,21 @@ import torch
 import logging
 import gc
 import sys
-import pwd  # Added for monkey patch
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import Dict, List, Optional
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from tokenizers.normalizers import Sequence, Replace, Strip
 from tokenizers import Regex
-from huggingface_hub import hf_hub_download  # Added for reliable HF downloads
-
-# =====================================================
-# 🛠️ Monkey Patch for Docker/Container UID Issue
-# =====================================================
-# Fix for 'getpwuid(): uid not found: 1000' in containerized environments
-def patched_getpwuid(uid_num):
-    try:
-        return original_getpwuid(uid_num)
-    except KeyError:
-        if uid_num == os.getuid():
-            # Create a fake user entry (struct_passwd takes a 7-field sequence)
-            return pwd.struct_passwd((
-                'dockeruser',   # pw_name
-                'x',            # pw_passwd
-                uid_num,        # pw_uid
-                os.getgid(),    # pw_gid
-                'Docker User',  # pw_gecos
-                '/tmp',         # pw_dir
-                '/bin/sh'       # pw_shell
-            ))
-        raise
-
-original_getpwuid = pwd.getpwuid
-pwd.getpwuid = patched_getpwuid
-
-# Set fallback env vars to avoid user-dependent paths
-os.environ.setdefault('HOME', '/tmp')
-os.environ.setdefault('USER', 'dockeruser')
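The removed patch works by wrapping `pwd.getpwuid` so an unmapped container UID resolves to a synthetic entry instead of raising `KeyError`. A minimal sketch of how it could be verified, assuming the patch above has already run:

    import os
    import pwd
    # Inside a container with an unmapped UID this would normally raise KeyError;
    # with the patch it returns the synthetic 'dockeruser' entry instead.
    entry = pwd.getpwuid(os.getuid())
    print(entry.pw_name, entry.pw_dir)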
 # =====================================================
 # 🔧 Environment configuration and settings
@@ -55,14 +33,15 @@ logger = logging.getLogger(__name__)
 CACHE_DIR = "/tmp/huggingface_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)

-# Hugging Face environment variables (removed TRANSFORMERS_CACHE to avoid its deprecation warning)
 os.environ.update({
     "HF_HOME": CACHE_DIR,
     "HF_DATASETS_CACHE": CACHE_DIR,
     "HUGGINGFACE_HUB_CACHE": CACHE_DIR,
     "TORCH_HOME": CACHE_DIR,
-    "TOKENIZERS_PARALLELISM": "false",  # prevent threading issues
-    "TRANSFORMERS_OFFLINE": "0",  # allow downloading from the internet
 })

 # PyTorch memory settings
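These variables redirect every Hugging Face cache into a world-writable directory instead of the user's home. A minimal sketch of the effect (assumes huggingface_hub is installed; note the variables must be set before the library is imported):

    import os
    os.environ["HF_HOME"] = "/tmp/huggingface_cache"  # must precede the import
    from huggingface_hub import hf_hub_download
    # The downloaded file is cached under /tmp/huggingface_cache.
    path = hf_hub_download(repo_id="answerdotai/ModernBERT-base", filename="config.json")
    print(path)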
@@ -96,282 +75,574 @@ label_mapping = {
 }

 # =====================================================
-# 🤖 Model Manager
 # =====================================================
-class ModelManager:
     def __init__(self):
-        self.tokenizer = None
-        self.models = []
         self.models_loaded = False
-        self.model_urls = [
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
         ]
-        self.base_model_id = "answerdotai/ModernBERT-base"  # Primary
-        self.fallback_model_id = "bert-base-uncased"  # Fallback if ModernBERT fails
-        self.using_fallback = False

-    def load_tokenizer(self):
-        """Load the tokenizer, with a fallback"""
         try:
-            logger.info(f"📝 Loading tokenizer from {self.base_model_id}...")
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.base_model_id,
                 cache_dir=CACHE_DIR,
                 use_fast=True,
                 trust_remote_code=False
             )
-            logger.info("✅ Primary tokenizer loaded successfully")

-        except Exception as e:
-            logger.warning(f"⚠️ Failed to load primary tokenizer: {e}")
             try:
-                logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    self.fallback_model_id,
-                    cache_dir=CACHE_DIR,
-                    use_fast=True,
-                    trust_remote_code=False
-                )
-                self.using_fallback = True
-                logger.info("✅ Fallback tokenizer loaded successfully")
-            except Exception as fallback_e:
-                logger.error(f"❌ Failed to load fallback tokenizer: {fallback_e}")
-                return False
-
-        # Set up the text normalizer
-        try:
-            newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
-            join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
-            self.tokenizer.backend_tokenizer.normalizer = Sequence([
-                self.tokenizer.backend_tokenizer.normalizer,
-                join_hyphen_break,
-                newline_to_space,
-                Strip()
-            ])
         except Exception as e:
-            logger.warning(f"⚠️ Could not set custom normalizer: {e}")
-
-        return True

-    def load_single_model(self, model_url=None, model_path=None, model_name="Model"):
-        """Load a single model, with fallback and comprehensive error handling"""
-        base_model = None
         try:
-            logger.info(f"🤖 Loading base {model_name} from {self.base_model_id}...")

-            # Try the primary base model first
             base_model = AutoModelForSequenceClassification.from_pretrained(
-                self.base_model_id,
                 num_labels=41,
                 cache_dir=CACHE_DIR,
-                dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # Updated from torch_dtype
                 low_cpu_mem_usage=True,
                 trust_remote_code=False
             )
-            logger.info("✅ Primary base model loaded")

-        except Exception as e:
-            logger.warning(f"⚠️ Failed to load primary base model: {e}")
-            try:
-                logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
-                base_model = AutoModelForSequenceClassification.from_pretrained(
-                    self.fallback_model_id,
-                    num_labels=41,
-                    cache_dir=CACHE_DIR,
-                    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # Updated from torch_dtype
-                    low_cpu_mem_usage=True,
-                    trust_remote_code=False
-                )
-                self.using_fallback = True
-                logger.info("✅ Fallback base model loaded (note: weights may not be compatible)")
-            except Exception as fallback_e:
-                logger.error(f"❌ Failed to load fallback base model: {fallback_e}")
-                return None
-
-        # Try to load the weights (only when not in fallback mode, or when compatible)
-        try:
             if model_path and os.path.exists(model_path):
                 logger.info(f"📁 Loading from local file: {model_path}")
                 state_dict = torch.load(model_path, map_location=device, weights_only=True)
                 base_model.load_state_dict(state_dict, strict=False)
             elif model_url:
-                # Use hf_hub_download instead of torch.hub for HF repos
-                logger.info(f"🌐 Downloading weights from HF repo...")
-                repo_id = "mihalykiss/modernbert_2"
-                filename = model_url.split('/')[-1]  # Extract a filename like "Model_groups_3class_seed12"
-                pt_file = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=filename,
-                    cache_dir=CACHE_DIR,
-                    local_dir_use_symlinks=False
                 )
-                state_dict = torch.load(pt_file, map_location=device, weights_only=True)

-                # Load the weights only when not in fallback mode (ModernBERT weights may not fit standard BERT)
-                if not self.using_fallback:
-                    base_model.load_state_dict(state_dict, strict=False)
-                    logger.info("✅ Weights loaded successfully")
-                else:
-                    logger.warning("⚠️ Skipping weight load in fallback mode (incompatible architecture)")
-            else:
-                logger.info("📊 Using model with random initialization")
-        except Exception as weight_error:
-            logger.warning(f"⚠️ Could not load weights: {weight_error}")
-            logger.info("📊 Continuing with base model (random or pre-trained init)")
-
-        # Move the model to the right device
-        model = base_model.to(device)
-        model.eval()
-
-        # Free memory
-        if 'state_dict' in locals():
-            del state_dict
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-
-        logger.info(f"✅ {model_name} loaded successfully (fallback: {self.using_fallback})")
-        return model
-
-    def load_models(self, max_models=3):  # Default raised to 3 to load the local file plus 2 URLs
-        """Load the models, with a memory cap"""
         if self.models_loaded:
             logger.info("✨ Models already loaded")
             return True

-        # Load the tokenizer first
-        if not self.load_tokenizer():
-            logger.error("❌ Tokenizer load failed - cannot proceed")
             return False

-        # Load the models
-        logger.info(f"🚀 Loading up to {max_models} models...")

-        # Try the local file first
-        local_model_path = "modernbert.bin"
-        if os.path.exists(local_model_path):
-            model = self.load_single_model(
-                model_path=local_model_path,
-                model_name="Model 1 (Local)"
             )
             if model is not None:
-                self.models.append(model)

-        # Load models from the URLs (filenames are extracted downstream)
-        for i, full_url in enumerate(self.model_urls[:max_models - len(self.models)]):
-            if len(self.models) >= max_models:
                 break
-
-            # Pass full_url as-is; load_single_model extracts the filename
-            model = self.load_single_model(
-                model_url=full_url,
-                model_name=f"Model {len(self.models) + 1}"
             )
             if model is not None:
-                self.models.append(model)
-
-            # Check available memory
-            if torch.cuda.is_available():
-                mem_allocated = torch.cuda.memory_allocated() / 1024**3
-                mem_reserved = torch.cuda.memory_reserved() / 1024**3
-                logger.info(f"💾 GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
-
-                # Stop loading if memory is full
-                if mem_allocated > 6:  # 6GB cap
-                    logger.warning("⚠️ Memory limit reached, stopping model loading")
-                    break

-        # Verify that loading succeeded
-        if len(self.models) > 0:
             self.models_loaded = True
-            logger.info(f"✅ Successfully loaded {len(self.models)} models (using fallback: {self.using_fallback})")
             return True
         else:
             logger.error("❌ No models could be loaded")
             return False

-    def classify_text(self, text: str) -> Dict:
-        """Analyze the text with the loaded models"""
-        if not self.models_loaded or len(self.models) == 0:
-            raise ValueError("No models loaded")
-
-        # Clean the text
-        cleaned_text = clean_text(text)
-        if not cleaned_text.strip():
-            raise ValueError("Empty text after cleaning")
-
-        # Tokenization (max_length adjusted for fallback BERT if needed)
-        max_len = 512 if not self.using_fallback else 512  # BERT max is 512
         try:
-            inputs = self.tokenizer(
                 cleaned_text,
                 return_tensors="pt",
                 truncation=True,
-                max_length=max_len,
                 padding=True
             ).to(device)
         except Exception as e:
-            logger.error(f"Tokenization error: {e}")
-            raise ValueError(f"Failed to tokenize text: {e}")
-
-        # Collect the predictions
-        all_probabilities = []
-
-        with torch.no_grad():
-            for i, model in enumerate(self.models):
-                try:
-                    logits = model(**inputs).logits
-                    probs = torch.softmax(logits, dim=1)
-                    all_probabilities.append(probs)
-                except Exception as e:
-                    logger.warning(f"Model {i+1} prediction failed: {e}")
-                    continue

-        if not all_probabilities:
-            raise ValueError("All models failed to make predictions")

-        # Average the distributions (soft voting)
-        averaged_probs = torch.mean(torch.stack(all_probabilities), dim=0)
-        probabilities = averaged_probs[0]

-        # Compute the Human vs AI percentages
-        human_prob = probabilities[24].item()
-        ai_probs = probabilities.clone()
-        ai_probs[24] = 0  # zero out the Human probability
-        ai_total_prob = ai_probs.sum().item()

-        # Normalize
-        total = human_prob + ai_total_prob
-        if total > 0:
-            human_percentage = (human_prob / total) * 100
-            ai_percentage = (ai_total_prob / total) * 100
-        else:
-            human_percentage = 50
-            ai_percentage = 50

-        # Identify the most likely model
-        ai_model_idx = torch.argmax(ai_probs).item()
-        predicted_model = label_mapping.get(ai_model_idx, "Unknown")

-        # Top 5 predictions
-        top_5_probs, top_5_indices = torch.topk(probabilities, 5)
-        top_5_results = []
-        for prob, idx in zip(top_5_probs, top_5_indices):
-            top_5_results.append({
-                "model": label_mapping.get(idx.item(), "Unknown"),
-                "probability": round(prob.item() * 100, 2)
-            })

-        return {
-            "human_percentage": round(human_percentage, 2),
-            "ai_percentage": round(ai_percentage, 2),
-            "predicted_model": predicted_model,
-            "top_5_predictions": top_5_results,
-            "is_human": human_percentage > ai_percentage,
-            "models_used": len(all_probabilities),
-            "using_fallback": self.using_fallback
-        }

 # =====================================================
 # 🧹 Cleaning and processing functions
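The soft-voting step removed above is worth spelling out: each model contributes a full probability distribution and the ensemble averages them before thresholding. A self-contained sketch with dummy two-model outputs:

    import torch
    # Two models' softmax outputs over the same classes (dummy values).
    p1 = torch.tensor([[0.2, 0.8]])
    p2 = torch.tensor([[0.4, 0.6]])
    # Soft voting: average the distributions, then read off the winner.
    averaged = torch.mean(torch.stack([p1, p2]), dim=0)
    print(averaged)  # tensor([[0.3000, 0.7000]])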
@@ -391,12 +662,12 @@ def split_into_paragraphs(text: str) -> List[str]:
 # 🌐 FastAPI Application
 # =====================================================
 app = FastAPI(
-    title="ModernBERT AI Text Detector",
-    description="Detect AI-generated text",
-    version="2.3.0"  # Version bump for 3 models and deprecation fixes
 )

-# Add CORS to allow use from the browser
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -405,8 +676,8 @@ app.add_middleware(
     allow_headers=["*"],
 )

-# Create the model manager
-model_manager = ModelManager()

 # =====================================================
 # 📝 Data models (Pydantic Models)
@@ -414,11 +685,12 @@ model_manager = ModelManager()
 class TextInput(BaseModel):
     text: str
     analyze_paragraphs: Optional[bool] = False

 class SimpleTextInput(BaseModel):
     text: str

-class DetectionResult(BaseModel):
     success: bool
     code: int
     message: str
@@ -431,33 +703,46 @@ class DetectionResult(BaseModel):
 async def startup_event():
     """Load the models at startup"""
     logger.info("=" * 50)
-    logger.info("🚀 Starting ModernBERT AI Detector...")
     logger.info(f"🐍 Python version: {sys.version}")
     logger.info(f"🔥 PyTorch version: {torch.__version__}")
-    import transformers
-    logger.info(f"🔧 Transformers version: {transformers.__version__}")
-    logger.info("🛡️ UID Monkey Patch Applied (for Docker/Container)")
     logger.info("=" * 50)

-    # Try loading the models
-    max_models = int(os.environ.get("MAX_MODELS", "3"))  # Default raised to 3
-    success = model_manager.load_models(max_models=max_models)

     if success:
-        logger.info(f"✅ Application ready! (Fallback mode: {model_manager.using_fallback})")
     else:
         logger.error("⚠️ Failed to load models - API will return errors")
-        logger.info("💡 Tip: Ensure 'transformers>=4.45.0' and 'huggingface_hub' are installed. Run: pip install --upgrade transformers huggingface_hub")

 @app.get("/")
 async def root():
     """Landing page"""
     return {
-        "message": "ModernBERT AI Text Detector API",
         "status": "online" if model_manager.models_loaded else "initializing",
-        "models_loaded": len(model_manager.models),
-        "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "endpoints": {
             "analyze": "/analyze",
             "simple": "/analyze-simple",
@@ -478,112 +763,111 @@ async def health_check():

     return {
         "status": "healthy" if model_manager.models_loaded else "unhealthy",
-        "models_loaded": len(model_manager.models),
-        "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "cuda_available": torch.cuda.is_available(),
         "memory_info": memory_info
     }

-@app.post("/analyze", response_model=DetectionResult)
-async def analyze_text(data: TextInput):
     """
-    Analyze the text for AI detection
-    Mirrors the Gradio classify_text behaviour
     """
     try:
-        # Validate the text
         text = data.text.strip()
         if not text:
-            return DetectionResult(
                 success=False,
                 code=400,
                 message="Empty input text",
                 data={}
             )

-        # Make sure the models are loaded
         if not model_manager.models_loaded:
-            # Try loading the models
-            if not model_manager.load_models():
-                return DetectionResult(
                     success=False,
                     code=503,
-                    message="Models not available. Check logs for details.",
                     data={}
                 )

-        # Count the words
-        total_words = len(text.split())
-
-        # Base analysis
-        result = model_manager.classify_text(text)

-        # Base results
-        ai_percentage = result["ai_percentage"]
-        human_percentage = result["human_percentage"]
         ai_words = int(total_words * (ai_percentage / 100))

-        # Paragraph analysis, if requested
         paragraphs_analysis = []
-        if data.analyze_paragraphs and ai_percentage > 50:
             paragraphs = split_into_paragraphs(text)
-            recalc_ai_words = 0
-            recalc_total_words = 0
-
-            for para in paragraphs[:10]:  # cap at 10 paragraphs
                 if para.strip():
                     try:
-                        para_result = model_manager.classify_text(para)
                         para_words = len(para.split())
-                        recalc_total_words += para_words
-                        recalc_ai_words += para_words * (para_result["ai_percentage"] / 100)

                         paragraphs_analysis.append({
                             "paragraph": para[:200] + "..." if len(para) > 200 else para,
-                            "ai_generated_score": para_result["ai_percentage"] / 100,
-                            "human_written_score": para_result["human_percentage"] / 100,
-                            "predicted_model": para_result["predicted_model"]
                         })
                     except Exception as e:
                         logger.warning(f"Failed to analyze paragraph: {e}")

-            # Recompute the percentages from the paragraphs
-            if recalc_total_words > 0:
-                ai_percentage = round((recalc_ai_words / recalc_total_words) * 100, 2)
-                human_percentage = round(100 - ai_percentage, 2)
-                ai_words = int(recalc_ai_words)

-        # Build the feedback message
-        if ai_percentage > 50:
-            feedback = "Most of Your Text is AI/GPT Generated"
-        else:
-            feedback = "Most of Your Text Appears Human-Written"

-        # Return the results in the original response format
-        return DetectionResult(
             success=True,
             code=200,
-            message="analysis completed",
-            data={
-                "fakePercentage": ai_percentage,
-                "isHuman": human_percentage,
-                "textWords": total_words,
-                "aiWords": ai_words,
-                "paragraphs": paragraphs_analysis,
-                "predicted_model": result["predicted_model"],
-                "feedback": feedback,
-                "input_text": text[:500] + "..." if len(text) > 500 else text,
-                "detected_language": "en",
-                "top_5_predictions": result.get("top_5_predictions", []),
-                "models_used": result.get("models_used", 1),
-                "using_fallback": result.get("using_fallback", False)
-            }
         )

     except Exception as e:
         logger.error(f"Analysis error: {e}", exc_info=True)
-        return DetectionResult(
             success=False,
             code=500,
             message=f"Analysis failed: {str(e)}",
@@ -593,7 +877,7 @@ async def analyze_text(data: TextInput):
 @app.post("/analyze-simple")
 async def analyze_simple(data: SimpleTextInput):
     """
-    Simple analysis: returns only the basic results
     """
     try:
         text = data.text.strip()
@@ -601,18 +885,20 @@ async def analyze_simple(data: SimpleTextInput):
         raise HTTPException(status_code=400, detail="Empty text")

         if not model_manager.models_loaded:
-            if not model_manager.load_models():
             raise HTTPException(status_code=503, detail="Models not available")

-        result = model_manager.classify_text(text)

         return {
-            "is_ai": result["ai_percentage"] > 50,
-            "ai_score": result["ai_percentage"],
-            "human_score": result["human_percentage"],
-            "detected_model": result["predicted_model"] if result["ai_percentage"] > 50 else None,
-            "confidence": max(result["ai_percentage"], result["human_percentage"]),
-            "using_fallback": result.get("using_fallback", False)
         }

     except HTTPException:
@@ -627,21 +913,21 @@ async def analyze_simple(data: SimpleTextInput):
 if __name__ == "__main__":
     import uvicorn

-    # Read the settings from the environment
     port = int(os.environ.get("PORT", 8000))
     host = os.environ.get("HOST", "0.0.0.0")
     workers = int(os.environ.get("WORKERS", 1))

     logger.info("=" * 50)
-    logger.info(f"🌐 Starting server on {host}:{port}")
     logger.info(f"👷 Workers: {workers}")
     logger.info(f"📚 Documentation: http://{host}:{port}/docs")
     logger.info("=" * 50)

     uvicorn.run(
-        "main:app",  # Assuming this file is named main.py
         host=host,
         port=port,
         workers=workers,
-        reload=False  # Set to True for dev
     )
 import logging
 import gc
 import sys
+import numpy as np
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import Dict, List, Optional
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    AutoModelForCausalLM,
+    pipeline
+)
 from tokenizers.normalizers import Sequence, Replace, Strip
 from tokenizers import Regex
+import math
+from collections import Counter

 # =====================================================
 # 🔧 Environment configuration and settings
 
 CACHE_DIR = "/tmp/huggingface_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)

+# Hugging Face environment variables
 os.environ.update({
     "HF_HOME": CACHE_DIR,
+    "TRANSFORMERS_CACHE": CACHE_DIR,
     "HF_DATASETS_CACHE": CACHE_DIR,
     "HUGGINGFACE_HUB_CACHE": CACHE_DIR,
     "TORCH_HOME": CACHE_DIR,
+    "TOKENIZERS_PARALLELISM": "false",
+    "TRANSFORMERS_OFFLINE": "0",
 })

 # PyTorch memory settings
 
 }

 # =====================================================
+# 📈 Perplexity and burstiness computations
+# =====================================================
+class TextMetrics:
+    """Statistical metrics for a text"""
+
+    @staticmethod
+    def calculate_perplexity(text: str, model=None, tokenizer=None):
+        """
+        Perplexity measures how "surprised" a model is by the text.
+        AI text usually has lower perplexity (it is more predictable).
+        """
+        try:
+            if model is None or tokenizer is None:
+                # Approximation based on word frequencies
+                words = text.lower().split()
+                word_freq = Counter(words)
+                total_words = len(words)
+
+                # Compute the entropy
+                entropy = 0
+                for count in word_freq.values():
+                    probability = count / total_words
+                    if probability > 0:
+                        entropy -= probability * math.log2(probability)
+
+                # Approximate the perplexity
+                perplexity = 2 ** entropy
+                return min(perplexity, 1000)  # Cap at 1000
+            else:
+                # Exact computation with a language model
+                inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+                with torch.no_grad():
+                    outputs = model(**inputs, labels=inputs["input_ids"])
+                loss = outputs.loss
+                perplexity = torch.exp(loss).item()
+                return min(perplexity, 1000)
+        except Exception as e:
+            logger.warning(f"Error calculating perplexity: {e}")
+            return 50.0  # Default value
+
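The fallback branch is a unigram-entropy approximation rather than true model perplexity; a worked example of the arithmetic it performs:

    import math
    from collections import Counter
    words = "the cat sat on the mat".lower().split()
    freq = Counter(words)
    n = len(words)
    # Unigram entropy H = -sum(p * log2(p)); perplexity = 2 ** H.
    H = -sum((c / n) * math.log2(c / n) for c in freq.values())
    print(round(2 ** H, 2))  # ~4.76; more repetition pushes this lower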
+    @staticmethod
+    def calculate_burstiness(text: str):
+        """
+        Burstiness measures the variation in sentence length.
+        Humans show higher burstiness (sentences of varied length);
+        AI is usually more uniform.
+        """
+        try:
+            # Split the text into sentences
+            sentences = re.split(r'[.!?]+', text)
+            sentences = [s.strip() for s in sentences if s.strip()]
+
+            if len(sentences) < 2:
+                return 0.0
+
+            # Compute each sentence's length
+            sentence_lengths = [len(s.split()) for s in sentences]
+
+            # Mean and standard deviation
+            mean_length = np.mean(sentence_lengths)
+            std_length = np.std(sentence_lengths)
+
+            # Burstiness = standard deviation / mean
+            if mean_length > 0:
+                burstiness = std_length / mean_length
+            else:
+                burstiness = 0.0
+
+            return round(burstiness, 4)
+        except Exception as e:
+            logger.warning(f"Error calculating burstiness: {e}")
+            return 0.5
+
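Burstiness as defined above is the coefficient of variation of sentence lengths; uniform lengths score near 0 (AI-like), mixed short and long sentences score higher (human-like). For example:

    import numpy as np
    for lengths in ([12, 12, 12, 12], [3, 22, 7, 15]):
        print(round(np.std(lengths) / np.mean(lengths), 4))
    # 0.0 for the uniform lengths, ~0.6236 for the varied ones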
+    @staticmethod
+    def calculate_vocabulary_diversity(text: str):
+        """
+        Vocabulary diversity: humans tend to use a more varied vocabulary.
+        """
+        words = text.lower().split()
+        unique_words = set(words)
+        if len(words) > 0:
+            diversity = len(unique_words) / len(words)
+        else:
+            diversity = 0
+        return round(diversity, 4)
+
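This is a plain type-token ratio, e.g.:

    words = "very very very unique words".lower().split()
    print(len(set(words)) / len(words))  # 0.6: three unique tokens out of five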
+    @staticmethod
+    def detect_ai_patterns(text: str):
+        """
+        Detect phrases that are common in AI-generated text
+        """
+        ai_patterns = [
+            r"it['\s]+s important to note",
+            r"in conclusion",
+            r"furthermore",
+            r"comprehensive understanding",
+            r"it is worth noting",
+            r"however, it should be noted",
+            r"on the other hand",
+            r"in summary",
+            r"to begin with",
+            r"first and foremost"
+        ]
+
+        pattern_count = 0
+        for pattern in ai_patterns:
+            if re.search(pattern, text.lower()):
+                pattern_count += 1
+
+        return pattern_count
+
+    @staticmethod
+    def detect_human_patterns(text: str):
+        """
+        Detect patterns that are common in human writing
+        """
+        human_patterns = [
+            r"kinda|sorta|gonna|wanna|gotta",
+            r"tbh|idk|lol|omg|btw",
+            r"!{2,}|\?{2,}|\.{3,}",
+            r"i think|i feel|i believe",
+            r"like,|you know,|i mean,",
+            r"anyway|anyhow|whatever"
+        ]
+
+        pattern_count = 0
+        for pattern in human_patterns:
+            if re.search(pattern, text.lower()):
+                pattern_count += 1
+
+        return pattern_count
+
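Both detectors score one point per matched pattern, regardless of how often it occurs. A self-contained illustration with two of the AI patterns listed above:

    import re
    ai_patterns = [r"in conclusion", r"it is worth noting"]
    sample = "In conclusion, it is worth noting that the method works."
    print(sum(bool(re.search(p, sample.lower())) for p in ai_patterns))  # 2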
+# =====================================================
+# 🤖 Enhanced Model Manager
 # =====================================================
+class EnhancedModelManager:
     def __init__(self):
+        self.modernbert_tokenizer = None
+        self.modernbert_models = []
+        self.additional_models = {}
+        self.additional_tokenizers = {}
         self.models_loaded = False
+        self.metrics = TextMetrics()
+
+        # ModernBERT URLs
+        self.modernbert_urls = [
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
         ]

+        # Additional models to try
+        self.additional_model_configs = [
+            {
+                "name": "chatgpt-detector-roberta",
+                "model_id": "Hello-SimpleAI/chatgpt-detector-roberta",
+                "type": "classification"
+            },
+            {
+                "name": "openai-detector",
+                "model_id": "roberta-base-openai-detector",
+                "type": "classification"
+            },
+            {
+                "name": "ai-content-detector",
+                "model_id": "PirateXX/AI-Content-Detector",
+                "type": "classification"
+            }
+        ]
+
+    def load_modernbert_tokenizer(self):
+        """Load the ModernBERT tokenizer"""
         try:
+            logger.info("📝 Loading ModernBERT tokenizer...")
+            self.modernbert_tokenizer = AutoTokenizer.from_pretrained(
+                "answerdotai/ModernBERT-base",
                 cache_dir=CACHE_DIR,
                 use_fast=True,
                 trust_remote_code=False
             )

+            # Set up the text normalizer
             try:
+                newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
+                join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
+                self.modernbert_tokenizer.backend_tokenizer.normalizer = Sequence([
+                    self.modernbert_tokenizer.backend_tokenizer.normalizer,
+                    join_hyphen_break,
+                    newline_to_space,
+                    Strip()
+                ])
+            except Exception as e:
+                logger.warning(f"⚠️ Could not set custom normalizer: {e}")
+
+            logger.info("✅ ModernBERT tokenizer loaded")
+            return True
         except Exception as e:
+            logger.error(f"❌ Failed to load tokenizer: {e}")
+            return False
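For reference, a rough plain-`re` equivalent of what the chained normalizer does to hyphenated line breaks and newlines (illustration only; the real work happens inside the tokenizer):

    import re
    raw = "trans-\nformers are state of\nthe art"
    s = re.sub(r'(\w+)[--]\s*\n\s*(\w+)', r'\1\2', raw)  # rejoin hyphen breaks
    s = re.sub(r'\s*\n\s*', ' ', s).strip()              # newlines -> spaces
    print(s)  # 'transformers are state of the art'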
 
+    def load_modernbert_model(self, model_url=None, model_path=None, model_name="ModernBERT"):
+        """Load a single ModernBERT model"""
         try:
+            logger.info(f"🤖 Loading {model_name}...")

             base_model = AutoModelForSequenceClassification.from_pretrained(
+                "answerdotai/ModernBERT-base",
                 num_labels=41,
                 cache_dir=CACHE_DIR,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                 low_cpu_mem_usage=True,
                 trust_remote_code=False
             )

             if model_path and os.path.exists(model_path):
                 logger.info(f"📁 Loading from local file: {model_path}")
                 state_dict = torch.load(model_path, map_location=device, weights_only=True)
                 base_model.load_state_dict(state_dict, strict=False)
             elif model_url:
+                logger.info(f"🌐 Downloading weights from URL...")
+                try:
+                    state_dict = torch.hub.load_state_dict_from_url(
+                        model_url,
+                        map_location=device,
+                        progress=True,
+                        check_hash=False,
+                        file_name=f"{model_name}.pt"
+                    )
+                    base_model.load_state_dict(state_dict, strict=False)
+                except Exception as e:
+                    logger.warning(f"⚠️ Could not load weights: {e}")
+                    logger.info("📊 Using model with random initialization")
+
+            model = base_model.to(device)
+            model.eval()
+
+            if 'state_dict' in locals():
+                del state_dict
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            logger.info(f"✅ {model_name} loaded")
+            return model
+
+        except Exception as e:
+            logger.error(f"❌ Failed to load {model_name}: {e}")
+            return None
+
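`load_state_dict(strict=False)` above silently tolerates key mismatches; its return value lists what was skipped, which is worth logging when a checkpoint does not quite match the architecture. A tiny self-contained demonstration:

    import torch
    import torch.nn as nn
    model = nn.Linear(4, 2)
    ckpt = {"weight": torch.zeros(2, 4), "extra_key": torch.zeros(1)}  # no 'bias'
    result = model.load_state_dict(ckpt, strict=False)
    print(result.missing_keys, result.unexpected_keys)  # ['bias'] ['extra_key']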
+    def load_additional_model(self, model_config):
+        """Load an additional AI-detection model"""
+        try:
+            model_name = model_config["name"]
+            model_id = model_config["model_id"]
+
+            logger.info(f"🔧 Loading {model_name}...")
+
+            # Try loading as a pipeline first (easier)
+            try:
+                classifier = pipeline(
+                    "text-classification",
+                    model=model_id,
+                    device=0 if torch.cuda.is_available() else -1,
+                    model_kwargs={"cache_dir": CACHE_DIR}
                 )
+                self.additional_models[model_name] = classifier
+                logger.info(f"✅ {model_name} loaded as pipeline")
+                return True
+            except Exception:
+                # Try loading manually
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_id,
+                    cache_dir=CACHE_DIR
+                )
+                model = AutoModelForSequenceClassification.from_pretrained(
+                    model_id,
+                    cache_dir=CACHE_DIR,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+                ).to(device)
+                model.eval()

+                self.additional_tokenizers[model_name] = tokenizer
+                self.additional_models[model_name] = model
+                logger.info(f"✅ {model_name} loaded manually")
+                return True
+
+        except Exception as e:
+            logger.warning(f"⚠️ Could not load {model_config['name']}: {e}")
+            return False
+
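A quick way to sanity-check one of these detectors outside the manager (assumes the Hello-SimpleAI/chatgpt-detector-roberta weights are reachable; label names vary from model to model):

    from transformers import pipeline
    clf = pipeline("text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
    print(clf("This is a short test sentence.", truncation=True, max_length=512))
    # e.g. [{'label': 'Human', 'score': 0.98}]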
+    def load_all_models(self, max_modernbert=2, load_additional=True):
+        """Load every model"""
         if self.models_loaded:
             logger.info("✨ Models already loaded")
             return True

+        # Load the ModernBERT tokenizer
+        if not self.load_modernbert_tokenizer():
             return False

+        # Load the ModernBERT models
+        logger.info(f"🚀 Loading up to {max_modernbert} ModernBERT models...")

+        # Try the local file first
+        local_path = "modernbert.bin"
+        if os.path.exists(local_path):
+            model = self.load_modernbert_model(
+                model_path=local_path,
+                model_name="ModernBERT-Local"
             )
             if model is not None:
+                self.modernbert_models.append(model)

+        # Load from the URLs
+        for i, url in enumerate(self.modernbert_urls[:max_modernbert - len(self.modernbert_models)]):
+            if len(self.modernbert_models) >= max_modernbert:
                 break
+
+            model = self.load_modernbert_model(
+                model_url=url,
+                model_name=f"ModernBERT-{i+1}"
             )
             if model is not None:
+                self.modernbert_models.append(model)
+
+        # Load the additional models
+        if load_additional:
+            logger.info("🎯 Loading additional AI detection models...")
+            for config in self.additional_model_configs:
+                self.load_additional_model(config)

+        # Check success
+        total_models = len(self.modernbert_models) + len(self.additional_models)
+        if total_models > 0:
             self.models_loaded = True
+            logger.info(f"✅ Loaded {len(self.modernbert_models)} ModernBERT + {len(self.additional_models)} additional models")
             return True
         else:
             logger.error("❌ No models could be loaded")
             return False

+    def classify_with_modernbert(self, text: str, model_index: int):
+        """Classify the text with a single ModernBERT model"""
         try:
+            if model_index >= len(self.modernbert_models):
+                return None
+
+            model = self.modernbert_models[model_index]
+            cleaned_text = clean_text(text)
+
+            inputs = self.modernbert_tokenizer(
                 cleaned_text,
                 return_tensors="pt",
                 truncation=True,
+                max_length=512,
                 padding=True
             ).to(device)
+
+            with torch.no_grad():
+                logits = model(**inputs).logits
+                probs = torch.softmax(logits[0], dim=0)
+
+            human_prob = probs[24].item()
+            ai_probs = probs.clone()
+            ai_probs[24] = 0
+            ai_total = ai_probs.sum().item()
+
+            total = human_prob + ai_total
+            if total > 0:
+                human_pct = (human_prob / total) * 100
+                ai_pct = (ai_total / total) * 100
+            else:
+                human_pct = ai_pct = 50
+
+            ai_model_idx = torch.argmax(ai_probs).item()
+
+            return {
+                "model_name": f"ModernBERT-{model_index+1}",
+                "human_score": round(human_pct, 2),
+                "ai_score": round(ai_pct, 2),
+                "predicted_model": label_mapping.get(ai_model_idx, "Unknown"),
+                "confidence": round(max(human_pct, ai_pct), 2)
+            }
         except Exception as e:
+            logger.error(f"Error in ModernBERT {model_index}: {e}")
+            return None
+
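The split above treats index 24 as the Human class and pools the other 40 labels as AI mass; a dummy-distribution walk-through of the arithmetic:

    import torch
    probs = torch.full((41,), 0.02)  # 40 AI labels at 2% each
    probs[24] = 0.20                 # Human label at 20%
    human = probs[24].item()
    ai = probs.clone()
    ai[24] = 0
    total = human + ai.sum().item()
    print(round(human / total * 100, 2), round(ai.sum().item() / total * 100, 2))
    # 20.0 80.0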
+    def classify_with_additional(self, text: str, model_name: str):
+        """Classify the text with one of the additional models"""
+        try:
+            if model_name not in self.additional_models:
+                return None

+            model = self.additional_models[model_name]

+            # Pipelines were stored without a separate tokenizer; use that to tell them apart
+            if model_name not in self.additional_tokenizers:
+                # It's a pipeline
+                result = model(text, truncation=True, max_length=512)
+
+                # Parse results based on the model's output format
+                ai_score = 0
+                human_score = 0
+
+                for item in result:
+                    label = item['label'].lower()
+                    score = item['score'] * 100
+
+                    if 'fake' in label or 'ai' in label or 'gpt' in label:
+                        ai_score = max(ai_score, score)
+                    elif 'real' in label or 'human' in label:
+                        human_score = max(human_score, score)
+
+                # Normalize if needed
+                if ai_score == 0 and human_score == 0:
+                    ai_score = human_score = 50
+
+                return {
+                    "model_name": model_name,
+                    "human_score": round(human_score, 2),
+                    "ai_score": round(ai_score, 2),
+                    "predicted_model": "AI" if ai_score > human_score else "Human",
+                    "confidence": round(max(ai_score, human_score), 2)
+                }
+            else:
+                # It's a raw model; use its tokenizer
+                tokenizer = self.additional_tokenizers.get(model_name)
+                if tokenizer is None:
+                    return None
+
+                inputs = tokenizer(
+                    text,
+                    return_tensors="pt",
+                    truncation=True,
+                    max_length=512,
+                    padding=True
+                ).to(device)
+
+                with torch.no_grad():
+                    outputs = model(**inputs)
+                    probs = torch.softmax(outputs.logits[0], dim=0)
+
+                # Assuming binary classification (AI vs Human)
+                if len(probs) == 2:
+                    human_score = probs[0].item() * 100
+                    ai_score = probs[1].item() * 100
+                else:
+                    # Handle multi-class
+                    ai_score = human_score = 50
+
+                return {
+                    "model_name": model_name,
+                    "human_score": round(human_score, 2),
+                    "ai_score": round(ai_score, 2),
+                    "predicted_model": "AI" if ai_score > human_score else "Human",
+                    "confidence": round(max(ai_score, human_score), 2)
+                }
+
+        except Exception as e:
+            logger.warning(f"Error in {model_name}: {e}")
+            return None
+
+    def comprehensive_analysis(self, text: str):
+        """Comprehensive analysis using every model and metric"""
+        if not self.models_loaded:
+            raise ValueError("No models loaded")

+        results = {
+            "individual_models": [],
+            "ensemble_result": {},
+            "metrics": {},
+            "pattern_analysis": {}
+        }

+        # 1. Calculate text metrics
+        logger.info("📊 Calculating text metrics...")
+        results["metrics"] = {
+            "perplexity": self.metrics.calculate_perplexity(text),
+            "burstiness": self.metrics.calculate_burstiness(text),
+            "vocabulary_diversity": self.metrics.calculate_vocabulary_diversity(text),
+            "text_length": len(text.split()),
+            "sentence_count": len(re.split(r'[.!?]+', text))
+        }

+        # 2. Pattern detection
+        results["pattern_analysis"] = {
+            "ai_patterns_found": self.metrics.detect_ai_patterns(text),
+            "human_patterns_found": self.metrics.detect_human_patterns(text)
+        }

+        # 3. Run the ModernBERT models
+        modernbert_results = []
+        for i in range(len(self.modernbert_models)):
+            result = self.classify_with_modernbert(text, i)
+            if result:
+                results["individual_models"].append(result)
+                modernbert_results.append(result)
+
+        # 4. Run the additional models
+        for model_name in self.additional_models.keys():
+            result = self.classify_with_additional(text, model_name)
+            if result:
+                results["individual_models"].append(result)
+
+        # 5. Calculate the ensemble result (weighted average)
+        if results["individual_models"]:
+            total_ai = 0
+            total_human = 0
+            weights_sum = 0
+
+            for i, result in enumerate(results["individual_models"]):
+                # Give the ModernBERT models a higher weight
+                weight = 1.5 if i < len(modernbert_results) else 1.0
+                total_ai += result["ai_score"] * weight
+                total_human += result["human_score"] * weight
+                weights_sum += weight
+
+            if weights_sum > 0:
+                ensemble_ai = total_ai / weights_sum
+                ensemble_human = total_human / weights_sum
+            else:
+                ensemble_ai = ensemble_human = 50
+
+            # Adjust based on the metrics:
+            # high perplexity suggests human text
+            if results["metrics"]["perplexity"] > 100:
+                ensemble_human += 5
+                ensemble_ai -= 5
+            elif results["metrics"]["perplexity"] < 30:
+                ensemble_ai += 5
+                ensemble_human -= 5
+
+            # High burstiness suggests human text
+            if results["metrics"]["burstiness"] > 0.8:
+                ensemble_human += 5
+                ensemble_ai -= 5
+            elif results["metrics"]["burstiness"] < 0.3:
+                ensemble_ai += 5
+                ensemble_human -= 5
+
+            # Pattern analysis adjustment
+            pattern_adjustment = (results["pattern_analysis"]["ai_patterns_found"] -
+                                  results["pattern_analysis"]["human_patterns_found"]) * 3
+            ensemble_ai += pattern_adjustment
+            ensemble_human -= pattern_adjustment
+
+            # Normalize to 100%
+            total = ensemble_ai + ensemble_human
+            if total > 0:
+                ensemble_ai = (ensemble_ai / total) * 100
+                ensemble_human = (ensemble_human / total) * 100
+
+            # Determine the most likely AI model
+            if ensemble_ai > ensemble_human and modernbert_results:
+                predicted_model = modernbert_results[0]["predicted_model"]
+            else:
+                predicted_model = "Human"
+
+            results["ensemble_result"] = {
+                "ai_percentage": round(min(max(ensemble_ai, 0), 100), 2),
+                "human_percentage": round(min(max(ensemble_human, 0), 100), 2),
+                "predicted_model": predicted_model,
+                "confidence": round(max(ensemble_ai, ensemble_human), 2),
+                "is_human": ensemble_human > ensemble_ai,
+                "models_used": len(results["individual_models"])
+            }

+        return results

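Working the step-5 weighted average by hand, with two ModernBERT scores at weight 1.5 and one extra detector at weight 1.0, before the metric and pattern adjustments:

    scores = [(80.0, 1.5), (70.0, 1.5), (40.0, 1.0)]  # (ai_score, weight)
    ensemble_ai = sum(s * w for s, w in scores) / sum(w for _, w in scores)
    print(round(ensemble_ai, 2))  # 66.25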
 # =====================================================
 # 🧹 Cleaning and processing functions

 # 🌐 FastAPI Application
 # =====================================================
 app = FastAPI(
+    title="Enhanced ModernBERT AI Detector",
+    description="Advanced AI detection with multiple models, perplexity, and burstiness analysis",
+    version="3.0.0"
 )

+# Add CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )

+# Create the enhanced model manager
+model_manager = EnhancedModelManager()

 # =====================================================
 # 📝 Data models (Pydantic Models)
 
 class TextInput(BaseModel):
     text: str
     analyze_paragraphs: Optional[bool] = False
+    return_individual_scores: Optional[bool] = True

 class SimpleTextInput(BaseModel):
     text: str

+class EnhancedDetectionResult(BaseModel):
     success: bool
     code: int
     message: str

 async def startup_event():
     """Load the models at startup"""
     logger.info("=" * 50)
+    logger.info("🚀 Starting Enhanced ModernBERT AI Detector...")
     logger.info(f"🐍 Python version: {sys.version}")
     logger.info(f"🔥 PyTorch version: {torch.__version__}")
     logger.info("=" * 50)

+    # Load the models
+    max_modernbert = int(os.environ.get("MAX_MODERNBERT_MODELS", "2"))
+    load_additional = os.environ.get("LOAD_ADDITIONAL_MODELS", "true").lower() == "true"
+
+    success = model_manager.load_all_models(
+        max_modernbert=max_modernbert,
+        load_additional=load_additional
+    )

     if success:
+        logger.info("✅ Application ready with enhanced features!")
     else:
         logger.error("⚠️ Failed to load models - API will return errors")

 @app.get("/")
 async def root():
     """Landing page"""
+    models_info = {
+        "modernbert_models": len(model_manager.modernbert_models),
+        "additional_models": list(model_manager.additional_models.keys())
+    }
+
     return {
+        "message": "Enhanced ModernBERT AI Text Detector API",
         "status": "online" if model_manager.models_loaded else "initializing",
+        "models": models_info,
         "device": str(device),
+        "features": [
+            "Multiple AI detection models",
+            "Perplexity analysis",
+            "Burstiness analysis",
+            "Pattern detection",
+            "Individual model scores",
+            "Ensemble predictions"
+        ],
         "endpoints": {
             "analyze": "/analyze",
             "simple": "/analyze-simple",

     return {
         "status": "healthy" if model_manager.models_loaded else "unhealthy",
+        "modernbert_models": len(model_manager.modernbert_models),
+        "additional_models": len(model_manager.additional_models),
+        "total_models": len(model_manager.modernbert_models) + len(model_manager.additional_models),
         "device": str(device),
         "cuda_available": torch.cuda.is_available(),
         "memory_info": memory_info
     }

+@app.post("/analyze", response_model=EnhancedDetectionResult)
+async def analyze_text_enhanced(data: TextInput):
     """
+    Enhanced analysis with multiple models and metrics
     """
     try:
+        # Validate the input
         text = data.text.strip()
         if not text:
+            return EnhancedDetectionResult(
                 success=False,
                 code=400,
                 message="Empty input text",
                 data={}
             )

+        # Ensure the models are loaded
         if not model_manager.models_loaded:
+            if not model_manager.load_all_models():
+                return EnhancedDetectionResult(
                     success=False,
                     code=503,
+                    message="Models not available",
                     data={}
                 )

+        # Comprehensive analysis
+        analysis_result = model_manager.comprehensive_analysis(text)

+        # Basic stats
+        total_words = len(text.split())
+        ai_percentage = analysis_result["ensemble_result"]["ai_percentage"]
+        human_percentage = analysis_result["ensemble_result"]["human_percentage"]
         ai_words = int(total_words * (ai_percentage / 100))

+        # Paragraph analysis, if requested
         paragraphs_analysis = []
+        if data.analyze_paragraphs:
             paragraphs = split_into_paragraphs(text)
+            for para in paragraphs[:10]:
                 if para.strip():
                     try:
+                        para_result = model_manager.comprehensive_analysis(para)
                         para_words = len(para.split())

                         paragraphs_analysis.append({
                             "paragraph": para[:200] + "..." if len(para) > 200 else para,
+                            "ai_generated_score": para_result["ensemble_result"]["ai_percentage"] / 100,
+                            "human_written_score": para_result["ensemble_result"]["human_percentage"] / 100,
+                            "predicted_model": para_result["ensemble_result"]["predicted_model"],
+                            "metrics": {
+                                "perplexity": para_result["metrics"]["perplexity"],
+                                "burstiness": para_result["metrics"]["burstiness"]
+                            }
                         })
                     except Exception as e:
                         logger.warning(f"Failed to analyze paragraph: {e}")
+
+        # Prepare the response
+        response_data = {
+            "fakePercentage": ai_percentage,
+            "isHuman": human_percentage,
+            "textWords": total_words,
+            "aiWords": ai_words,
+            "predicted_model": analysis_result["ensemble_result"]["predicted_model"],
+            "feedback": "Most of Your Text is AI/GPT Generated" if ai_percentage > 50 else "Most of Your Text Appears Human-Written",
+            "confidence": analysis_result["ensemble_result"]["confidence"],
+            "models_used": analysis_result["ensemble_result"]["models_used"],
+
+            # New: metrics
+            "metrics": analysis_result["metrics"],
+
+            # New: pattern analysis
+            "pattern_analysis": analysis_result["pattern_analysis"],

+            # Paragraphs, if requested
+            "paragraphs": paragraphs_analysis,
+
+            # Text preview
+            "input_text": text[:500] + "..." if len(text) > 500 else text,
+            "detected_language": "en"
+        }

+        # Add the individual model scores if requested
+        if data.return_individual_scores:
+            response_data["individual_models"] = analysis_result["individual_models"]

+        return EnhancedDetectionResult(
             success=True,
             code=200,
+            message="Enhanced analysis completed",
+            data=response_data
         )

     except Exception as e:
         logger.error(f"Analysis error: {e}", exc_info=True)
+        return EnhancedDetectionResult(
             success=False,
             code=500,
             message=f"Analysis failed: {str(e)}",
 
 @app.post("/analyze-simple")
 async def analyze_simple(data: SimpleTextInput):
     """
+    Simple analysis - returns only the basic results
     """
     try:
         text = data.text.strip()
         raise HTTPException(status_code=400, detail="Empty text")

         if not model_manager.models_loaded:
+            if not model_manager.load_all_models():
             raise HTTPException(status_code=503, detail="Models not available")

+        result = model_manager.comprehensive_analysis(text)
+        ensemble = result["ensemble_result"]

         return {
+            "is_ai": ensemble["ai_percentage"] > 50,
+            "ai_score": ensemble["ai_percentage"],
+            "human_score": ensemble["human_percentage"],
+            "detected_model": ensemble["predicted_model"],
+            "confidence": ensemble["confidence"],
+            "perplexity": result["metrics"]["perplexity"],
+            "burstiness": result["metrics"]["burstiness"]
         }

     except HTTPException:

 if __name__ == "__main__":
     import uvicorn

     port = int(os.environ.get("PORT", 8000))
     host = os.environ.get("HOST", "0.0.0.0")
     workers = int(os.environ.get("WORKERS", 1))

     logger.info("=" * 50)
+    logger.info(f"🌐 Starting enhanced server on {host}:{port}")
     logger.info(f"👷 Workers: {workers}")
     logger.info(f"📚 Documentation: http://{host}:{port}/docs")
     logger.info("=" * 50)

     uvicorn.run(
+        "app:app",  # this file is app.py, so the import string is "app:app"
         host=host,
         port=port,
+        reload=False,
         workers=workers,
+        log_level="info"
     )
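Once the server is up, the enhanced endpoint can be exercised with a few lines of client code (assumes the `requests` package and the default host/port):

    import requests
    resp = requests.post(
        "http://localhost:8000/analyze",
        json={"text": "Sample text to check.", "analyze_paragraphs": False},
    )
    body = resp.json()
    print(body["data"]["fakePercentage"], body["data"]["metrics"]["burstiness"])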