mahmoudsaber0 committed on
Commit 6101878 · verified · Parent(s): 97cd85d

Update app.py

Files changed (1)
  1. app.py +623 -337
app.py CHANGED
@@ -4,43 +4,21 @@ import torch
 import logging
 import gc
 import sys
-import pwd  # Added for monkey patch
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import Dict, List, Optional
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from tokenizers.normalizers import Sequence, Replace, Strip
 from tokenizers import Regex
-from huggingface_hub import hf_hub_download  # Added for reliable HF downloads
-
-# =====================================================
-# 🛠️ Monkey Patch for Docker/Container UID Issue
-# =====================================================
-# Fix for 'getpwuid(): uid not found: 1000' in containerized environments
-def patched_getpwuid(uid_num):
-    try:
-        return original_getpwuid(uid_num)
-    except KeyError:
-        if uid_num == os.getuid():
-            # Create a fake user entry (struct_passwd takes a 7-field sequence)
-            return pwd.struct_passwd((
-                'dockeruser',   # pw_name
-                'x',            # pw_passwd
-                uid_num,        # pw_uid
-                os.getgid(),    # pw_gid
-                'Docker User',  # pw_gecos
-                '/tmp',         # pw_dir
-                '/bin/sh'       # pw_shell
-            ))
-        raise
-
-original_getpwuid = pwd.getpwuid
-pwd.getpwuid = patched_getpwuid
-
-# Set fallback env vars to avoid user-dependent paths
-os.environ.setdefault('HOME', '/tmp')
-os.environ.setdefault('USER', 'dockeruser')
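The removed patch works by wrapping `pwd.getpwuid` so an unmapped container UID resolves to a synthetic entry instead of raising `KeyError`. A minimal sketch of how it could be verified, assuming the patch above has already run:

    import os
    import pwd
    # Inside a container with an unmapped UID this would normally raise KeyError;
    # with the patch it returns the synthetic 'dockeruser' entry instead.
    entry = pwd.getpwuid(os.getuid())
    print(entry.pw_name, entry.pw_dir)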
 # =====================================================
 # 🔧 Environment configuration and settings
@@ -55,14 +33,15 @@ logger = logging.getLogger(__name__)
 CACHE_DIR = "/tmp/huggingface_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)

-# Hugging Face environment variables (removed TRANSFORMERS_CACHE to avoid its deprecation warning)
 os.environ.update({
     "HF_HOME": CACHE_DIR,
     "HF_DATASETS_CACHE": CACHE_DIR,
     "HUGGINGFACE_HUB_CACHE": CACHE_DIR,
     "TORCH_HOME": CACHE_DIR,
-    "TOKENIZERS_PARALLELISM": "false",  # prevent threading issues
-    "TRANSFORMERS_OFFLINE": "0",  # allow downloading from the internet
 })

 # PyTorch memory settings
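These variables redirect every Hugging Face cache into a world-writable directory instead of the user's home. A minimal sketch of the effect (assumes huggingface_hub is installed; note the variables must be set before the library is imported):

    import os
    os.environ["HF_HOME"] = "/tmp/huggingface_cache"  # must precede the import
    from huggingface_hub import hf_hub_download
    # The downloaded file is cached under /tmp/huggingface_cache.
    path = hf_hub_download(repo_id="answerdotai/ModernBERT-base", filename="config.json")
    print(path)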
@@ -96,282 +75,574 @@ label_mapping = {
 }

 # =====================================================
-# 🤖 Model Manager
 # =====================================================
-class ModelManager:
     def __init__(self):
-        self.tokenizer = None
-        self.models = []
         self.models_loaded = False
-        self.model_urls = [
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
         ]
-        self.base_model_id = "answerdotai/ModernBERT-base"  # Primary
-        self.fallback_model_id = "bert-base-uncased"  # Fallback if ModernBERT fails
-        self.using_fallback = False

-    def load_tokenizer(self):
-        """Load the tokenizer, with a fallback"""
         try:
-            logger.info(f"📝 Loading tokenizer from {self.base_model_id}...")
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.base_model_id,
                 cache_dir=CACHE_DIR,
                 use_fast=True,
                 trust_remote_code=False
             )
-            logger.info("✅ Primary tokenizer loaded successfully")

-        except Exception as e:
-            logger.warning(f"⚠️ Failed to load primary tokenizer: {e}")
             try:
-                logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    self.fallback_model_id,
-                    cache_dir=CACHE_DIR,
-                    use_fast=True,
-                    trust_remote_code=False
-                )
-                self.using_fallback = True
-                logger.info("✅ Fallback tokenizer loaded successfully")
-            except Exception as fallback_e:
-                logger.error(f"❌ Failed to load fallback tokenizer: {fallback_e}")
-                return False
-
-        # Set up the text normalizer
-        try:
-            newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
-            join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
-            self.tokenizer.backend_tokenizer.normalizer = Sequence([
-                self.tokenizer.backend_tokenizer.normalizer,
-                join_hyphen_break,
-                newline_to_space,
-                Strip()
-            ])
         except Exception as e:
-            logger.warning(f"⚠️ Could not set custom normalizer: {e}")
-
-        return True

-    def load_single_model(self, model_url=None, model_path=None, model_name="Model"):
-        """Load a single model, with fallback and comprehensive error handling"""
-        base_model = None
         try:
-            logger.info(f"🤖 Loading base {model_name} from {self.base_model_id}...")

-            # Try the primary base model first
             base_model = AutoModelForSequenceClassification.from_pretrained(
-                self.base_model_id,
                 num_labels=41,
                 cache_dir=CACHE_DIR,
-                dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # Updated from torch_dtype
                 low_cpu_mem_usage=True,
                 trust_remote_code=False
             )
-            logger.info("✅ Primary base model loaded")

-        except Exception as e:
-            logger.warning(f"⚠️ Failed to load primary base model: {e}")
-            try:
-                logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
-                base_model = AutoModelForSequenceClassification.from_pretrained(
-                    self.fallback_model_id,
-                    num_labels=41,
-                    cache_dir=CACHE_DIR,
-                    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # Updated from torch_dtype
-                    low_cpu_mem_usage=True,
-                    trust_remote_code=False
-                )
-                self.using_fallback = True
-                logger.info("✅ Fallback base model loaded (note: weights may not be compatible)")
-            except Exception as fallback_e:
-                logger.error(f"❌ Failed to load fallback base model: {fallback_e}")
-                return None
-
-        # Try to load the weights (only when not in fallback mode, or when compatible)
-        try:
             if model_path and os.path.exists(model_path):
                 logger.info(f"📁 Loading from local file: {model_path}")
                 state_dict = torch.load(model_path, map_location=device, weights_only=True)
                 base_model.load_state_dict(state_dict, strict=False)
             elif model_url:
-                # Use hf_hub_download instead of torch.hub for HF repos
-                logger.info(f"🌐 Downloading weights from HF repo...")
-                repo_id = "mihalykiss/modernbert_2"
-                filename = model_url.split('/')[-1]  # Extract a filename like "Model_groups_3class_seed12"
-                pt_file = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=filename,
-                    cache_dir=CACHE_DIR,
-                    local_dir_use_symlinks=False
                 )
-                state_dict = torch.load(pt_file, map_location=device, weights_only=True)

-                # Load the weights only when not in fallback mode (ModernBERT weights may not fit standard BERT)
-                if not self.using_fallback:
-                    base_model.load_state_dict(state_dict, strict=False)
-                    logger.info("✅ Weights loaded successfully")
-                else:
-                    logger.warning("⚠️ Skipping weight load in fallback mode (incompatible architecture)")
-            else:
-                logger.info("📊 Using model with random initialization")
-        except Exception as weight_error:
-            logger.warning(f"⚠️ Could not load weights: {weight_error}")
-            logger.info("📊 Continuing with base model (random or pre-trained init)")
-
-        # Move the model to the right device
-        model = base_model.to(device)
-        model.eval()
-
-        # Free memory
-        if 'state_dict' in locals():
-            del state_dict
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-
-        logger.info(f"✅ {model_name} loaded successfully (fallback: {self.using_fallback})")
-        return model
-
-    def load_models(self, max_models=3):  # Default raised to 3 to load the local file plus 2 URLs
-        """Load the models, with a memory cap"""
         if self.models_loaded:
             logger.info("✨ Models already loaded")
             return True

-        # Load the tokenizer first
-        if not self.load_tokenizer():
-            logger.error("❌ Tokenizer load failed - cannot proceed")
             return False

-        # Load the models
-        logger.info(f"🚀 Loading up to {max_models} models...")

-        # Try the local file first
-        local_model_path = "modernbert.bin"
-        if os.path.exists(local_model_path):
-            model = self.load_single_model(
-                model_path=local_model_path,
-                model_name="Model 1 (Local)"
             )
             if model is not None:
-                self.models.append(model)

-        # Load models from the URLs (filenames are extracted downstream)
-        for i, full_url in enumerate(self.model_urls[:max_models - len(self.models)]):
-            if len(self.models) >= max_models:
                 break
-
-            # Pass full_url as-is; load_single_model extracts the filename
-            model = self.load_single_model(
-                model_url=full_url,
-                model_name=f"Model {len(self.models) + 1}"
             )
             if model is not None:
-                self.models.append(model)
-
-            # Check available memory
-            if torch.cuda.is_available():
-                mem_allocated = torch.cuda.memory_allocated() / 1024**3
-                mem_reserved = torch.cuda.memory_reserved() / 1024**3
-                logger.info(f"💾 GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
-
-                # Stop loading if memory is full
-                if mem_allocated > 6:  # 6GB cap
-                    logger.warning("⚠️ Memory limit reached, stopping model loading")
-                    break

-        # Verify that loading succeeded
-        if len(self.models) > 0:
             self.models_loaded = True
-            logger.info(f"✅ Successfully loaded {len(self.models)} models (using fallback: {self.using_fallback})")
             return True
         else:
             logger.error("❌ No models could be loaded")
             return False

-    def classify_text(self, text: str) -> Dict:
-        """Analyze the text with the loaded models"""
-        if not self.models_loaded or len(self.models) == 0:
-            raise ValueError("No models loaded")
-
-        # Clean the text
-        cleaned_text = clean_text(text)
-        if not cleaned_text.strip():
-            raise ValueError("Empty text after cleaning")
-
-        # Tokenization (max_length adjusted for fallback BERT if needed)
-        max_len = 512 if not self.using_fallback else 512  # BERT max is 512
         try:
-            inputs = self.tokenizer(
                 cleaned_text,
                 return_tensors="pt",
                 truncation=True,
-                max_length=max_len,
                 padding=True
             ).to(device)
         except Exception as e:
-            logger.error(f"Tokenization error: {e}")
-            raise ValueError(f"Failed to tokenize text: {e}")
-
-        # Collect the predictions
-        all_probabilities = []
-
-        with torch.no_grad():
-            for i, model in enumerate(self.models):
-                try:
-                    logits = model(**inputs).logits
-                    probs = torch.softmax(logits, dim=1)
-                    all_probabilities.append(probs)
-                except Exception as e:
-                    logger.warning(f"Model {i+1} prediction failed: {e}")
-                    continue

-        if not all_probabilities:
-            raise ValueError("All models failed to make predictions")

-        # Average the distributions (soft voting)
-        averaged_probs = torch.mean(torch.stack(all_probabilities), dim=0)
-        probabilities = averaged_probs[0]

-        # Compute the Human vs AI percentages
-        human_prob = probabilities[24].item()
-        ai_probs = probabilities.clone()
-        ai_probs[24] = 0  # zero out the Human probability
-        ai_total_prob = ai_probs.sum().item()

-        # Normalize
-        total = human_prob + ai_total_prob
-        if total > 0:
-            human_percentage = (human_prob / total) * 100
-            ai_percentage = (ai_total_prob / total) * 100
-        else:
-            human_percentage = 50
-            ai_percentage = 50

-        # Identify the most likely model
-        ai_model_idx = torch.argmax(ai_probs).item()
-        predicted_model = label_mapping.get(ai_model_idx, "Unknown")

-        # Top 5 predictions
-        top_5_probs, top_5_indices = torch.topk(probabilities, 5)
-        top_5_results = []
-        for prob, idx in zip(top_5_probs, top_5_indices):
-            top_5_results.append({
-                "model": label_mapping.get(idx.item(), "Unknown"),
-                "probability": round(prob.item() * 100, 2)
-            })

-        return {
-            "human_percentage": round(human_percentage, 2),
-            "ai_percentage": round(ai_percentage, 2),
-            "predicted_model": predicted_model,
-            "top_5_predictions": top_5_results,
-            "is_human": human_percentage > ai_percentage,
-            "models_used": len(all_probabilities),
-            "using_fallback": self.using_fallback
-        }

 # =====================================================
 # 🧹 Cleaning and processing functions
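The soft-voting step removed above is worth spelling out: each model contributes a full probability distribution and the ensemble averages them before thresholding. A self-contained sketch with dummy two-model outputs:

    import torch
    # Two models' softmax outputs over the same classes (dummy values).
    p1 = torch.tensor([[0.2, 0.8]])
    p2 = torch.tensor([[0.4, 0.6]])
    # Soft voting: average the distributions, then read off the winner.
    averaged = torch.mean(torch.stack([p1, p2]), dim=0)
    print(averaged)  # tensor([[0.3000, 0.7000]])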
@@ -391,12 +662,12 @@ def split_into_paragraphs(text: str) -> List[str]:
 # 🌐 FastAPI Application
 # =====================================================
 app = FastAPI(
-    title="ModernBERT AI Text Detector",
-    description="Detect AI-generated text",
-    version="2.3.0"  # Version bump for 3 models and deprecation fixes
 )

-# Add CORS to allow use from the browser
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -405,8 +676,8 @@ app.add_middleware(
     allow_headers=["*"],
 )

-# Create the model manager
-model_manager = ModelManager()

 # =====================================================
 # 📝 Data models (Pydantic Models)
@@ -414,11 +685,12 @@ model_manager = ModelManager()
 class TextInput(BaseModel):
     text: str
     analyze_paragraphs: Optional[bool] = False

 class SimpleTextInput(BaseModel):
     text: str

-class DetectionResult(BaseModel):
     success: bool
     code: int
     message: str
@@ -431,33 +703,46 @@ class DetectionResult(BaseModel):
 async def startup_event():
     """Load the models at startup"""
     logger.info("=" * 50)
-    logger.info("🚀 Starting ModernBERT AI Detector...")
     logger.info(f"🐍 Python version: {sys.version}")
     logger.info(f"🔥 PyTorch version: {torch.__version__}")
-    import transformers
-    logger.info(f"🔧 Transformers version: {transformers.__version__}")
-    logger.info("🛡️ UID Monkey Patch Applied (for Docker/Container)")
     logger.info("=" * 50)

-    # Try loading the models
-    max_models = int(os.environ.get("MAX_MODELS", "3"))  # Default raised to 3
-    success = model_manager.load_models(max_models=max_models)

     if success:
-        logger.info(f"✅ Application ready! (Fallback mode: {model_manager.using_fallback})")
     else:
         logger.error("⚠️ Failed to load models - API will return errors")
-        logger.info("💡 Tip: Ensure 'transformers>=4.45.0' and 'huggingface_hub' are installed. Run: pip install --upgrade transformers huggingface_hub")

 @app.get("/")
 async def root():
     """Landing page"""
     return {
-        "message": "ModernBERT AI Text Detector API",
         "status": "online" if model_manager.models_loaded else "initializing",
-        "models_loaded": len(model_manager.models),
-        "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "endpoints": {
             "analyze": "/analyze",
             "simple": "/analyze-simple",
@@ -478,112 +763,111 @@ async def health_check():

     return {
         "status": "healthy" if model_manager.models_loaded else "unhealthy",
-        "models_loaded": len(model_manager.models),
-        "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "cuda_available": torch.cuda.is_available(),
         "memory_info": memory_info
     }

-@app.post("/analyze", response_model=DetectionResult)
-async def analyze_text(data: TextInput):
     """
-    Analyze the text for AI detection
-    Mirrors the Gradio classify_text behaviour
     """
     try:
-        # Validate the text
         text = data.text.strip()
         if not text:
-            return DetectionResult(
                 success=False,
                 code=400,
                 message="Empty input text",
                 data={}
             )

-        # Make sure the models are loaded
         if not model_manager.models_loaded:
-            # Try loading the models
-            if not model_manager.load_models():
-                return DetectionResult(
                     success=False,
                     code=503,
-                    message="Models not available. Check logs for details.",
                     data={}
                 )

-        # Count the words
-        total_words = len(text.split())
-
-        # Base analysis
-        result = model_manager.classify_text(text)

-        # Base results
-        ai_percentage = result["ai_percentage"]
-        human_percentage = result["human_percentage"]
         ai_words = int(total_words * (ai_percentage / 100))

-        # Paragraph analysis, if requested
         paragraphs_analysis = []
-        if data.analyze_paragraphs and ai_percentage > 50:
             paragraphs = split_into_paragraphs(text)
-            recalc_ai_words = 0
-            recalc_total_words = 0
-
-            for para in paragraphs[:10]:  # cap at 10 paragraphs
                 if para.strip():
                     try:
-                        para_result = model_manager.classify_text(para)
                         para_words = len(para.split())
-                        recalc_total_words += para_words
-                        recalc_ai_words += para_words * (para_result["ai_percentage"] / 100)

                         paragraphs_analysis.append({
                             "paragraph": para[:200] + "..." if len(para) > 200 else para,
-                            "ai_generated_score": para_result["ai_percentage"] / 100,
-                            "human_written_score": para_result["human_percentage"] / 100,
-                            "predicted_model": para_result["predicted_model"]
                         })
                     except Exception as e:
                         logger.warning(f"Failed to analyze paragraph: {e}")

-            # Recompute the percentages from the paragraphs
-            if recalc_total_words > 0:
-                ai_percentage = round((recalc_ai_words / recalc_total_words) * 100, 2)
-                human_percentage = round(100 - ai_percentage, 2)
-                ai_words = int(recalc_ai_words)

-        # Build the feedback message
-        if ai_percentage > 50:
-            feedback = "Most of Your Text is AI/GPT Generated"
-        else:
-            feedback = "Most of Your Text Appears Human-Written"

-        # Return the results in the original response format
-        return DetectionResult(
             success=True,
             code=200,
-            message="analysis completed",
-            data={
-                "fakePercentage": ai_percentage,
-                "isHuman": human_percentage,
-                "textWords": total_words,
-                "aiWords": ai_words,
-                "paragraphs": paragraphs_analysis,
-                "predicted_model": result["predicted_model"],
-                "feedback": feedback,
-                "input_text": text[:500] + "..." if len(text) > 500 else text,
-                "detected_language": "en",
-                "top_5_predictions": result.get("top_5_predictions", []),
-                "models_used": result.get("models_used", 1),
-                "using_fallback": result.get("using_fallback", False)
-            }
         )

     except Exception as e:
         logger.error(f"Analysis error: {e}", exc_info=True)
-        return DetectionResult(
             success=False,
             code=500,
             message=f"Analysis failed: {str(e)}",
@@ -593,7 +877,7 @@ async def analyze_text(data: TextInput):
 @app.post("/analyze-simple")
 async def analyze_simple(data: SimpleTextInput):
     """
-    Simple analysis: returns only the basic results
     """
     try:
         text = data.text.strip()
@@ -601,18 +885,20 @@ async def analyze_simple(data: SimpleTextInput):
         raise HTTPException(status_code=400, detail="Empty text")

         if not model_manager.models_loaded:
-            if not model_manager.load_models():
             raise HTTPException(status_code=503, detail="Models not available")

-        result = model_manager.classify_text(text)

         return {
-            "is_ai": result["ai_percentage"] > 50,
-            "ai_score": result["ai_percentage"],
-            "human_score": result["human_percentage"],
-            "detected_model": result["predicted_model"] if result["ai_percentage"] > 50 else None,
-            "confidence": max(result["ai_percentage"], result["human_percentage"]),
-            "using_fallback": result.get("using_fallback", False)
         }

     except HTTPException:
@@ -627,21 +913,21 @@ async def analyze_simple(data: SimpleTextInput):
 if __name__ == "__main__":
     import uvicorn

-    # Read the settings from the environment
     port = int(os.environ.get("PORT", 8000))
     host = os.environ.get("HOST", "0.0.0.0")
     workers = int(os.environ.get("WORKERS", 1))

     logger.info("=" * 50)
-    logger.info(f"🌐 Starting server on {host}:{port}")
     logger.info(f"👷 Workers: {workers}")
     logger.info(f"📚 Documentation: http://{host}:{port}/docs")
     logger.info("=" * 50)

     uvicorn.run(
-        "main:app",  # Assuming this file is named main.py
         host=host,
         port=port,
         workers=workers,
-        reload=False  # Set to True for dev
     )
 import logging
 import gc
 import sys
+import numpy as np
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import Dict, List, Optional
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    AutoModelForCausalLM,
+    pipeline
+)
 from tokenizers.normalizers import Sequence, Replace, Strip
 from tokenizers import Regex
+import math
+from collections import Counter

 # =====================================================
 # 🔧 Environment configuration and settings
 
 CACHE_DIR = "/tmp/huggingface_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)

+# Hugging Face environment variables
 os.environ.update({
     "HF_HOME": CACHE_DIR,
+    "TRANSFORMERS_CACHE": CACHE_DIR,
     "HF_DATASETS_CACHE": CACHE_DIR,
     "HUGGINGFACE_HUB_CACHE": CACHE_DIR,
     "TORCH_HOME": CACHE_DIR,
+    "TOKENIZERS_PARALLELISM": "false",
+    "TRANSFORMERS_OFFLINE": "0",
 })

 # PyTorch memory settings
 
 }

 # =====================================================
+# 📈 Perplexity and burstiness computations
+# =====================================================
+class TextMetrics:
+    """Statistical metrics for a text"""
+
+    @staticmethod
+    def calculate_perplexity(text: str, model=None, tokenizer=None):
+        """
+        Perplexity measures how "surprised" a model is by the text.
+        AI text usually has lower perplexity (it is more predictable).
+        """
+        try:
+            if model is None or tokenizer is None:
+                # Approximation based on word frequencies
+                words = text.lower().split()
+                word_freq = Counter(words)
+                total_words = len(words)
+
+                # Compute the entropy
+                entropy = 0
+                for count in word_freq.values():
+                    probability = count / total_words
+                    if probability > 0:
+                        entropy -= probability * math.log2(probability)
+
+                # Approximate the perplexity
+                perplexity = 2 ** entropy
+                return min(perplexity, 1000)  # Cap at 1000
+            else:
+                # Exact computation with a language model
+                inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+                with torch.no_grad():
+                    outputs = model(**inputs, labels=inputs["input_ids"])
+                loss = outputs.loss
+                perplexity = torch.exp(loss).item()
+                return min(perplexity, 1000)
+        except Exception as e:
+            logger.warning(f"Error calculating perplexity: {e}")
+            return 50.0  # Default value
+
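The fallback branch is a unigram-entropy approximation rather than true model perplexity; a worked example of the arithmetic it performs:

    import math
    from collections import Counter
    words = "the cat sat on the mat".lower().split()
    freq = Counter(words)
    n = len(words)
    # Unigram entropy H = -sum(p * log2(p)); perplexity = 2 ** H.
    H = -sum((c / n) * math.log2(c / n) for c in freq.values())
    print(round(2 ** H, 2))  # ~4.76; more repetition pushes this lower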
+    @staticmethod
+    def calculate_burstiness(text: str):
+        """
+        Burstiness measures the variation in sentence length.
+        Humans show higher burstiness (sentences of varied length);
+        AI is usually more uniform.
+        """
+        try:
+            # Split the text into sentences
+            sentences = re.split(r'[.!?]+', text)
+            sentences = [s.strip() for s in sentences if s.strip()]
+
+            if len(sentences) < 2:
+                return 0.0
+
+            # Compute each sentence's length
+            sentence_lengths = [len(s.split()) for s in sentences]
+
+            # Mean and standard deviation
+            mean_length = np.mean(sentence_lengths)
+            std_length = np.std(sentence_lengths)
+
+            # Burstiness = standard deviation / mean
+            if mean_length > 0:
+                burstiness = std_length / mean_length
+            else:
+                burstiness = 0.0
+
+            return round(burstiness, 4)
+        except Exception as e:
+            logger.warning(f"Error calculating burstiness: {e}")
+            return 0.5
+
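Burstiness as defined above is the coefficient of variation of sentence lengths; uniform lengths score near 0 (AI-like), mixed short and long sentences score higher (human-like). For example:

    import numpy as np
    for lengths in ([12, 12, 12, 12], [3, 22, 7, 15]):
        print(round(np.std(lengths) / np.mean(lengths), 4))
    # 0.0 for the uniform lengths, ~0.6236 for the varied ones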
+    @staticmethod
+    def calculate_vocabulary_diversity(text: str):
+        """
+        Vocabulary diversity: humans tend to use a more varied vocabulary.
+        """
+        words = text.lower().split()
+        unique_words = set(words)
+        if len(words) > 0:
+            diversity = len(unique_words) / len(words)
+        else:
+            diversity = 0
+        return round(diversity, 4)
+
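This is a plain type-token ratio, e.g.:

    words = "very very very unique words".lower().split()
    print(len(set(words)) / len(words))  # 0.6: three unique tokens out of five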
+    @staticmethod
+    def detect_ai_patterns(text: str):
+        """
+        Detect phrases that are common in AI-generated text
+        """
+        ai_patterns = [
+            r"it['\s]+s important to note",
+            r"in conclusion",
+            r"furthermore",
+            r"comprehensive understanding",
+            r"it is worth noting",
+            r"however, it should be noted",
+            r"on the other hand",
+            r"in summary",
+            r"to begin with",
+            r"first and foremost"
+        ]
+
+        pattern_count = 0
+        for pattern in ai_patterns:
+            if re.search(pattern, text.lower()):
+                pattern_count += 1
+
+        return pattern_count
+
+    @staticmethod
+    def detect_human_patterns(text: str):
+        """
+        Detect patterns that are common in human writing
+        """
+        human_patterns = [
+            r"kinda|sorta|gonna|wanna|gotta",
+            r"tbh|idk|lol|omg|btw",
+            r"!{2,}|\?{2,}|\.{3,}",
+            r"i think|i feel|i believe",
+            r"like,|you know,|i mean,",
+            r"anyway|anyhow|whatever"
+        ]
+
+        pattern_count = 0
+        for pattern in human_patterns:
+            if re.search(pattern, text.lower()):
+                pattern_count += 1
+
+        return pattern_count
+
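Both detectors score one point per matched pattern, regardless of how often it occurs. A self-contained illustration with two of the AI patterns listed above:

    import re
    ai_patterns = [r"in conclusion", r"it is worth noting"]
    sample = "In conclusion, it is worth noting that the method works."
    print(sum(bool(re.search(p, sample.lower())) for p in ai_patterns))  # 2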
+# =====================================================
+# 🤖 Enhanced Model Manager
 # =====================================================
+class EnhancedModelManager:
     def __init__(self):
+        self.modernbert_tokenizer = None
+        self.modernbert_models = []
+        self.additional_models = {}
+        self.additional_tokenizers = {}
         self.models_loaded = False
+        self.metrics = TextMetrics()
+
+        # ModernBERT URLs
+        self.modernbert_urls = [
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
         ]

+        # Additional models to try
+        self.additional_model_configs = [
+            {
+                "name": "chatgpt-detector-roberta",
+                "model_id": "Hello-SimpleAI/chatgpt-detector-roberta",
+                "type": "classification"
+            },
+            {
+                "name": "openai-detector",
+                "model_id": "roberta-base-openai-detector",
+                "type": "classification"
+            },
+            {
+                "name": "ai-content-detector",
+                "model_id": "PirateXX/AI-Content-Detector",
+                "type": "classification"
+            }
+        ]
+
+    def load_modernbert_tokenizer(self):
+        """Load the ModernBERT tokenizer"""
         try:
+            logger.info("📝 Loading ModernBERT tokenizer...")
+            self.modernbert_tokenizer = AutoTokenizer.from_pretrained(
+                "answerdotai/ModernBERT-base",
                 cache_dir=CACHE_DIR,
                 use_fast=True,
                 trust_remote_code=False
             )

+            # Set up the text normalizer
             try:
+                newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
+                join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
+                self.modernbert_tokenizer.backend_tokenizer.normalizer = Sequence([
+                    self.modernbert_tokenizer.backend_tokenizer.normalizer,
+                    join_hyphen_break,
+                    newline_to_space,
+                    Strip()
+                ])
+            except Exception as e:
+                logger.warning(f"⚠️ Could not set custom normalizer: {e}")
+
+            logger.info("✅ ModernBERT tokenizer loaded")
+            return True
         except Exception as e:
+            logger.error(f"❌ Failed to load tokenizer: {e}")
+            return False
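For reference, a rough plain-`re` equivalent of what the chained normalizer does to hyphenated line breaks and newlines (illustration only; the real work happens inside the tokenizer):

    import re
    raw = "trans-\nformers are state of\nthe art"
    s = re.sub(r'(\w+)[--]\s*\n\s*(\w+)', r'\1\2', raw)  # rejoin hyphen breaks
    s = re.sub(r'\s*\n\s*', ' ', s).strip()              # newlines -> spaces
    print(s)  # 'transformers are state of the art'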
 
+    def load_modernbert_model(self, model_url=None, model_path=None, model_name="ModernBERT"):
+        """Load a single ModernBERT model"""
         try:
+            logger.info(f"🤖 Loading {model_name}...")

             base_model = AutoModelForSequenceClassification.from_pretrained(
+                "answerdotai/ModernBERT-base",
                 num_labels=41,
                 cache_dir=CACHE_DIR,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                 low_cpu_mem_usage=True,
                 trust_remote_code=False
             )

             if model_path and os.path.exists(model_path):
                 logger.info(f"📁 Loading from local file: {model_path}")
                 state_dict = torch.load(model_path, map_location=device, weights_only=True)
                 base_model.load_state_dict(state_dict, strict=False)
             elif model_url:
+                logger.info(f"🌐 Downloading weights from URL...")
+                try:
+                    state_dict = torch.hub.load_state_dict_from_url(
+                        model_url,
+                        map_location=device,
+                        progress=True,
+                        check_hash=False,
+                        file_name=f"{model_name}.pt"
+                    )
+                    base_model.load_state_dict(state_dict, strict=False)
+                except Exception as e:
+                    logger.warning(f"⚠️ Could not load weights: {e}")
+                    logger.info("📊 Using model with random initialization")
+
+            model = base_model.to(device)
+            model.eval()
+
+            if 'state_dict' in locals():
+                del state_dict
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            logger.info(f"✅ {model_name} loaded")
+            return model
+
+        except Exception as e:
+            logger.error(f"❌ Failed to load {model_name}: {e}")
+            return None
+
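`load_state_dict(strict=False)` above silently tolerates key mismatches; its return value lists what was skipped, which is worth logging when a checkpoint does not quite match the architecture. A tiny self-contained demonstration:

    import torch
    import torch.nn as nn
    model = nn.Linear(4, 2)
    ckpt = {"weight": torch.zeros(2, 4), "extra_key": torch.zeros(1)}  # no 'bias'
    result = model.load_state_dict(ckpt, strict=False)
    print(result.missing_keys, result.unexpected_keys)  # ['bias'] ['extra_key']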
+    def load_additional_model(self, model_config):
+        """Load an additional AI-detection model"""
+        try:
+            model_name = model_config["name"]
+            model_id = model_config["model_id"]
+
+            logger.info(f"🔧 Loading {model_name}...")
+
+            # Try loading as a pipeline first (easier)
+            try:
+                classifier = pipeline(
+                    "text-classification",
+                    model=model_id,
+                    device=0 if torch.cuda.is_available() else -1,
+                    model_kwargs={"cache_dir": CACHE_DIR}
                 )
+                self.additional_models[model_name] = classifier
+                logger.info(f"✅ {model_name} loaded as pipeline")
+                return True
+            except Exception:
+                # Try loading manually
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_id,
+                    cache_dir=CACHE_DIR
+                )
+                model = AutoModelForSequenceClassification.from_pretrained(
+                    model_id,
+                    cache_dir=CACHE_DIR,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+                ).to(device)
+                model.eval()

+                self.additional_tokenizers[model_name] = tokenizer
+                self.additional_models[model_name] = model
+                logger.info(f"✅ {model_name} loaded manually")
+                return True
+
+        except Exception as e:
+            logger.warning(f"⚠️ Could not load {model_config['name']}: {e}")
+            return False
+
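A quick way to sanity-check one of these detectors outside the manager (assumes the Hello-SimpleAI/chatgpt-detector-roberta weights are reachable; label names vary from model to model):

    from transformers import pipeline
    clf = pipeline("text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
    print(clf("This is a short test sentence.", truncation=True, max_length=512))
    # e.g. [{'label': 'Human', 'score': 0.98}]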
+    def load_all_models(self, max_modernbert=2, load_additional=True):
+        """Load every model"""
         if self.models_loaded:
             logger.info("✨ Models already loaded")
             return True

+        # Load the ModernBERT tokenizer
+        if not self.load_modernbert_tokenizer():
             return False

+        # Load the ModernBERT models
+        logger.info(f"🚀 Loading up to {max_modernbert} ModernBERT models...")

+        # Try the local file first
+        local_path = "modernbert.bin"
+        if os.path.exists(local_path):
+            model = self.load_modernbert_model(
+                model_path=local_path,
+                model_name="ModernBERT-Local"
             )
             if model is not None:
+                self.modernbert_models.append(model)

+        # Load from the URLs
+        for i, url in enumerate(self.modernbert_urls[:max_modernbert - len(self.modernbert_models)]):
+            if len(self.modernbert_models) >= max_modernbert:
                 break
+
+            model = self.load_modernbert_model(
+                model_url=url,
+                model_name=f"ModernBERT-{i+1}"
             )
             if model is not None:
+                self.modernbert_models.append(model)
+
+        # Load the additional models
+        if load_additional:
+            logger.info("🎯 Loading additional AI detection models...")
+            for config in self.additional_model_configs:
+                self.load_additional_model(config)

+        # Check success
+        total_models = len(self.modernbert_models) + len(self.additional_models)
+        if total_models > 0:
             self.models_loaded = True
+            logger.info(f"✅ Loaded {len(self.modernbert_models)} ModernBERT + {len(self.additional_models)} additional models")
             return True
         else:
             logger.error("❌ No models could be loaded")
             return False

+    def classify_with_modernbert(self, text: str, model_index: int):
+        """Classify the text with a single ModernBERT model"""
         try:
+            if model_index >= len(self.modernbert_models):
+                return None
+
+            model = self.modernbert_models[model_index]
+            cleaned_text = clean_text(text)
+
+            inputs = self.modernbert_tokenizer(
                 cleaned_text,
                 return_tensors="pt",
                 truncation=True,
+                max_length=512,
                 padding=True
             ).to(device)
+
+            with torch.no_grad():
+                logits = model(**inputs).logits
+                probs = torch.softmax(logits[0], dim=0)
+
+            human_prob = probs[24].item()
+            ai_probs = probs.clone()
+            ai_probs[24] = 0
+            ai_total = ai_probs.sum().item()
+
+            total = human_prob + ai_total
+            if total > 0:
+                human_pct = (human_prob / total) * 100
+                ai_pct = (ai_total / total) * 100
+            else:
+                human_pct = ai_pct = 50
+
+            ai_model_idx = torch.argmax(ai_probs).item()
+
+            return {
+                "model_name": f"ModernBERT-{model_index+1}",
+                "human_score": round(human_pct, 2),
+                "ai_score": round(ai_pct, 2),
+                "predicted_model": label_mapping.get(ai_model_idx, "Unknown"),
+                "confidence": round(max(human_pct, ai_pct), 2)
+            }
         except Exception as e:
+            logger.error(f"Error in ModernBERT {model_index}: {e}")
+            return None
+
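The split above treats index 24 as the Human class and pools the other 40 labels as AI mass; a dummy-distribution walk-through of the arithmetic:

    import torch
    probs = torch.full((41,), 0.02)  # 40 AI labels at 2% each
    probs[24] = 0.20                 # Human label at 20%
    human = probs[24].item()
    ai = probs.clone()
    ai[24] = 0
    total = human + ai.sum().item()
    print(round(human / total * 100, 2), round(ai.sum().item() / total * 100, 2))
    # 20.0 80.0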
+    def classify_with_additional(self, text: str, model_name: str):
+        """Classify the text with one of the additional models"""
+        try:
+            if model_name not in self.additional_models:
+                return None

+            model = self.additional_models[model_name]

+            # Pipelines were stored without a separate tokenizer; use that to tell them apart
+            if model_name not in self.additional_tokenizers:
+                # It's a pipeline
+                result = model(text, truncation=True, max_length=512)
+
+                # Parse results based on the model's output format
+                ai_score = 0
+                human_score = 0
+
+                for item in result:
+                    label = item['label'].lower()
+                    score = item['score'] * 100
+
+                    if 'fake' in label or 'ai' in label or 'gpt' in label:
+                        ai_score = max(ai_score, score)
+                    elif 'real' in label or 'human' in label:
+                        human_score = max(human_score, score)
+
+                # Normalize if needed
+                if ai_score == 0 and human_score == 0:
+                    ai_score = human_score = 50
+
+                return {
+                    "model_name": model_name,
+                    "human_score": round(human_score, 2),
+                    "ai_score": round(ai_score, 2),
+                    "predicted_model": "AI" if ai_score > human_score else "Human",
+                    "confidence": round(max(ai_score, human_score), 2)
+                }
+            else:
+                # It's a raw model; use its tokenizer
+                tokenizer = self.additional_tokenizers.get(model_name)
+                if tokenizer is None:
+                    return None
+
+                inputs = tokenizer(
+                    text,
+                    return_tensors="pt",
+                    truncation=True,
+                    max_length=512,
+                    padding=True
+                ).to(device)
+
+                with torch.no_grad():
+                    outputs = model(**inputs)
+                    probs = torch.softmax(outputs.logits[0], dim=0)
+
+                # Assuming binary classification (AI vs Human)
+                if len(probs) == 2:
+                    human_score = probs[0].item() * 100
+                    ai_score = probs[1].item() * 100
+                else:
+                    # Handle multi-class
+                    ai_score = human_score = 50
+
+                return {
+                    "model_name": model_name,
+                    "human_score": round(human_score, 2),
+                    "ai_score": round(ai_score, 2),
+                    "predicted_model": "AI" if ai_score > human_score else "Human",
+                    "confidence": round(max(ai_score, human_score), 2)
+                }
+
+        except Exception as e:
+            logger.warning(f"Error in {model_name}: {e}")
+            return None
+
+    def comprehensive_analysis(self, text: str):
+        """Comprehensive analysis using every model and metric"""
+        if not self.models_loaded:
+            raise ValueError("No models loaded")

+        results = {
+            "individual_models": [],
+            "ensemble_result": {},
+            "metrics": {},
+            "pattern_analysis": {}
+        }

+        # 1. Calculate text metrics
+        logger.info("📊 Calculating text metrics...")
+        results["metrics"] = {
+            "perplexity": self.metrics.calculate_perplexity(text),
+            "burstiness": self.metrics.calculate_burstiness(text),
+            "vocabulary_diversity": self.metrics.calculate_vocabulary_diversity(text),
+            "text_length": len(text.split()),
+            "sentence_count": len(re.split(r'[.!?]+', text))
+        }

+        # 2. Pattern detection
+        results["pattern_analysis"] = {
+            "ai_patterns_found": self.metrics.detect_ai_patterns(text),
+            "human_patterns_found": self.metrics.detect_human_patterns(text)
+        }

+        # 3. Run the ModernBERT models
+        modernbert_results = []
+        for i in range(len(self.modernbert_models)):
+            result = self.classify_with_modernbert(text, i)
+            if result:
+                results["individual_models"].append(result)
+                modernbert_results.append(result)
+
+        # 4. Run the additional models
+        for model_name in self.additional_models.keys():
+            result = self.classify_with_additional(text, model_name)
+            if result:
+                results["individual_models"].append(result)
+
+        # 5. Calculate the ensemble result (weighted average)
+        if results["individual_models"]:
+            total_ai = 0
+            total_human = 0
+            weights_sum = 0
+
+            for i, result in enumerate(results["individual_models"]):
+                # Give the ModernBERT models a higher weight
+                weight = 1.5 if i < len(modernbert_results) else 1.0
+                total_ai += result["ai_score"] * weight
+                total_human += result["human_score"] * weight
+                weights_sum += weight
+
+            if weights_sum > 0:
+                ensemble_ai = total_ai / weights_sum
+                ensemble_human = total_human / weights_sum
+            else:
+                ensemble_ai = ensemble_human = 50
+
+            # Adjust based on the metrics:
+            # high perplexity suggests human text
+            if results["metrics"]["perplexity"] > 100:
+                ensemble_human += 5
+                ensemble_ai -= 5
+            elif results["metrics"]["perplexity"] < 30:
+                ensemble_ai += 5
+                ensemble_human -= 5
+
+            # High burstiness suggests human text
+            if results["metrics"]["burstiness"] > 0.8:
+                ensemble_human += 5
+                ensemble_ai -= 5
+            elif results["metrics"]["burstiness"] < 0.3:
+                ensemble_ai += 5
+                ensemble_human -= 5
+
+            # Pattern analysis adjustment
+            pattern_adjustment = (results["pattern_analysis"]["ai_patterns_found"] -
+                                  results["pattern_analysis"]["human_patterns_found"]) * 3
+            ensemble_ai += pattern_adjustment
+            ensemble_human -= pattern_adjustment
+
+            # Normalize to 100%
+            total = ensemble_ai + ensemble_human
+            if total > 0:
+                ensemble_ai = (ensemble_ai / total) * 100
+                ensemble_human = (ensemble_human / total) * 100
+
+            # Determine the most likely AI model
+            if ensemble_ai > ensemble_human and modernbert_results:
+                predicted_model = modernbert_results[0]["predicted_model"]
+            else:
+                predicted_model = "Human"
+
+            results["ensemble_result"] = {
+                "ai_percentage": round(min(max(ensemble_ai, 0), 100), 2),
+                "human_percentage": round(min(max(ensemble_human, 0), 100), 2),
+                "predicted_model": predicted_model,
+                "confidence": round(max(ensemble_ai, ensemble_human), 2),
+                "is_human": ensemble_human > ensemble_ai,
+                "models_used": len(results["individual_models"])
+            }

+        return results

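Working the step-5 weighted average by hand, with two ModernBERT scores at weight 1.5 and one extra detector at weight 1.0, before the metric and pattern adjustments:

    scores = [(80.0, 1.5), (70.0, 1.5), (40.0, 1.0)]  # (ai_score, weight)
    ensemble_ai = sum(s * w for s, w in scores) / sum(w for _, w in scores)
    print(round(ensemble_ai, 2))  # 66.25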
 # =====================================================
 # 🧹 Cleaning and processing functions

 # 🌐 FastAPI Application
 # =====================================================
 app = FastAPI(
+    title="Enhanced ModernBERT AI Detector",
+    description="Advanced AI detection with multiple models, perplexity, and burstiness analysis",
+    version="3.0.0"
 )

+# Add CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )

+# Create the enhanced model manager
+model_manager = EnhancedModelManager()

 # =====================================================
 # 📝 Data models (Pydantic Models)
 
 class TextInput(BaseModel):
     text: str
     analyze_paragraphs: Optional[bool] = False
+    return_individual_scores: Optional[bool] = True

 class SimpleTextInput(BaseModel):
     text: str

+class EnhancedDetectionResult(BaseModel):
     success: bool
     code: int
     message: str

 async def startup_event():
     """Load the models at startup"""
     logger.info("=" * 50)
+    logger.info("🚀 Starting Enhanced ModernBERT AI Detector...")
     logger.info(f"🐍 Python version: {sys.version}")
     logger.info(f"🔥 PyTorch version: {torch.__version__}")
     logger.info("=" * 50)

+    # Load the models
+    max_modernbert = int(os.environ.get("MAX_MODERNBERT_MODELS", "2"))
+    load_additional = os.environ.get("LOAD_ADDITIONAL_MODELS", "true").lower() == "true"
+
+    success = model_manager.load_all_models(
+        max_modernbert=max_modernbert,
+        load_additional=load_additional
+    )

     if success:
+        logger.info("✅ Application ready with enhanced features!")
     else:
         logger.error("⚠️ Failed to load models - API will return errors")

 @app.get("/")
 async def root():
     """Landing page"""
+    models_info = {
+        "modernbert_models": len(model_manager.modernbert_models),
+        "additional_models": list(model_manager.additional_models.keys())
+    }
+
     return {
+        "message": "Enhanced ModernBERT AI Text Detector API",
         "status": "online" if model_manager.models_loaded else "initializing",
+        "models": models_info,
         "device": str(device),
+        "features": [
+            "Multiple AI detection models",
+            "Perplexity analysis",
+            "Burstiness analysis",
+            "Pattern detection",
+            "Individual model scores",
+            "Ensemble predictions"
+        ],
         "endpoints": {
             "analyze": "/analyze",
             "simple": "/analyze-simple",

     return {
         "status": "healthy" if model_manager.models_loaded else "unhealthy",
+        "modernbert_models": len(model_manager.modernbert_models),
+        "additional_models": len(model_manager.additional_models),
+        "total_models": len(model_manager.modernbert_models) + len(model_manager.additional_models),
         "device": str(device),
         "cuda_available": torch.cuda.is_available(),
         "memory_info": memory_info
     }

+@app.post("/analyze", response_model=EnhancedDetectionResult)
+async def analyze_text_enhanced(data: TextInput):
     """
+    Enhanced analysis with multiple models and metrics
     """
     try:
+        # Validate the input
         text = data.text.strip()
         if not text:
+            return EnhancedDetectionResult(
                 success=False,
                 code=400,
                 message="Empty input text",
                 data={}
             )

+        # Ensure the models are loaded
         if not model_manager.models_loaded:
+            if not model_manager.load_all_models():
+                return EnhancedDetectionResult(
                     success=False,
                     code=503,
+                    message="Models not available",
                     data={}
                 )

+        # Comprehensive analysis
+        analysis_result = model_manager.comprehensive_analysis(text)

+        # Basic stats
+        total_words = len(text.split())
+        ai_percentage = analysis_result["ensemble_result"]["ai_percentage"]
+        human_percentage = analysis_result["ensemble_result"]["human_percentage"]
         ai_words = int(total_words * (ai_percentage / 100))

+        # Paragraph analysis, if requested
         paragraphs_analysis = []
+        if data.analyze_paragraphs:
             paragraphs = split_into_paragraphs(text)
+            for para in paragraphs[:10]:
                 if para.strip():
                     try:
+                        para_result = model_manager.comprehensive_analysis(para)
                         para_words = len(para.split())

                         paragraphs_analysis.append({
                             "paragraph": para[:200] + "..." if len(para) > 200 else para,
+                            "ai_generated_score": para_result["ensemble_result"]["ai_percentage"] / 100,
+                            "human_written_score": para_result["ensemble_result"]["human_percentage"] / 100,
+                            "predicted_model": para_result["ensemble_result"]["predicted_model"],
+                            "metrics": {
+                                "perplexity": para_result["metrics"]["perplexity"],
+                                "burstiness": para_result["metrics"]["burstiness"]
+                            }
                         })
                     except Exception as e:
                         logger.warning(f"Failed to analyze paragraph: {e}")
+
+        # Prepare the response
+        response_data = {
+            "fakePercentage": ai_percentage,
+            "isHuman": human_percentage,
+            "textWords": total_words,
+            "aiWords": ai_words,
+            "predicted_model": analysis_result["ensemble_result"]["predicted_model"],
+            "feedback": "Most of Your Text is AI/GPT Generated" if ai_percentage > 50 else "Most of Your Text Appears Human-Written",
+            "confidence": analysis_result["ensemble_result"]["confidence"],
+            "models_used": analysis_result["ensemble_result"]["models_used"],
+
+            # New: metrics
+            "metrics": analysis_result["metrics"],
+
+            # New: pattern analysis
+            "pattern_analysis": analysis_result["pattern_analysis"],

+            # Paragraphs, if requested
+            "paragraphs": paragraphs_analysis,
+
+            # Text preview
+            "input_text": text[:500] + "..." if len(text) > 500 else text,
+            "detected_language": "en"
+        }

+        # Add the individual model scores if requested
+        if data.return_individual_scores:
+            response_data["individual_models"] = analysis_result["individual_models"]

+        return EnhancedDetectionResult(
             success=True,
             code=200,
+            message="Enhanced analysis completed",
+            data=response_data
         )

     except Exception as e:
         logger.error(f"Analysis error: {e}", exc_info=True)
+        return EnhancedDetectionResult(
             success=False,
             code=500,
             message=f"Analysis failed: {str(e)}",
 
 @app.post("/analyze-simple")
 async def analyze_simple(data: SimpleTextInput):
     """
+    Simple analysis - returns only the basic results
     """
     try:
         text = data.text.strip()
         raise HTTPException(status_code=400, detail="Empty text")

         if not model_manager.models_loaded:
+            if not model_manager.load_all_models():
             raise HTTPException(status_code=503, detail="Models not available")

+        result = model_manager.comprehensive_analysis(text)
+        ensemble = result["ensemble_result"]

         return {
+            "is_ai": ensemble["ai_percentage"] > 50,
+            "ai_score": ensemble["ai_percentage"],
+            "human_score": ensemble["human_percentage"],
+            "detected_model": ensemble["predicted_model"],
+            "confidence": ensemble["confidence"],
+            "perplexity": result["metrics"]["perplexity"],
+            "burstiness": result["metrics"]["burstiness"]
         }

     except HTTPException:

 if __name__ == "__main__":
     import uvicorn

     port = int(os.environ.get("PORT", 8000))
     host = os.environ.get("HOST", "0.0.0.0")
     workers = int(os.environ.get("WORKERS", 1))

     logger.info("=" * 50)
+    logger.info(f"🌐 Starting enhanced server on {host}:{port}")
     logger.info(f"👷 Workers: {workers}")
     logger.info(f"📚 Documentation: http://{host}:{port}/docs")
     logger.info("=" * 50)

     uvicorn.run(
+        "app:app",  # this file is app.py, so the import string is "app:app"
         host=host,
         port=port,
+        reload=False,
         workers=workers,
+        log_level="info"
     )
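Once the server is up, the enhanced endpoint can be exercised with a few lines of client code (assumes the `requests` package and the default host/port):

    import requests
    resp = requests.post(
        "http://localhost:8000/analyze",
        json={"text": "Sample text to check.", "analyze_paragraphs": False},
    )
    body = resp.json()
    print(body["data"]["fakePercentage"], body["data"]["metrics"]["burstiness"])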