Commit 44d0409
Parent(s): 154e93c

Architecture updated

Files changed:
- .gitignore +7 -0
- README.md +70 -52
- config/__init__.py +0 -35
- config/constants.py +876 -0
- config/enums.py +106 -0
- config/model_config.py +14 -51
- config/schemas.py +452 -0
- config/settings.py +51 -52
- config/threshold_config.py +107 -164
- data/reports/file_1765557325979_20251212_220627.pdf +0 -181
- detector/__init__.py +0 -20
- detector/attribution.py +0 -962
- detector/orchestrator.py +0 -576
- docs/API_DOCUMENTATION.md +705 -0
- docs/ARCHITECTURE.md +821 -0
- docs/BLOGPOST.md +280 -230
- docs/WHITE_PAPER.md +0 -0
- example.py +0 -45
- logs/application/app_2025-11-07.log +0 -0
- metrics/base_metric.py +26 -99
- metrics/entropy.py +180 -164
- metrics/linguistic.py +193 -193
- metrics/multi_perturbation_stability.py +215 -239
- metrics/perplexity.py +161 -157
- metrics/semantic_analysis.py +216 -186
- metrics/structural.py +187 -180
- models/__init__.py +0 -13
- models/model_manager.py +16 -14
- models/model_registry.py +30 -43
- processors/__init__.py +0 -26
- processors/document_extractor.py +23 -67
- processors/domain_classifier.py +205 -68
- processors/language_detector.py +177 -253
- processors/text_processor.py +22 -126
- reporter/__init__.py +0 -10
- reporter/report_generator.py +80 -184
- run.sh +0 -56
- services/__init__.py +0 -0
- detector/ensemble.py → services/ensemble_classifier.py +197 -451
- {detector → services}/highlighter.py +316 -569
- services/orchestrator.py +753 -0
- {reporter → services}/reasoning_generator.py +219 -290
- setup.sh +0 -22
- test_integration.py +331 -0
- text_auth_app.py +307 -256
- ui/static/index.html +161 -291
- utils/logger.py +26 -65
.gitignore
CHANGED

@@ -44,3 +44,10 @@ Thumbs.db
 # Environment variables
 .env
 .env.localrailway.toml
+
+
+# Validation ignored as of now
+validation/
+data/validation_data/
+logs/
+notebooks/
README.md
CHANGED

@@ -1,5 +1,5 @@
 ---
-title: Text
+title: TEXT-AUTH — Evidence-Based Text Forensics System
 emoji: 🔍
 colorFrom: blue
 colorTo: purple

@@ -12,8 +12,8 @@ license: mit
 
 <div align="center">
 
-#
-##
+# 🛡️ TEXT-AUTH
+## Evidence-First Text Forensics & Authenticity Assessment
 
 
 

@@ -31,9 +31,9 @@ license: mit
 - [Key Differentiators](#key-differentiators)
 - [System Architecture](#system-architecture)
 - [Workflow / Data Flow](#workflow--data-flow)
-- [
+- [Forensic Signals & Mathematical Foundation](#forensic-signals--mathematical-foundation)
 - [Ensemble Methodology](#ensemble-methodology)
-- [Domain-Aware
+- [Domain-Aware Analysis](#domain-aware-analysis)
 - [Performance Characteristics](#performance-characteristics)
 - [Project Structure](#project-structure)
 - [API Endpoints](#api-endpoints)

@@ -51,19 +51,24 @@ license: mit
 
 ## 📝 Abstract
 
-**
+**TEXT-AUTH** is a research-oriented, production-minded **text forensics system** that evaluates written content using multiple independent linguistic, statistical, and semantic signals.
 
-
+Rather than claiming authorship or identifying a generation source, the platform performs **evidence-based probabilistic assessment** of textual consistency patterns. It reports confidence-calibrated signals, uncertainty estimates, and human-interpretable explanations to support downstream decision-making.
 
-
+TEXT-AUTH is designed as a **decision-support and forensic analysis tool**, not a binary classifier or attribution oracle.
+
+- *For architectural details, see [Architecture](docs/ARCHITECTURE.md).*
+- *For detailed technical documentation, see [Technical Docs](docs/BLOGPOST.md).*
+- *For research methodology, see [Whitepaper](docs/WHITE_PAPER.md).*
+- *For API documentation, see [API Documentation](docs/API_DOCUMENTATION.md).*
 
 ---
 
 ## 🚀 Overview
 
-**Problem.**
+**Problem.** Modern text—whether human-written, assisted, edited, or fully generated—often exhibits patterns that are difficult to evaluate using binary classifiers.
 
-**Solution.** A domain
+**Solution.** A domain-aware analysis system combining six orthogonal evidence signals (perplexity, entropy, structural, semantic, linguistic, and multi-perturbation stability) into a confidence‑calibrated ensemble. Outputs are explainable, with sentence‑level highlighting and downloadable reports (JSON/PDF).
 
 **Live Deployment Link:** [AI Text Authenticator Platform](https://huggingface.co/spaces/satyaki-mitra/AI_Text_Authenticator)
 

@@ -75,16 +80,15 @@ This README is research‑grade (detailed math, methodology, and benchmarks) whi
 
 | Feature | Description | Impact |
 |---|---:|---|
-| **Domain‑Aware Detection** | Calibrated thresholds and metric weights for 16 content types (Academic, Technical, Creative, Social Media, etc.) |
-| **6
+| **Domain‑Aware Detection** | Calibrated thresholds and metric weights for 16 content types (Academic, Technical, Creative, Social Media, etc.) | Improved signal calibration and reduced false positives compared to generic binary systems |
+| **6-Signal Evidence Ensemble** | Orthogonal statistical, syntactic, and semantic indicators | Robust assessments with reduced false positives |
 | **Explainability** | Sentence‑level scoring, highlights, and human‑readable reasoning | Trust & auditability |
-| **Model Attribution** | Likely model identification (GPT‑4, Claude, Gemini, LLaMA, etc.) | Forensic insights |
 | **Auto Model Fetch** | First‑run download from Hugging Face, local cache, offline fallback | Lightweight repo & reproducible runs |
 | **Extensible Design** | Plug‑in metrics, model registry, and retraining pipeline hooks | Easy research iteration |
 
 ### 📊 Supported Domains & Threshold Configuration
 
-The platform supports
+The platform supports domain-aware forensic analysis tailored to the following 16 domains, each with specific synthetic-text consistency thresholds and metric weights defined in `config/threshold_config.py`. These configurations are used by the ensemble classifier to adapt its decision-making process.
 
 **Domains:**
 

@@ -109,8 +113,8 @@ The platform supports detection tailored to the following 16 domains, each with
 
 Each domain is configured with specific thresholds for the six detection metrics and an ensemble threshold. The weights determine the relative importance of each metric's output during the ensemble aggregation phase.
 
-* **
-* **
+* **High-Consistency Threshold:** If a metric's synthetic-consistency score exceeds this value, it contributes stronger evidence toward a synthetic assessment for that metric.
+* **Low-Consistency Threshold:** If a metric's synthetic-consistency score falls below this value, it contributes evidence toward a human-authored (authentic) assessment for that metric.
 * **Weight:** The relative weight assigned to the metric's result during ensemble combination (normalized internally to sum to 1.0 for active metrics).
 
 ### Confidence-Calibrated Aggregation (High Level)

@@ -138,7 +142,7 @@ flowchart LR
 C[FastAPI<br/>Auth & Rate Limit]
 end
 
-subgraph ORCH [
+subgraph ORCH [Forensic Orchestrator]
 D[Domain Classifier]
 E[Preprocessor]
 F[Metric Coordinator]

@@ -153,9 +157,9 @@ flowchart LR
 P6[MultiPerturbationStability]
 end
 
-G[
+G[Evidence Aggregator]
 H[Postprocessing & Reporter]
-I["
+I["Statistical Reference Models<br/>(HuggingFace Cache)"]
 J[Storage: Logs, Reports, Cache]
 
 A --> C

@@ -190,7 +194,7 @@ sequenceDiagram
 O->>M: Preprocess & dispatch metrics (parallel)
 M-->>O: Metric results (async)
 O->>E: Aggregate & calibrate
-E-->>O: Final
+E-->>O: Final assessment + uncertainty
 O->>R: Generate highlights & report
 R-->>API: Report ready (JSON/PDF)
 API-->>U: Return analysis + download link

@@ -198,9 +202,9 @@ sequenceDiagram
 
 ---
 
-## 🧮
+## 🧮 Forensic Signals & Mathematical Foundation
 
-This section provides the exact metric definitions implemented in `metrics/` and rationale for their selection. The ensemble combines these orthogonal signals to increase robustness against
+This section provides the exact metric definitions implemented in `metrics/` and rationale for their selection. The ensemble combines these orthogonal signals to increase robustness against edited, paraphrased, or algorithmically regularized text.
 
 ### Metric summary (weights are configurable per domain)
 - Perplexity — 25%

@@ -356,9 +360,9 @@ def ensemble_aggregation(metric_results, domain):
 ### Uncertainty Quantification
 ```python
 def calculate_uncertainty(metric_results, ensemble_result):
-    var_uncert = np.var([r.
+    var_uncert = np.var([r.synthetic_probability for r in metric_results.values()])
     conf_uncert = 1 - np.mean([r.confidence for r in metric_results.values()])
-    decision_uncert = 1 - 2*abs(ensemble_result.
+    decision_uncert = 1 - 2*abs(ensemble_result.synthetic_probability - 0.5)
     return var_uncert*0.4 + conf_uncert*0.3 + decision_uncert*0.3
 ```
 

@@ -369,17 +373,16 @@ def calculate_uncertainty(metric_results, ensemble_result):
 Domain weights and thresholds are configurable. Example weights (in `config/threshold_config.py`):
 
 ```python
-DOMAIN_WEIGHTS = {
-
-
-
-}
+DOMAIN_WEIGHTS = {
+    'academic'     : {'perplexity': 0.22, 'entropy': 0.18, 'structural': 0.15, 'linguistic': 0.20, 'semantic': 0.15, 'multi_perturbation_stability': 0.10},
+    'technical'    : {'perplexity': 0.20, 'entropy': 0.18, 'structural': 0.12, 'linguistic': 0.18, 'semantic': 0.22, 'multi_perturbation_stability': 0.10},
+    'creative'     : {'perplexity': 0.25, 'entropy': 0.25, 'structural': 0.20, 'linguistic': 0.12, 'semantic': 0.10, 'multi_perturbation_stability': 0.08},
+    'social_media' : {'perplexity': 0.30, 'entropy': 0.22, 'structural': 0.15, 'linguistic': 0.10, 'semantic': 0.13, 'multi_perturbation_stability': 0.10},
+}
 ```
 
 ### Domain Calibration Strategy (brief)
 - **Academic**: increase linguistic weight, raise perplexity multiplier
-- **Technical**: prioritize semantic coherence, maximize
+- **Technical**: prioritize semantic coherence, raise the synthetic threshold to reduce false positives
 - **Creative**: boost entropy & structural weights for burstiness detection
 - **Social Media**: prioritize perplexity and relax linguistic demands
 

@@ -409,13 +412,17 @@ text_auth/
 ├── config/
 │   ├── model_config.py
 │   ├── settings.py
+│   ├── enums.py
+│   ├── constants.py
+│   ├── schemas.py
 │   └── threshold_config.py
 ├── data/
 │   ├── reports/
+│   ├── validation_data/
 │   └── uploads/
-├──
-│   ├──
-│   ├──
+├── services/
+│   ├── reasoning_generator.py
+│   ├── ensemble_classifier.py
 │   ├── highlighter.py
 │   └── orchestrator.py
 ├── metrics/

@@ -435,15 +442,22 @@ text_auth/
 │   ├── language_detector.py
 │   └── text_processor.py
 ├── reporter/
-│   ├── reasoning_generator.py
 │   └── report_generator.py
 ├── ui/
 │   └── static/index.html
 ├── utils/
 │   └── logger.py
+├── validation/
 ├── example.py
 ├── requirements.txt
 ├── run.sh
+├── README.md
+├── Dockerfile
+├── .gitignore
+├── setup.sh
+├── test_integration.py
+├── .env.example
+├── requirements.txt
 └── text_auth_app.py
 ```
 

@@ -452,35 +466,41 @@
 ## 🌐 API Endpoints
 
 ### `/api/analyze` — Text Analysis (POST)
-Analyze raw text. Returns ensemble
+Analyze raw text. Returns ensemble assessment, per‑metric signals, highlights, and explainability reasoning.
 
 **Request (JSON)**
 ```json
 {
   "text":"...",
   "domain":"academic|technical_doc|creative|social_media",
-  "enable_attribution": true,
   "enable_highlighting": true,
-  "use_sentence_level": true,
-  "include_metrics_summary": true
+  "use_sentence_level": true
 }
 ```
 
 **Response (JSON)** — abbreviated
 ```json
 {
-  "status":"success",
-  "analysis_id":"analysis_170...",
-  "
-  "
-  "
+  "status": "success",
+  "analysis_id": "analysis_170...",
+  "assessment": {
+    "final_verdict": "Synthetic / Authentic / Hybrid",
+    "overall_confidence": 0.89,
+    "uncertainty_score": 0.23
+  },
+  "metric_signals": {
+    "perplexity": { "score": 0.92, "confidence": 0.89 }
   },
-  "
-  "
+  "highlighted_html": "<div>...</div>",
+  "reasoning": {
+    "summary": "...",
+    "key_indicators": ["...", "..."]
+  }
 }
 ```
 
+> **Note:** The final verdict represents a probabilistic consistency assessment, not an authorship or generation claim.
+
+
 ### `/api/analyze/file` — File Analysis (POST, multipart/form-data)
 Supports PDF, DOCX, TXT, DOC, MD. File size limit default: 10MB. Returns same structure as text analyze endpoint.
 

@@ -534,7 +554,7 @@ python text_auth_app.py
 **Example snippet**
 ```python
 from huggingface_hub import snapshot_download
-snapshot_download(repo_id="satyaki-mitra/text-detector-v1", local_dir="./models/text-detector-v1")
+snapshot_download(repo_id="satyaki-mitra/statistical-text-reference-v1", local_dir="./models/text-detector-v1")
 ```
 
 ---

@@ -556,7 +576,7 @@ snapshot_download(repo_id="satyaki-mitra/text-detector-v1", local_dir="./models/
 **Use cases**: universities (plagiarism & integrity), hiring platforms (resume authenticity), publishers (content verification), social platforms (spam & SEO abuse).
 
 **Competitive landscape** (summary)
--
+- Binary authorship-claim systems (e.g., GPTZero-style tools) — our advantages: domain adaptation, explainability, evidence transparency, lower false positives, and competitive pricing. TEXT-AUTH explicitly avoids authorship claims in favor of evidence-based forensic assessment.
 
 **Monetization ideas**
 - SaaS subscription (seat / monthly analyze limits)

@@ -571,13 +591,11 @@ snapshot_download(repo_id="satyaki-mitra/text-detector-v1", local_dir="./models/
 **Research directions**
 - Adversarial robustness (paraphrase & synonym attacks)
 - Cross‑model generalization & zero‑shot detection
-- Fine‑grained attribution (model versioning, temperature estimation)
 - Explainability: counterfactual examples & feature importance visualization
 
 **Planned features (Q1‑Q2 2026)**
 - Multi‑language support (Spanish, French, German, Chinese)
 - Real‑time streaming API (WebSocket)
-- Fine‑grained attribution & generation parameter estimation
 - Institution‑specific calibration & admin dashboards
 
 *Detailed research methodology and academic foundation available in our [Whitepaper](docs/WHITE_PAPER.md). Technical implementation details in [Technical Documentation](docs/BLOGPOST.md).*

@@ -649,7 +667,7 @@ Acknowledgments:
 
 <div align="center">
 
-**Built with ❤️ —
+**Built with ❤️ — Evidence-based text forensics, transparency, and real-world readiness.**
 
 *Version 1.0.0 — Last Updated: October, 2025*
 
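The aggregation and uncertainty snippets in the hunks above compose into one short pipeline: normalize the domain weights over the active metrics, take the weighted synthetic probability, then score uncertainty. The sketch below is illustrative only; it assumes each metric result exposes `synthetic_probability` and `confidence` fields (as the README snippets do), and the `MetricResult` container and `aggregate` helper are hypothetical names, not the classes in `services/ensemble_classifier.py`.

```python
# Minimal sketch of weight-normalized aggregation plus the README's
# uncertainty formula. MetricResult/aggregate are illustrative stand-ins.
from dataclasses import dataclass

import numpy as np


@dataclass
class MetricResult:
    synthetic_probability: float
    confidence: float


def aggregate(results: dict, weights: dict) -> float:
    # Renormalize weights so the active metrics sum to 1.0, then combine.
    active = {name: w for name, w in weights.items() if name in results}
    total = sum(active.values())
    return sum(results[n].synthetic_probability * (w / total) for n, w in active.items())


def calculate_uncertainty(results: dict, ensemble_prob: float) -> float:
    # Variance across metrics + mean confidence shortfall + decision ambiguity.
    var_uncert = np.var([r.synthetic_probability for r in results.values()])
    conf_uncert = 1 - np.mean([r.confidence for r in results.values()])
    decision_uncert = 1 - 2 * abs(ensemble_prob - 0.5)
    return var_uncert * 0.4 + conf_uncert * 0.3 + decision_uncert * 0.3


results = {"perplexity": MetricResult(0.92, 0.89), "entropy": MetricResult(0.81, 0.74)}
weights = {"perplexity": 0.30, "entropy": 0.22}  # social_media values from the hunk above
prob = aggregate(results, weights)
print(f"ensemble={prob:.3f}, uncertainty={calculate_uncertainty(results, prob):.3f}")
```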
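For quick verification of the `/api/analyze` contract documented in the hunks above, a minimal client sketch follows. The base URL and port are assumptions about a local run; the request and response fields mirror the README's JSON examples.

```python
# Hypothetical client for the documented /api/analyze endpoint.
# The base URL is an assumption; adjust it to your deployment.
import requests

payload = {
    "text": "Paste the passage to analyze here...",
    "domain": "academic",
    "enable_highlighting": True,
    "use_sentence_level": True,
}

resp = requests.post("http://localhost:8000/api/analyze", json=payload, timeout=60)
resp.raise_for_status()

report = resp.json()
assessment = report["assessment"]
print(assessment["final_verdict"], assessment["overall_confidence"], assessment["uncertainty_score"])
```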
config/__init__.py
CHANGED

@@ -1,35 +0,0 @@
-# DEPENDENCIES
-from .settings import *
-from .model_config import *
-from .threshold_config import *
-
-
-# Export everything
-__all__ = ["ModelType",
-           "ModelConfig",
-           "MODEL_REGISTRY",
-           "MODEL_GROUPS",
-           "DEFAULT_MODEL_WEIGHTS",
-           "get_model_config",
-           "get_required_models",
-           "get_models_by_priority",
-           "get_models_by_group",
-           "get_total_size_mb",
-           "get_required_size_mb",
-           "print_model_summary",
-           "get_spacy_download_commands",
-           "settings",
-           "Settings",
-           "Domain",
-           "ConfidenceLevel",
-           "MetricThresholds",
-           "DomainThresholds",
-           "DEFAULT_THRESHOLDS",
-           "THRESHOLD_REGISTRY",
-           "CONFIDENCE_RANGES",
-           "get_threshold_for_domain",
-           "get_confidence_level",
-           "adjust_threshold_by_confidence",
-           "interpolate_thresholds",
-           "get_active_metric_weights",
-          ]
config/constants.py
ADDED

@@ -0,0 +1,876 @@
# DEPENDENCIES
from typing import Dict
from typing import List
from typing import Tuple
from dataclasses import field
from config.enums import Script
from dataclasses import dataclass


@dataclass(frozen = True)
class DocumentExtractionParams:
    """
    Hyperparameters for Document Extraction
    """
    # Supported file extensions
    SUPPORTED_EXTENSIONS : frozenset = frozenset({'.txt', '.text', '.md', '.markdown', '.log', '.csv', '.pdf', '.docx', '.doc', '.rtf', '.html', '.htm'})

    # Text file extensions
    TEXT_EXTENSIONS : frozenset = frozenset({'.txt', '.text', '.md', '.markdown', '.log', '.csv'})

    # Maximum file size (50 MB default)
    MAX_FILE_SIZE : int = 50 * 1024 * 1024


@dataclass(frozen = True)
class LanguageDetectionParams:
    """
    Hyperparameters for Language Detection
    """
    # Text length constraints
    MINIMUM_TEXT_LENGTH : int = 20

    # Chunking parameters
    MAX_CHUNK_LENGTH : int = 500
    MIN_CHUNK_LENGTH : int = 50
    FIXED_CHUNK_SIZE : int = 1000

    # Model parameters
    MODEL_MAX_LENGTH : int = 512
    TOP_K_PREDICTIONS : int = 3

    # Confidence thresholds
    LOW_CONFIDENCE_THRESHOLD : float = 0.6
    MULTILINGUAL_THRESHOLD : float = 0.2
    SCRIPT_DOMINANCE_THRESHOLD : float = 0.7
    LANGUAGE_MATCH_THRESHOLD : float = 0.7

    # Quality assessment
    WORD_BOUNDARY_RATIO : float = 0.7
    MIXED_DOMAIN_CONFIDENCE_PENALTY : float = 0.8

    # Language name mappings
    LANGUAGE_NAMES : Dict[str, str] = field(default_factory = lambda: {
        "en": "English", "es": "Spanish", "fr": "French", "de": "German",
        "it": "Italian", "pt": "Portuguese", "ru": "Russian", "zh": "Chinese",
        "ja": "Japanese", "ko": "Korean", "ar": "Arabic", "hi": "Hindi",
    })

    # Unicode script ranges
    SCRIPT_RANGES : Dict[str, List[Tuple[int, int]]] = field(default_factory = lambda: {
        "latin"      : [(0x0041, 0x007A), (0x00C0, 0x024F)],
        "cyrillic"   : [(0x0400, 0x04FF)],
        "arabic"     : [(0x0600, 0x06FF), (0x0750, 0x077F)],
        "chinese"    : [(0x4E00, 0x9FFF), (0x3400, 0x4DBF)],
        "japanese"   : [(0x3040, 0x309F), (0x30A0, 0x30FF)],
        "korean"     : [(0xAC00, 0xD7AF), (0x1100, 0x11FF)],
        "devanagari" : [(0x0900, 0x097F)],
        "greek"      : [(0x0370, 0x03FF)],
        "hebrew"     : [(0x0590, 0x05FF)],
        "thai"       : [(0x0E00, 0x0E7F)],
    })


@dataclass(frozen = True)
class TextProcessingParams:
    """
    Hyperparameters for Text Processing
    """
    # Text length constraints
    MINIMUM_TEXT_LENGTH : int = 20
    MAXIMUM_TEXT_LENGTH : int = 1000000  # 1M characters

    # Text cleaning options
    PRESERVE_FORMATTING : bool = False
    REMOVE_URLS : bool = True
    REMOVE_EMAILS : bool = True
    NORMALIZE_UNICODE : bool = True
    FIX_ENCODING : bool = True

    # Validation thresholds
    MINIMUM_WORD_COUNT : int = 10

    # Common abbreviations for sentence splitting
    COMMON_ABBREVIATIONS : list = field(default_factory = lambda: [
        "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Gen.", "Sen.", "Rep.", "St.", "Ave.", "Blvd.", "Rd.", "Pkwy.", "Co.", "Ltd.", "Inc.", "Corp.",
        "vs.", "etc.", "e.g.", "i.e.", "c.", "ca.", "cf.", "al.", "et al.", "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sep.", "Oct.",
        "Nov.", "Dec.", "Mon.", "Tue.", "Wed.", "Thu.", "Fri.", "Sat.", "Sun.", "kg.", "g.", "mg.", "km.", "m.", "cm.", "mm.", "hr.", "min.", "sec.",
        "vol.", "no.", "p.", "pp.", "ch.", "fig.", "ed.", "trans.", "approx.", "est.", "max.", "min.", "avg.", "std.", "temp.", "pres.", "vol.", "ibid.",
        "op.", "cit.", "loc.", "cf.", "viz.", "sc.", "seq."
    ])


@dataclass(frozen = True)
class DomainClassificationParams:
    """
    Hyperparameters for Domain Classification
    """
    # Classification parameters
    TOP_K_DOMAINS : int = 2
    MIN_CONFIDENCE_THRESHOLD : float = 0.3

    # Confidence thresholds
    HIGH_CONFIDENCE_THRESHOLD : float = 0.7
    MEDIUM_CONFIDENCE_THRESHOLD : float = 0.6
    LOW_CONFIDENCE_THRESHOLD : float = 0.5
    SECONDARY_DOMAIN_MIN_SCORE : float = 0.1

    # Mixed domain detection
    MIXED_DOMAIN_PRIMARY_MAX : float = 0.7
    MIXED_DOMAIN_SECONDARY_MIN : float = 0.3
    MIXED_DOMAIN_RATIO_THRESHOLD : float = 0.6
    MIXED_DOMAIN_CONFIDENCE_PENALTY : float = 0.8

    # Text preprocessing
    MAX_WORDS_FOR_CLASSIFICATION : int = 400

    # Domain labels for zero-shot classification
    DOMAIN_LABELS : Dict[str, List[str]] = field(default_factory = lambda: {
        "academic"      : ["academic paper", "research article", "scientific paper", "scholarly writing", "thesis", "dissertation", "academic research"],
        "creative"      : ["creative writing", "fiction", "story", "narrative", "poetry", "literary work", "imaginative writing"],
        "ai_ml"         : ["artificial intelligence", "machine learning", "neural networks", "data science", "AI research", "deep learning"],
        "software_dev"  : ["software development", "programming", "coding", "software engineering", "web development", "application development"],
        "technical_doc" : ["technical documentation", "user manual", "API documentation", "technical guide", "system documentation"],
        "engineering"   : ["engineering document", "technical design", "engineering analysis", "mechanical engineering", "electrical engineering"],
        "science"       : ["scientific research", "physics", "chemistry", "biology", "scientific study", "experimental results"],
        "business"      : ["business document", "corporate communication", "business report", "professional writing", "executive summary"],
        "journalism"    : ["news article", "journalism", "press release", "news report", "media content", "reporting"],
        "social_media"  : ["social media post", "casual writing", "online content", "informal text", "social media content"],
        "blog_personal" : ["personal blog", "personal writing", "lifestyle blog", "personal experience", "opinion piece", "diary entry"],
        "legal"         : ["legal document", "contract", "legal writing", "law", "legal agreement", "legal analysis"],
        "medical"       : ["medical document", "healthcare", "clinical", "medical report", "health information", "medical research"],
        "marketing"     : ["marketing content", "advertising", "brand content", "promotional writing", "sales copy", "marketing material"],
        "tutorial"      : ["tutorial", "how-to guide", "instructional content", "step-by-step guide", "educational guide", "learning material"],
        "general"       : ["general content", "everyday writing", "common text", "standard writing", "normal text", "general information"],
    })


@dataclass(frozen = True)
class BaseMetricParams:
    """
    Hyperparameters for BaseMetric class
    """
    DEFAULT_AUTHENTIC_PROBABILITY : float = 0.5
    DEFAULT_SYNTHETIC_PROBABILITY : float = 0.5
    DEFAULT_HYBRID_PROBABILITY : float = 0.0
    DEFAULT_CONFIDENCE : float = 0.0


@dataclass(frozen = True)
class StructuralMetricParams:
    """
    Hyperparameters for Structural Metric
    """
    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5  # For fallback
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0

    # Feature extraction - sentence splitting
    SENTENCE_SPLIT_PATTERN : str = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    WORD_TOKENIZE_PATTERN : str = r'\b\w+\b'
    PUNCTUATION_PATTERN : str = r'[^\w\s]'

    # Burstiness calculation
    BURSTINESS_NORMALIZATION_FACTOR : float = 2.0

    # Readability calculation
    FLESCH_CONSTANT_1 : float = 206.835
    FLESCH_CONSTANT_2 : float = 1.015
    FLESCH_CONSTANT_3 : float = 84.6
    NEUTRAL_READABILITY_SCORE : float = 50.0
    MIN_READABILITY_SCORE : float = 0.0
    MAX_READABILITY_SCORE : float = 100.0

    # Repetition detection
    REPETITION_WINDOW_SIZE : int = 10
    MIN_WORDS_FOR_REPETITION : int = 10

    # N-gram analysis
    BIGRAM_N : int = 2
    TRIGRAM_N : int = 3

    # Synthetic probability calculation thresholds
    BURSTINESS_LOW_THRESHOLD : float = 0.3
    BURSTINESS_MEDIUM_THRESHOLD : float = 0.5
    LENGTH_UNIFORMITY_HIGH_THRESHOLD : float = 0.7
    LENGTH_UNIFORMITY_MEDIUM_THRESH : float = 0.5
    BIGRAM_DIVERSITY_LOW_THRESHOLD : float = 0.7
    READABILITY_SYNTHETIC_MIN : float = 60.0
    READABILITY_SYNTHETIC_MAX : float = 75.0
    REPETITION_LOW_THRESHOLD : float = 0.1
    REPETITION_MEDIUM_THRESHOLD : float = 0.2

    # Synthetic probability weights
    STRONG_SYNTHETIC_WEIGHT : float = 0.7
    MODERATE_SYNTHETIC_WEIGHT : float = 0.5
    WEAK_SYNTHETIC_WEIGHT : float = 0.3
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.4
    NEUTRAL_WEIGHT : float = 0.5

    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5  # For fallback

    # Hybrid probability calculation
    BURSTINESS_HIGH_THRESHOLD : float = 0.6
    SENTENCE_LENGTH_VARIANCE_RATIO : float = 0.8
    TYPE_TOKEN_RATIO_EXTREME_LOW : float = 0.3
    TYPE_TOKEN_RATIO_EXTREME_HIGH : float = 0.9
    READABILITY_EXTREME_LOW : float = 20.0
    READABILITY_EXTREME_HIGH : float = 90.0
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.3
    MAX_HYBRID_PROBABILITY : float = 0.3

    # Feature validation
    MIN_SENTENCE_LENGTH_FOR_STD : int = 2
    MIN_WORD_LENGTH_FOR_STD : int = 2
    MIN_VALUES_FOR_BURSTINESS : int = 2
    MIN_WORDS_FOR_NGRAM : int = 2  # For n-gram where n=2

    # Math and normalization
    ZERO_TOLERANCE : float = 1e-10
    ZERO_VALUE : float = 0.0
    ONE_VALUE : float = 1.0


@dataclass(frozen = True)
class SemanticAnalysisParams:
    """
    Hyperparameters for Semantic Analysis Metric
    """
    # Text validation
    MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50
    MIN_SENTENCES_FOR_ANALYSIS : int = 3
    MIN_SENTENCE_LENGTH : int = 10
    MIN_VALID_SENTENCE_LENGTH : int = 5

    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0

    # Sentence splitting
    SENTENCE_SPLIT_PATTERN : str = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'
    WORD_EXTRACTION_PATTERN : str = r'\b[a-zA-Z]{4,}\b'

    # Coherence calculation
    HIGH_COHERENCE_SIMILARITY : float = 0.8
    SIMILARITY_VARIANCE_FACTOR : float = 5.0

    # Repetition detection
    REPETITION_SIMILARITY_THRESHOLD : float = 0.8
    REPETITION_SCORE_SCALING : float = 3.0
    MIN_SENTENCES_FOR_REPETITION : int = 5

    # Topic drift calculation
    START_SECTION_SIZE : int = 3
    END_SECTION_SIZE : int = 3
    SECTION_SIZE_RATIO : int = 3  # denominator for section size calculation

    # Chunk analysis
    CHUNK_SIZE_WORDS : int = 200
    CHUNK_OVERLAP_RATIO : float = 0.5  # 50% overlap
    MIN_CHUNK_LENGTH : int = 50
    MIN_SENTENCES_PER_CHUNK : int = 2

    # Keyword analysis
    MIN_WORDS_FOR_KEYWORD_ANALYSIS : int = 10
    TOP_KEYWORDS_COUNT : int = 10
    MIN_KEYWORD_FREQUENCY : int = 2

    # Synthetic probability thresholds
    COHERENCE_HIGH_THRESHOLD : float = 0.7
    COHERENCE_MEDIUM_THRESHOLD : float = 0.5
    CONSISTENCY_HIGH_THRESHOLD : float = 0.8
    CONSISTENCY_MEDIUM_THRESHOLD : float = 0.6
    REPETITION_HIGH_THRESHOLD : float = 0.3
    REPETITION_MEDIUM_THRESHOLD : float = 0.1
    TOPIC_DRIFT_LOW_THRESHOLD : float = 0.2
    TOPIC_DRIFT_MEDIUM_THRESHOLD : float = 0.4
    COHERENCE_VARIANCE_LOW_THRESHOLD : float = 0.05
    COHERENCE_VARIANCE_MEDIUM_THRESHOLD : float = 0.1

    # Synthetic probability weights
    STRONG_SYNTHETIC_WEIGHT : float = 0.9
    MODERATE_SYNTHETIC_WEIGHT : float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT : float = 0.6
    WEAK_SYNTHETIC_WEIGHT : float = 0.5
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.4
    VERY_LOW_SYNTHETIC_WEIGHT : float = 0.3
    LOW_SYNTHETIC_WEIGHT : float = 0.2

    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5
    LOW_FEATURE_CONFIDENCE : float = 0.3

    # Hybrid probability calculation
    COHERENCE_MIXED_MIN : float = 0.4
    COHERENCE_MIXED_MAX : float = 0.6
    COHERENCE_VARIANCE_HIGH_THRESHOLD : float = 0.15
    COHERENCE_VARIANCE_MEDIUM_THRESHOLD : float = 0.1
    REPETITION_MIXED_MIN : float = 0.15
    REPETITION_MIXED_MAX : float = 0.35
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.3
    VERY_WEAK_HYBRID_WEIGHT : float = 0.2
    MAX_HYBRID_PROBABILITY : float = 0.3

    # Default feature values
    DEFAULT_COHERENCE : float = 0.5
    DEFAULT_CONSISTENCY : float = 0.5
    DEFAULT_REPETITION : float = 0.0
    DEFAULT_TOPIC_DRIFT : float = 0.5
    DEFAULT_CONTEXTUAL_CONSISTENCY : float = 0.5
    DEFAULT_CHUNK_COHERENCE : float = 0.5
    DEFAULT_COHERENCE_VARIANCE : float = 0.1

    # Error handling
    MIN_REQUIRED_FEATURES : int = 3
    ZERO_TOLERANCE : float = 1e-10


@dataclass(frozen = True)
class LinguisticMetricParams:
    """
    Hyperparameters for Linguistic Metric
    """
    # Text validation
    MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50

    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0

    # POS analysis
    MIN_TAGS_FOR_ENTROPY : int = 1

    # Syntactic complexity
    COMPLEXITY_WEIGHT_AVG : float = 0.5
    COMPLEXITY_WEIGHT_MAX : float = 0.5

    # Sentence complexity
    WORDS_PER_COMPLEXITY_UNIT : float = 10.0
    CLAUSE_COMPLEXITY_FACTOR : float = 0.5

    # Grammatical patterns
    TRANSITION_WORDS_SET : tuple = ('however', 'therefore', 'moreover', 'furthermore', 'consequently', 'additionally', 'nevertheless', 'nonetheless', 'thus', 'hence')
    IDEAL_PASSIVE_RATIO : float = 0.3
    IDEAL_TRANSITION_RATIO : float = 0.2
    PASSIVE_DEPENDENCY : str = 'nsubjpass'
    CLAUSE_MARKERS : tuple = ('cc', 'mark')

    # Writing style analysis
    IDEAL_LENGTH_VARIATION : float = 0.5
    IDEAL_PUNCTUATION_RATIO : float = 0.1

    # SYNTHETIC pattern detection
    TRANSITION_OVERUSE_THRESHOLD : float = 0.05
    POS_SEQUENCE_FREQ_THRESHOLD : float = 0.1
    STRUCTURE_DIVERSITY_THRESHOLD : float = 0.5
    UNUSUAL_CONSTRUCTION_THRESHOLD : float = 0.02
    REPETITIVE_PHRASING_THRESHOLD : float = 0.3
    UNUSUAL_DEPENDENCIES : tuple = ('attr', 'oprd')

    # Chunk analysis
    CHUNK_SIZE_WORDS : int = 200
    CHUNK_OVERLAP_RATIO : float = 0.5
    MIN_CHUNK_LENGTH : int = 50
    MIN_SENTENCES_FOR_STRUCTURE : int = 3
    MIN_SENTENCES_FOR_ANALYSIS : int = 1

    # Synthetic probability thresholds
    POS_DIVERSITY_LOW_THRESHOLD : float = 0.3
    POS_DIVERSITY_MEDIUM_THRESHOLD : float = 0.5
    SYNTACTIC_COMPLEXITY_LOW_THRESHOLD : float = 2.0
    SYNTACTIC_COMPLEXITY_MEDIUM_THRESHOLD : float = 3.0
    GRAMMATICAL_CONSISTENCY_HIGH_THRESHOLD : float = 0.8
    GRAMMATICAL_CONSISTENCY_MEDIUM_THRESHOLD : float = 0.6
    TRANSITION_USAGE_HIGH_THRESHOLD : float = 0.3
    TRANSITION_USAGE_MEDIUM_THRESHOLD : float = 0.15
    SYNTHETIC_PATTERN_HIGH_THRESHOLD : float = 0.6
    SYNTHETIC_PATTERN_MEDIUM_THRESHOLD : float = 0.3
    COMPLEXITY_VARIANCE_LOW_THRESHOLD : float = 0.1
    COMPLEXITY_VARIANCE_MEDIUM_THRESHOLD : float = 0.3

    # Synthetic probability weights
    STRONG_SYNTHETIC_WEIGHT : float = 0.9
    MODERATE_SYNTHETIC_WEIGHT : float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT : float = 0.7
    WEAK_SYNTHETIC_WEIGHT : float = 0.6
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.5
    LOW_SYNTHETIC_WEIGHT : float = 0.4
    VERY_LOW_SYNTHETIC_WEIGHT : float = 0.3
    MINIMAL_SYNTHETIC_WEIGHT : float = 0.2

    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5
    LOW_FEATURE_CONFIDENCE : float = 0.3
    MIN_REQUIRED_FEATURES : int = 4

    # Hybrid probability calculation
    POS_DIVERSITY_MIXED_MIN : float = 0.35
    POS_DIVERSITY_MIXED_MAX : float = 0.55
    POS_ENTROPY_LOW_THRESHOLD : float = 0.35
    POS_ENTROPY_HIGH_THRESHOLD : float = 0.65
    COMPLEXITY_VARIANCE_HIGH_THRESHOLD : float = 0.5
    COMPLEXITY_VARIANCE_MEDIUM_THRESHOLD : float = 0.3
    SYNTHETIC_PATTERN_MIXED_MIN : float = 0.2
    SYNTHETIC_PATTERN_MIXED_MAX : float = 0.6
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.3
    MINIMAL_HYBRID_WEIGHT : float = 0.2
    MAX_HYBRID_PROBABILITY : float = 0.3

    # Default feature values
    DEFAULT_POS_DIVERSITY : float = 0.5
    DEFAULT_POS_ENTROPY : float = 2.5
    DEFAULT_SYNTACTIC_COMPLEXITY : float = 2.5
    DEFAULT_SENTENCE_COMPLEXITY : float = 2.0
    DEFAULT_GRAMMATICAL_CONSISTENCY : float = 0.5
    DEFAULT_TRANSITION_USAGE : float = 0.1
    DEFAULT_PASSIVE_RATIO : float = 0.2
    DEFAULT_WRITING_STYLE_SCORE : float = 0.5
    DEFAULT_SYNTHETIC_PATTERN_SCORE : float = 0.3
    DEFAULT_CHUNK_COMPLEXITY : float = 2.5
    DEFAULT_COMPLEXITY_VARIANCE : float = 0.2

    # Math and normalization
    LOG_BASE : int = 2
    ZERO_TOLERANCE : float = 1e-10


@dataclass(frozen = True)
class PerplexityMetricParams:
    """
    Hyperparameters for Perplexity Metric
    """
    # Text validation
    MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50

    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0

    # Model parameters
    MAX_TOKEN_LENGTH : int = 1024
    MIN_TOKENS_FOR_PERPLEXITY : int = 5
    MIN_SENTENCE_LENGTH : int = 20
    MIN_CHUNK_LENGTH : int = 50

    # Chunk analysis
    CHUNK_SIZE_WORDS : int = 200
    CHUNK_OVERLAP_RATIO : float = 0.5

    # Perplexity normalization
    PERPLEXITY_SIGMOID_CENTER : float = 30.0
    PERPLEXITY_SIGMOID_SCALE : float = 10.0

    # Cross-entropy normalization
    MAX_CROSS_ENTROPY : float = 5.0

    # Perplexity value thresholds (actual perplexity values)
    PERPLEXITY_VERY_LOW_THRESHOLD : float = 20.0
    PERPLEXITY_LOW_THRESHOLD : float = 40.0
    PERPLEXITY_HIGH_THRESHOLD : float = 80.0
    PERPLEXITY_VERY_HIGH_THRESHOLD : float = 150.0

    # Synthetic probability thresholds (normalized values 0-1)
    NORMALIZED_PERPLEXITY_HIGH_THRESHOLD : float = 0.7
    NORMALIZED_PERPLEXITY_MEDIUM_THRESHOLD : float = 0.5
    PERPLEXITY_VARIANCE_LOW_THRESHOLD : float = 50.0
    PERPLEXITY_VARIANCE_MEDIUM_THRESHOLD : float = 200.0
    STD_SENTENCE_PERPLEXITY_LOW_THRESHOLD : float = 20.0
    STD_SENTENCE_PERPLEXITY_MEDIUM_THRESHOLD : float = 50.0
    CROSS_ENTROPY_LOW_THRESHOLD : float = 0.3
    CROSS_ENTROPY_MEDIUM_THRESHOLD : float = 0.6
    CHUNK_VARIANCE_VERY_LOW_THRESHOLD : float = 25.0
    CHUNK_VARIANCE_LOW_THRESHOLD : float = 100.0

    # Synthetic probability weights
    STRONG_SYNTHETIC_WEIGHT : float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT : float = 0.6
    WEAK_SYNTHETIC_WEIGHT : float = 0.4
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.2
    VERY_LOW_SYNTHETIC_WEIGHT : float = 0.3
    MINIMAL_SYNTHETIC_WEIGHT : float = 0.2

    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5
    LOW_FEATURE_CONFIDENCE : float = 0.3
    MIN_REQUIRED_FEATURES : int = 3

    # Hybrid probability calculation
    NORMALIZED_PERPLEXITY_MIXED_MIN : float = 0.4
    NORMALIZED_PERPLEXITY_MIXED_MAX : float = 0.6
    PERPLEXITY_VARIANCE_HIGH_THRESHOLD : float = 200.0
    PERPLEXITY_VARIANCE_MEDIUM_THRESHOLD : float = 100.0
    STD_SENTENCE_PERPLEXITY_MIXED_MIN : float = 20.0
    STD_SENTENCE_PERPLEXITY_MIXED_MAX : float = 60.0
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.2
    MINIMAL_HYBRID_WEIGHT : float = 0.0
    MAX_HYBRID_PROBABILITY : float = 0.3

    # Default feature values
    DEFAULT_OVERALL_PERPLEXITY : float = 50.0
    DEFAULT_NORMALIZED_PERPLEXITY : float = 0.5
    DEFAULT_AVG_SENTENCE_PERPLEXITY : float = 50.0
    DEFAULT_STD_SENTENCE_PERPLEXITY : float = 25.0
    DEFAULT_MIN_SENTENCE_PERPLEXITY : float = 30.0
    DEFAULT_MAX_SENTENCE_PERPLEXITY : float = 70.0
    DEFAULT_PERPLEXITY_VARIANCE : float = 100.0
    DEFAULT_AVG_CHUNK_PERPLEXITY : float = 50.0
    DEFAULT_CROSS_ENTROPY_SCORE : float = 0.5

    # Math and normalization
    ZERO_TOLERANCE : float = 1e-10
    LARGE_PERPLEXITY_THRESHOLD : float = 1000.0

    # Regular expression for sentence splitting
    SENTENCE_SPLIT_PATTERN : str = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s'


@dataclass(frozen = True)
class EntropyMetricParams:
    """
    Hyperparameters for Entropy Metric
    """
    # Text validation
    MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50
    MIN_SENTENCE_LENGTH : int = 10
    MIN_WORDS_FOR_ANALYSIS : int = 5
    MIN_TOKENS_FOR_ANALYSIS : int = 10
    MIN_TOKENS_FOR_SEQUENCE : int = 20

    # Domain threshold application - PROBABILITY CONSTANTS
    STRONG_SYNTHETIC_BASE_PROB : float = 0.7
    STRONG_AUTHENTIC_BASE_PROB : float = 0.7
    WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
    UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
    UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
    UNCERTAIN_RANGE_WIDTH : float = 0.4
    NEUTRAL_PROBABILITY : float = 0.5
    MIN_PROBABILITY : float = 0.0
    MAX_PROBABILITY : float = 1.0

    # Chunk analysis
    CHUNK_SIZE_WORDS : int = 100
    CHUNK_OVERLAP_RATIO : float = 0.5
    MIN_CHUNK_LENGTH : int = 20

    # Sequence analysis
    MAX_BIGRAM_ENTROPY : float = 8.0

    # Entropy normalization
    MAX_CHAR_ENTROPY : float = 4.0

    # Synthetic probability thresholds
    CHAR_ENTROPY_VERY_LOW_THRESHOLD : float = 3.5
    CHAR_ENTROPY_LOW_THRESHOLD : float = 3.8
    CHAR_ENTROPY_MEDIUM_THRESHOLD : float = 4.0
    ENTROPY_VARIANCE_VERY_LOW_THRESHOLD : float = 0.1
    ENTROPY_VARIANCE_LOW_THRESHOLD : float = 0.2
    ENTROPY_VARIANCE_MEDIUM_THRESHOLD : float = 0.3
    TOKEN_DIVERSITY_LOW_THRESHOLD : float = 0.6
    TOKEN_DIVERSITY_MEDIUM_THRESHOLD : float = 0.7
    TOKEN_DIVERSITY_HIGH_THRESHOLD : float = 0.8
    SEQUENCE_UNPREDICTABILITY_LOW_THRESHOLD : float = 0.3
    SEQUENCE_UNPREDICTABILITY_MEDIUM_THRESHOLD : float = 0.4
    SEQUENCE_UNPREDICTABILITY_HIGH_THRESHOLD : float = 0.5
    SYNTHETIC_PATTERN_SCORE_HIGH_THRESHOLD : float = 0.75
    SYNTHETIC_PATTERN_SCORE_MEDIUM_THRESHOLD : float = 0.5
    TOKEN_ENTROPY_LOW_THRESHOLD : float = 6.5

    # Synthetic probability weights
    STRONG_SYNTHETIC_WEIGHT : float = 0.9
    VERY_STRONG_SYNTHETIC_WEIGHT : float = 0.8
    MEDIUM_SYNTHETIC_WEIGHT : float = 0.7
    MODERATE_SYNTHETIC_WEIGHT : float = 0.6
    WEAK_SYNTHETIC_WEIGHT : float = 0.5
    VERY_WEAK_SYNTHETIC_WEIGHT : float = 0.4
    LOW_SYNTHETIC_WEIGHT : float = 0.3
    MINIMAL_SYNTHETIC_WEIGHT : float = 0.2
    VERY_LOW_SYNTHETIC_WEIGHT : float = 0.1

    # Confidence calculation
    CONFIDENCE_STD_NORMALIZER : float = 0.5
    MIN_CONFIDENCE : float = 0.1
    MAX_CONFIDENCE : float = 0.9
    NEUTRAL_CONFIDENCE : float = 0.5
    LOW_FEATURE_CONFIDENCE : float = 0.3
    MIN_REQUIRED_FEATURES : int = 2

    # Hybrid probability calculation
    ENTROPY_VARIANCE_HIGH_THRESHOLD : float = 0.5
    ENTROPY_VARIANCE_MIXED_THRESHOLD : float = 0.3
    ENTROPY_DISCREPANCY_THRESHOLD : float = 1.0
    SYNTHETIC_PATTERN_MIXED_MIN : float = 0.4
    SYNTHETIC_PATTERN_MIXED_MAX : float = 0.6
    STRONG_HYBRID_WEIGHT : float = 0.6
    MODERATE_HYBRID_WEIGHT : float = 0.4
    WEAK_HYBRID_WEIGHT : float = 0.3
    MINIMAL_HYBRID_WEIGHT : float = 0.0
    MAX_HYBRID_PROBABILITY : float = 0.4

    # Default feature values
    DEFAULT_CHAR_ENTROPY : float = 3.8
    DEFAULT_WORD_ENTROPY : float = 6.0
|
| 669 |
+
DEFAULT_TOKEN_ENTROPY : float = 8.0
|
| 670 |
+
DEFAULT_TOKEN_DIVERSITY : float = 0.7
|
| 671 |
+
DEFAULT_SEQUENCE_UNPREDICTABILITY : float = 0.5
|
| 672 |
+
DEFAULT_ENTROPY_VARIANCE : float = 0.2
|
| 673 |
+
DEFAULT_AVG_CHUNK_ENTROPY : float = 3.8
|
| 674 |
+
DEFAULT_PREDICTABILITY_SCORE : float = 0.5
|
| 675 |
+
|
| 676 |
+
# Math and normalization
|
| 677 |
+
ZERO_TOLERANCE : float = 1e-10
|
| 678 |
+
|
| 679 |
+
|
| 680 |
+
@dataclass(frozen = True)
|
| 681 |
+
class MultiPerturbationStabilityMetricParams:
|
| 682 |
+
"""
|
| 683 |
+
Hyperparameters for Multi-Perturbation Stability Metric
|
| 684 |
+
"""
|
| 685 |
+
# Text validation
|
| 686 |
+
MIN_TEXT_LENGTH_FOR_ANALYSIS : int = 50
|
| 687 |
+
MIN_TEXT_LENGTH_FOR_PERTURBATION : int = 10
|
| 688 |
+
MIN_TOKENS_FOR_LIKELIHOOD : int = 3
|
| 689 |
+
MIN_WORDS_FOR_PERTURBATION : int = 3
|
| 690 |
+
MIN_WORDS_FOR_DELETION : int = 5
|
| 691 |
+
|
| 692 |
+
# Domain threshold application - PROBABILITY CONSTANTS
|
| 693 |
+
STRONG_SYNTHETIC_BASE_PROB : float = 0.7
|
| 694 |
+
STRONG_AUTHENTIC_BASE_PROB : float = 0.7
|
| 695 |
+
WEAK_PROBABILITY_ADJUSTMENT : float = 0.3
|
| 696 |
+
UNCERTAIN_SYNTHETIC_RANGE_START : float = 0.3
|
| 697 |
+
UNCERTAIN_AUTHENTIC_RANGE_START : float = 0.7
|
| 698 |
+
UNCERTAIN_RANGE_WIDTH : float = 0.4
|
| 699 |
+
NEUTRAL_PROBABILITY : float = 0.5
|
| 700 |
+
MIN_PROBABILITY : float = 0.0
|
| 701 |
+
MAX_PROBABILITY : float = 1.0
|
| 702 |
+
|
| 703 |
+
# Perturbation parameters
|
| 704 |
+
NUM_PERTURBATIONS : int = 10
|
| 705 |
+
MAX_PERTURBATION_ATTEMPTS : int = 10
|
| 706 |
+
PERTURBATION_DELETION_RATIO : float = 0.1
|
| 707 |
+
ROBBERTA_TOP_K_PREDICTIONS : int = 3
|
| 708 |
+
|
| 709 |
+
# Text preprocessing
|
| 710 |
+
MAX_TEXT_LENGTH_FOR_ANALYSIS : int = 2000
|
| 711 |
+
MAX_TEXT_LENGTH_FOR_PERTURBATION : int = 1000
|
| 712 |
+
MAX_TOKEN_LENGTH : int = 256
|
| 713 |
+
MAX_ROBERTA_TOKEN_LENGTH : int = 128
|
| 714 |
+
|
| 715 |
+
# Chunk analysis
|
| 716 |
+
CHUNK_SIZE_WORDS : int = 150
|
| 717 |
+
CHUNK_OVERLAP_RATIO : float = 0.5
|
| 718 |
+
MIN_CHUNK_LENGTH : int = 50
|
| 719 |
+
CHUNK_DELETION_RATIO : float = 0.1
|
| 720 |
+
|
| 721 |
+
# Likelihood calculation
|
| 722 |
+
MIN_VALID_PERTURBATIONS : int = 3
|
| 723 |
+
DEFAULT_LIKELIHOOD : float = 2.0
|
| 724 |
+
MIN_LIKELIHOOD : float = 0.5
|
| 725 |
+
MAX_LIKELIHOOD : float = 10.0
|
| 726 |
+
|
| 727 |
+
# Stability scoring
|
| 728 |
+
STABILITY_HIGH_THRESHOLD : float = 0.7
|
| 729 |
+
STABILITY_MEDIUM_THRESHOLD : float = 0.5
|
| 730 |
+
STABILITY_LOW_THRESHOLD : float = 0.3
|
| 731 |
+
RELATIVE_DROP_HIGH_THRESHOLD : float = 0.5
|
| 732 |
+
RELATIVE_DROP_MEDIUM_THRESHOLD : float = 0.3
|
| 733 |
+
RELATIVE_DROP_LOW_THRESHOLD : float = 0.15
|
| 734 |
+
|
| 735 |
+
# Curvature scoring
|
| 736 |
+
CURVATURE_HIGH_THRESHOLD : float = 0.7
|
| 737 |
+
CURVATURE_MEDIUM_THRESHOLD : float = 0.5
|
| 738 |
+
CURVATURE_LOW_THRESHOLD : float = 0.3
|
| 739 |
+
CURVATURE_SCALING_FACTOR : float = 3.0
|
| 740 |
+
|
| 741 |
+
# Likelihood ratio thresholds
|
| 742 |
+
LIKELIHOOD_RATIO_HIGH_THRESHOLD : float = 0.8
|
| 743 |
+
LIKELIHOOD_RATIO_MEDIUM_THRESHOLD : float = 0.6
|
| 744 |
+
LIKELIHOOD_RATIO_LOW_THRESHOLD : float = 0.4
|
| 745 |
+
MAX_LIKELIHOOD_RATIO : float = 3.0
|
| 746 |
+
MIN_LIKELIHOOD_RATIO : float = 0.33
|
| 747 |
+
|
| 748 |
+
# Stability variance thresholds
|
| 749 |
+
STABILITY_VARIANCE_VERY_LOW : float = 0.05
|
| 750 |
+
STABILITY_VARIANCE_LOW : float = 0.1
|
| 751 |
+
STABILITY_VARIANCE_HIGH : float = 0.15
|
| 752 |
+
|
| 753 |
+
# Synthetic probability weights
|
| 754 |
+
STABILITY_WEIGHT : float = 0.3
|
| 755 |
+
CURVATURE_WEIGHT : float = 0.25
|
| 756 |
+
RATIO_WEIGHT : float = 0.25
|
| 757 |
+
VARIANCE_WEIGHT : float = 0.2
|
| 758 |
+
|
| 759 |
+
# Synthetic probability thresholds
|
| 760 |
+
STABILITY_STRONG_THRESHOLD : float = 0.9
|
| 761 |
+
STABILITY_MEDIUM_STRONG_THRESHOLD : float = 0.7
|
| 762 |
+
STABILITY_MODERATE_THRESHOLD : float = 0.5
|
| 763 |
+
STABILITY_WEAK_THRESHOLD : float = 0.2
|
| 764 |
+
CURVATURE_STRONG_THRESHOLD : float = 0.8
|
| 765 |
+
CURVATURE_MEDIUM_THRESHOLD : float = 0.6
|
| 766 |
+
CURVATURE_MODERATE_THRESHOLD : float = 0.4
|
| 767 |
+
CURVATURE_WEAK_THRESHOLD : float = 0.2
|
| 768 |
+
RATIO_STRONG_THRESHOLD : float = 0.9
|
| 769 |
+
RATIO_MEDIUM_THRESHOLD : float = 0.7
|
| 770 |
+
RATIO_MODERATE_THRESHOLD : float = 0.5
|
| 771 |
+
RATIO_WEAK_THRESHOLD : float = 0.3
|
| 772 |
+
VARIANCE_STRONG_THRESHOLD : float = 0.8
|
| 773 |
+
VARIANCE_MODERATE_THRESHOLD : float = 0.5
|
| 774 |
+
VARIANCE_WEAK_THRESHOLD : float = 0.2
|
| 775 |
+
|
| 776 |
+
# Confidence calculation
|
| 777 |
+
CONFIDENCE_BASE : float = 0.5
|
| 778 |
+
CONFIDENCE_STD_FACTOR : float = 0.5
|
| 779 |
+
MIN_CONFIDENCE : float = 0.1
|
| 780 |
+
MAX_CONFIDENCE : float = 0.9
|
| 781 |
+
NEUTRAL_CONFIDENCE : float = 0.5
|
| 782 |
+
LOW_FEATURE_CONFIDENCE : float = 0.3
|
| 783 |
+
MIN_REQUIRED_FEATURES : int = 3
|
| 784 |
+
|
| 785 |
+
# Hybrid probability calculation
|
| 786 |
+
STABILITY_MIXED_MIN : float = 0.35
|
| 787 |
+
STABILITY_MIXED_MAX : float = 0.55
|
| 788 |
+
STABILITY_VARIANCE_MIXED_HIGH : float = 0.15
|
| 789 |
+
STABILITY_VARIANCE_MIXED_MEDIUM : float = 0.1
|
| 790 |
+
LIKELIHOOD_RATIO_MIXED_MIN : float = 0.5
|
| 791 |
+
LIKELIHOOD_RATIO_MIXED_MAX : float = 0.8
|
| 792 |
+
MODERATE_HYBRID_WEIGHT : float = 0.4
|
| 793 |
+
WEAK_HYBRID_WEIGHT : float = 0.3
|
| 794 |
+
VERY_WEAK_HYBRID_WEIGHT : float = 0.2
|
| 795 |
+
MINIMAL_HYBRID_WEIGHT : float = 0.0
|
| 796 |
+
MAX_HYBRID_PROBABILITY : float = 0.3
|
| 797 |
+
|
| 798 |
+
# Default feature values
|
| 799 |
+
DEFAULT_ORIGINAL_LIKELIHOOD : float = 2.0
|
| 800 |
+
DEFAULT_AVG_PERTURBED_LIKELIHOOD : float = 1.8
|
| 801 |
+
DEFAULT_LIKELIHOOD_RATIO : float = 1.1
|
| 802 |
+
DEFAULT_NORMALIZED_LIKELIHOOD_RATIO : float = 0.55
|
| 803 |
+
DEFAULT_STABILITY_SCORE : float = 0.3
|
| 804 |
+
DEFAULT_CURVATURE_SCORE : float = 0.3
|
| 805 |
+
DEFAULT_PERTURBATION_VARIANCE : float = 0.05
|
| 806 |
+
DEFAULT_AVG_CHUNK_STABILITY : float = 0.3
|
| 807 |
+
DEFAULT_STABILITY_VARIANCE : float = 0.1
|
| 808 |
+
|
| 809 |
+
# Math and normalization
|
| 810 |
+
ZERO_TOLERANCE : float = 1e-10
|
| 811 |
+
|
| 812 |
+
# Common words to avoid masking
|
| 813 |
+
COMMON_WORDS_TO_AVOID : tuple = ('the', 'and', 'but', 'for', 'with', 'that', 'this', 'have', 'from', 'were')
|
| 814 |
+
|
| 815 |
+
|
| 816 |
+
@dataclass(frozen = True)
|
| 817 |
+
class MetricsEnsembleParams:
|
| 818 |
+
"""
|
| 819 |
+
Constants for MEtrics Ensemble Classifier
|
| 820 |
+
"""
|
| 821 |
+
# Minimum requirements
|
| 822 |
+
MIN_METRICS_REQUIRED : int = 3
|
| 823 |
+
|
| 824 |
+
# Default probabilities
|
| 825 |
+
DEFAULT_SYNTHETIC_PROB : float = 0.5
|
| 826 |
+
DEFAULT_AUTHENTIC_PROB : float = 0.5
|
| 827 |
+
DEFAULT_HYBRID_PROB : float = 0.0
|
| 828 |
+
|
| 829 |
+
# Weighting
|
| 830 |
+
SIGMOID_CONFIDENCE_SCALE : float = 10.0
|
| 831 |
+
SIGMOID_CENTER : float = 0.5
|
| 832 |
+
|
| 833 |
+
# Confidence composition
|
| 834 |
+
CONFIDENCE_WEIGHT_BASE : float = 0.4
|
| 835 |
+
CONFIDENCE_WEIGHT_AGREEMENT : float = 0.3
|
| 836 |
+
CONFIDENCE_WEIGHT_CERTAINTY : float = 0.2
|
| 837 |
+
CONFIDENCE_WEIGHT_QUALITY : float = 0.1
|
| 838 |
+
|
| 839 |
+
# Uncertainty composition
|
| 840 |
+
UNCERTAINTY_WEIGHT_VARIANCE : float = 0.4
|
| 841 |
+
UNCERTAINTY_WEIGHT_CONFIDENCE : float = 0.3
|
| 842 |
+
UNCERTAINTY_WEIGHT_DECISION : float = 0.3
|
| 843 |
+
|
| 844 |
+
# Consensus
|
| 845 |
+
CONSENSUS_STD_SCALING : float = 2.0
|
| 846 |
+
|
| 847 |
+
# Hybrid detection
|
| 848 |
+
HYBRID_PROB_THRESHOLD : float = 0.25
|
| 849 |
+
HYBRID_UNCERTAINTY_THRESHOLD : float = 0.6
|
| 850 |
+
HYBRID_SYNTHETIC_RANGE_LOW : float = 0.3
|
| 851 |
+
HYBRID_SYNTHETIC_RANGE_HIGH : float = 0.7
|
| 852 |
+
|
| 853 |
+
# Threshold adaptation
|
| 854 |
+
UNCERTAINTY_THRESHOLD_ADJUSTMENT : float = 0.1
|
| 855 |
+
|
| 856 |
+
# Contribution labels
|
| 857 |
+
CONTRIBUTION_HIGH : float = 0.15
|
| 858 |
+
CONTRIBUTION_MEDIUM : float = 0.08
|
| 859 |
+
|
| 860 |
+
HIGH_CONFIDENCE_THRESHOLD : float = 0.7
|
| 861 |
+
|
| 862 |
+
|
| 863 |
+
|
| 864 |
+
# Singleton instances for parameter classes
|
| 865 |
+
document_extraction_params = DocumentExtractionParams()
|
| 866 |
+
language_detection_params = LanguageDetectionParams()
|
| 867 |
+
domain_classification_params = DomainClassificationParams()
|
| 868 |
+
text_processing_params = TextProcessingParams()
|
| 869 |
+
base_metric_params = BaseMetricParams()
|
| 870 |
+
structural_metric_params = StructuralMetricParams()
|
| 871 |
+
semantic_analysis_params = SemanticAnalysisParams()
|
| 872 |
+
linguistic_metric_params = LinguisticMetricParams()
|
| 873 |
+
perplexity_metric_params = PerplexityMetricParams()
|
| 874 |
+
entropy_metric_params = EntropyMetricParams()
|
| 875 |
+
multi_perturbation_stability_metric_params = MultiPerturbationStabilityMetricParams()
|
| 876 |
+
metrics_ensemble_params = MetricsEnsembleParams()
|
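Taken together, these frozen dataclasses act as read-only hyperparameter namespaces: each metric module imports its singleton instead of scattering magic numbers through the scoring code. A minimal sketch of the intended consumption pattern follows; `score_perplexity` is a hypothetical helper written for illustration, not a function from this commit.

```python
# Illustrative only: shows how a metric is expected to consume the frozen
# parameter singletons defined in config/constants.py.
from config.constants import perplexity_metric_params as P


def score_perplexity(overall_perplexity: float) -> float:
    """Map a raw perplexity value to a synthetic-probability weight."""
    if overall_perplexity < P.PERPLEXITY_VERY_LOW_THRESHOLD:    # < 20.0
        return P.STRONG_SYNTHETIC_WEIGHT                        # 0.8
    if overall_perplexity < P.PERPLEXITY_LOW_THRESHOLD:         # < 40.0
        return P.MEDIUM_SYNTHETIC_WEIGHT                        # 0.6
    if overall_perplexity > P.PERPLEXITY_VERY_HIGH_THRESHOLD:   # > 150.0
        return P.MINIMAL_SYNTHETIC_WEIGHT                       # 0.2
    return P.WEAK_SYNTHETIC_WEIGHT                              # 0.4


# frozen=True turns accidental mutation into an error instead of silent drift:
# P.MIN_CONFIDENCE = 0.2  would raise dataclasses.FrozenInstanceError
```

Because every instance is declared with `frozen = True`, reassigning any field raises `dataclasses.FrozenInstanceError`, which keeps calibration values stable for the lifetime of the process.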
config/enums.py
ADDED
@@ -0,0 +1,106 @@
+# DEPENDENCIES
+from enum import Enum
+
+
+class ModelType(Enum):
+    """
+    Model types for categorization
+    """
+    TRANSFORMER = "transformer"
+    SENTENCE_TRANSFORMER = "sentence_transformer"
+    LANGUAGE_MODEL = "language_model"
+    MASKED_LANGUAGE_MODEL = "masked_language_model"
+    CLASSIFIER = "classifier"
+    EMBEDDING = "embedding"
+    RULE_BASED = "rule_based"
+    SEQUENCE_CLASSIFICATION = "sequence_classification"
+    CAUSAL_LM = "causal_lm"
+    MASKED_LM = "masked_lm"
+
+
+class Domain(Enum):
+    """
+    Text domains for adaptive thresholding
+    """
+    # Core domains
+    GENERAL = "general"
+    ACADEMIC = "academic"
+    CREATIVE = "creative"
+    AI_ML = "ai_ml"  # domain topic, not authorship
+    SOFTWARE_DEV = "software_dev"
+    TECHNICAL_DOC = "technical_doc"
+    ENGINEERING = "engineering"
+    SCIENCE = "science"
+    BUSINESS = "business"
+    LEGAL = "legal"
+    MEDICAL = "medical"
+    JOURNALISM = "journalism"
+    MARKETING = "marketing"
+    SOCIAL_MEDIA = "social_media"
+    BLOG_PERSONAL = "blog_personal"
+    TUTORIAL = "tutorial"
+
+
+class Language(Enum):
+    """
+    ISO 639-1 language codes for supported languages
+    """
+    ENGLISH = "en"
+    SPANISH = "es"
+    FRENCH = "fr"
+    GERMAN = "de"
+    ITALIAN = "it"
+    PORTUGUESE = "pt"
+    RUSSIAN = "ru"
+    CHINESE = "zh"
+    JAPANESE = "ja"
+    KOREAN = "ko"
+    ARABIC = "ar"
+    HINDI = "hi"
+    DUTCH = "nl"
+    POLISH = "pl"
+    TURKISH = "tr"
+    SWEDISH = "sv"
+    VIETNAMESE = "vi"
+    INDONESIAN = "id"
+    THAI = "th"
+    GREEK = "el"
+    HEBREW = "he"
+    CZECH = "cs"
+    ROMANIAN = "ro"
+    DANISH = "da"
+    FINNISH = "fi"
+    NORWEGIAN = "no"
+    UNKNOWN = "unknown"
+
+
+class Script(Enum):
+    """
+    Writing scripts
+    """
+    LATIN = "latin"
+    CYRILLIC = "cyrillic"
+    ARABIC = "arabic"
+    CHINESE = "chinese"
+    JAPANESE = "japanese"
+    KOREAN = "korean"
+    DEVANAGARI = "devanagari"
+    GREEK = "greek"
+    HEBREW = "hebrew"
+    THAI = "thai"
+    MIXED = "mixed"
+    UNKNOWN = "unknown"
+
+
+class ConfidenceLevel(Enum):
+    """
+    Confidence levels for authenticity estimation
+    """
+    VERY_LOW = "very_low"
+    LOW = "low"
+    MEDIUM = "medium"
+    HIGH = "high"
+    VERY_HIGH = "very_high"
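These enums are the shared vocabulary for the rest of the configuration. A small sketch of how they are meant to travel through the pipeline; serializing via `.value` keeps API payloads stable even if member names change:

```python
# Illustrative usage of the enums above; assumes the config.enums module
# layout introduced in this commit.
from config.enums import Domain, Language, ConfidenceLevel

domain = Domain("social_media")      # parse from a string payload
assert domain is Domain.SOCIAL_MEDIA

payload = {"domain"     : domain.value,                # "social_media"
           "language"   : Language.ENGLISH.value,      # "en"
           "confidence" : ConfidenceLevel.HIGH.value,  # "high"
          }

# Enum(value) raises ValueError on unknown codes, so unsupported inputs
# fail fast instead of flowing through as bare strings.
try:
    Language("xx")
except ValueError:
    payload["language"] = Language.UNKNOWN.value
```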
config/model_config.py
CHANGED
@@ -1,51 +1,14 @@
 # DEPENDENCIES
-from enum import Enum
 from typing import Any
 from typing import Dict
 from typing import Optional
-from dataclasses import field
-from dataclasses import dataclass
+from config.enums import ModelType
+from config.schemas import ModelConfig
 
 
-class ModelType(Enum):
-    """
-    Model types for categorization
-    """
-    TRANSFORMER = "transformer"
-    SENTENCE_TRANSFORMER = "sentence_transformer"
-    GPT = "gpt"
-    GPTMASK = "gpt"
-    CLASSIFIER = "classifier"
-    EMBEDDING = "embedding"
-    RULE_BASED = "rule_based"
-    SEQUENCE_CLASSIFICATION = "sequence_classification"
-    CAUSAL_LM = "causal_lm"
-    MASKED_LM = "masked_lm"
-
-
-@dataclass
-class ModelConfig:
-    """
-    Configuration for a single model
-    """
-    model_id : str
-    model_type : ModelType
-    description : str
-    size_mb : int
-    required : bool = True
-    download_priority : int = 1  # 1=highest, 5=lowest
-    quantizable : bool = True
-    onnx_compatible : bool = False
-    cache_model : bool = True
-    max_length : Optional[int] = None
-    batch_size : int = 1
-    additional_params : Dict[str, Any] = field(default_factory = dict)
-
-
-MODEL_REGISTRY : Dict[str, ModelConfig] = {"perplexity_gpt2" : ModelConfig(model_id = "gpt2",
-                                                                           model_type = ModelType.GPT,
-                                                                           description = "GPT-2 base for perplexity calculation",
+MODEL_REGISTRY : Dict[str, ModelConfig] = {"perplexity_reference_lm" : ModelConfig(model_id = "gpt2",
+                                                                                   model_type = ModelType.LANGUAGE_MODEL,
+                                                                                   description = "Reference language model for statistical perplexity estimation",
                                                                                    size_mb = 548,
                                                                                    required = True,
                                                                                    download_priority = 1,
@@ -80,9 +43,9 @@
                                                              batch_size = 16,
                                                              additional_params = {"is_spacy_model": True},
                                                             ),
-                   "…" : ModelConfig(model_id = "…",
+                   "content_domain_classifier" : ModelConfig(model_id = "cross-encoder/nli-roberta-base",
                                                              model_type = ModelType.CLASSIFIER,
-                                                             description = "…",
+                                                             description = "Zero-shot content domain inference model",
                                                              size_mb = 500,
                                                              required = True,
                                                              download_priority = 1,
@@ -120,7 +83,7 @@
                                                     ),
                    "language_detector" : ModelConfig(model_id = "papluca/xlm-roberta-base-language-detection",
                                                      model_type = ModelType.CLASSIFIER,
-                                                     description = "Language detection …",
+                                                     description = "Language detection for routing; not used in authenticity scoring",
                                                      size_mb = 1100,
                                                      required = False,
                                                      download_priority = 5,
@@ -131,18 +94,18 @@
 
 
 # MODEL GROUPS FOR BATCH DOWNLOADING
-MODEL_GROUPS = {"minimal" : ["…"],
-                "essential" : ["…"],
+MODEL_GROUPS = {"minimal" : ["perplexity_reference_lm", "content_domain_classifier"],
+                "essential" : ["perplexity_reference_lm", "semantic_primary", "linguistic_spacy", "content_domain_classifier"],
                 "extended" : ["semantic_secondary", "multi_perturbation_mask", "domain_classifier_fallback"],
                 "optional" : ["language_detector"],
               }
 
 
 # MODEL WEIGHTS FOR ENSEMBLE : For 6 metrics implemented
-DEFAULT_MODEL_WEIGHTS = {"…" : 0.20,
-                         "perplexity" : 0.20,  # …
-                         "entropy" : 0.15,  # …
-                         "…" : 0.20,
+DEFAULT_MODEL_WEIGHTS = {"structural" : 0.20,                    # No model needed
+                         "perplexity" : 0.20,                    # reference language model
+                         "entropy" : 0.15,                       # token distribution statistics
+                         "semantic" : 0.20,                      # all-MiniLM-L6-v2
                          "linguistic" : 0.15,                    # spacy
                          "multi_perturbation_stability" : 0.10,  # gpt2 + distilroberta (optional)
                         }
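As a quick sanity check of the registry layout above, the sketch below resolves a download group to its configs and verifies that the ensemble weights remain a convex combination (0.20 + 0.20 + 0.15 + 0.20 + 0.15 + 0.10 = 1.00). It assumes only the names defined in this diff.

```python
# Illustrative sketch against the registry defined in config/model_config.py.
import math

from config.model_config import MODEL_REGISTRY, MODEL_GROUPS, DEFAULT_MODEL_WEIGHTS

# Resolve a named group to its model configs, skipping unregistered keys
minimal = [MODEL_REGISTRY[name] for name in MODEL_GROUPS["minimal"] if name in MODEL_REGISTRY]

total_mb = sum(cfg.size_mb for cfg in minimal)
print(f"'minimal' group: {len(minimal)} models, ~{total_mb} MB to download")

# The ensemble weights are meant to sum to 1.0 across the 6 metrics
assert math.isclose(sum(DEFAULT_MODEL_WEIGHTS.values()), 1.0), "weights must sum to 1"
```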
config/schemas.py
ADDED
@@ -0,0 +1,452 @@
+# DEPENDENCIES
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from dataclasses import field
+from datetime import datetime
+from config.enums import Script
+from config.enums import Domain
+from config.enums import Language
+from dataclasses import dataclass
+from config.enums import ModelType
+from config.enums import ConfidenceLevel
+
+
+@dataclass
+class ModelConfig:
+    """
+    Configuration for a single model
+    """
+    model_id : str
+    model_type : ModelType
+    description : str
+    size_mb : int
+    required : bool = True
+    download_priority : int = 1  # 1=highest, 5=lowest
+    quantizable : bool = True
+    onnx_compatible : bool = False
+    cache_model : bool = True
+    max_length : Optional[int] = None
+    batch_size : int = 1
+    additional_params : Dict[str, Any] = field(default_factory = dict)
+
+
+@dataclass
+class ModelUsageStats:
+    """
+    Lightweight model usage statistics
+    """
+    model_name : str
+    usage_count : int
+    last_used : datetime
+    timed_usage_count : int
+    total_usage_time_seconds : float
+    avg_usage_time_seconds : float
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary
+        """
+        return {"model_name" : self.model_name,
+                "usage_count" : self.usage_count,
+                "last_used" : self.last_used.isoformat() if self.last_used else None,
+                "timed_usage_count" : self.timed_usage_count,
+                "total_usage_time_seconds" : round(self.total_usage_time_seconds, 2),
+                "avg_usage_time_seconds" : round(self.avg_usage_time_seconds, 2),
+               }
+
+
+@dataclass
+class ExtractedDocument:
+    """
+    Container for extracted document content with metadata
+    """
+    text : str
+    file_path : Optional[str]
+    file_type : str
+    file_size_bytes : int
+    page_count : int
+    extraction_method : str
+    metadata : Dict[str, Any]
+    is_success : bool
+    error_message : Optional[str]
+    warnings : List[str]
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization
+        """
+        return {"text_length" : len(self.text),
+                "file_type" : self.file_type,
+                "file_size_bytes" : self.file_size_bytes,
+                "page_count" : self.page_count,
+                "extraction_method" : self.extraction_method,
+                "metadata" : self.metadata,
+                "is_success" : self.is_success,
+                "error_message" : self.error_message,
+                "warnings" : self.warnings,
+               }
+
+
+@dataclass
+class ProcessedText:
+    """
+    Container for processed text with metadata
+    """
+    original_text : str
+    cleaned_text : str
+    sentences : List[str]
+    words : List[str]
+    paragraphs : List[str]
+    char_count : int
+    word_count : int
+    sentence_count : int
+    paragraph_count : int
+    avg_sentence_length : float
+    avg_word_length : float
+    is_valid : bool
+    validation_errors : List[str]
+    metadata : Dict[str, Any]
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization
+        """
+        return {"original_length" : len(self.original_text),
+                "cleaned_length" : len(self.cleaned_text),
+                "char_count" : self.char_count,
+                "word_count" : self.word_count,
+                "sentence_count" : self.sentence_count,
+                "paragraph_count" : self.paragraph_count,
+                "avg_sentence_length" : round(self.avg_sentence_length, 2),
+                "avg_word_length" : round(self.avg_word_length, 2),
+                "is_valid" : self.is_valid,
+                "validation_errors" : self.validation_errors,
+                "metadata" : self.metadata,
+               }
+
+
+@dataclass
+class LanguageDetectionResult:
+    """
+    Result of language detection
+    """
+    primary_language : Language
+    evidence_strength : float
+    all_languages : Dict[str, float]  # language_code -> evidence_strength
+    script : Script
+    is_multilingual : bool
+    detection_method : str
+    char_count : int
+    word_count : int
+    warnings : List[str]
+
+    def to_dict(self) -> Dict:
+        """
+        Convert to dictionary
+        """
+        return {"primary_language" : self.primary_language.value,
+                "evidence_strength" : round(self.evidence_strength, 4),
+                "all_languages" : {k: round(v, 4) for k, v in self.all_languages.items()},
+                "script" : self.script.value,
+                "is_multilingual" : self.is_multilingual,
+                "detection_method" : self.detection_method,
+                "char_count" : self.char_count,
+                "word_count" : self.word_count,
+                "warnings" : self.warnings,
+               }
+
+
+@dataclass
+class MetricThresholds:
+    """
+    Thresholds for a single metric
+    """
+    synthetic_threshold : float  # Above this = low authenticity
+    authentic_threshold : float  # Below this = high authenticity
+    confidence_multiplier : float = 1.0
+    weight : float = 1.0
+
+
+@dataclass
+class DomainThresholds:
+    """
+    Thresholds for 6 metrics in a specific domain
+    """
+    domain : Domain
+    structural : MetricThresholds
+    perplexity : MetricThresholds
+    entropy : MetricThresholds
+    semantic : MetricThresholds
+    linguistic : MetricThresholds
+    multi_perturbation_stability : MetricThresholds
+    ensemble_threshold : float = 0.5  # authenticity decision boundary
+
+
+@dataclass
+class DomainPrediction:
+    """
+    Result of domain classification
+    """
+    primary_domain : Domain
+    secondary_domain : Optional[Domain]
+    evidence_strength : float
+    domain_scores : Dict[str, float]
+
+
+class MetricResult:
+    """
+    Result from a metric calculation
+    """
+    def __init__(self, metric_name: str, synthetic_probability: float, authentic_probability: float, hybrid_probability: float, confidence: float, details: Optional[Dict[str, Any]] = None, error: Optional[str] = None):
+        self.metric_name = metric_name
+        self.synthetic_probability = max(0.0, min(1.0, synthetic_probability))
+        self.authentic_probability = max(0.0, min(1.0, authentic_probability))
+        self.hybrid_probability = max(0.0, min(1.0, hybrid_probability))
+        self.confidence = max(0.0, min(1.0, confidence))
+        self.details = details or {}
+        self.error = error
+
+        # Normalize probabilities to sum to 1
+        total = self.synthetic_probability + self.authentic_probability + self.hybrid_probability
+
+        if (total > 0):
+            self.synthetic_probability /= total
+            self.authentic_probability /= total
+            self.hybrid_probability /= total
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary
+        """
+        return {"metric_name" : self.metric_name,
+                "synthetic_probability" : round(self.synthetic_probability, 4),
+                "authentic_probability" : round(self.authentic_probability, 4),
+                "hybrid_probability" : round(self.hybrid_probability, 4),
+                "confidence" : round(self.confidence, 4),
+                "details" : self.details,
+                "error" : self.error,
+                "success" : self.error is None,
+               }
+
+    @property
+    def is_synthetic(self) -> bool:
+        """
+        Check if classified as synthetic
+        """
+        return self.synthetic_probability > max(self.authentic_probability, self.hybrid_probability)
+
+    @property
+    def is_authentic(self) -> bool:
+        """
+        Check if classified as authentic
+        """
+        return self.authentic_probability > max(self.synthetic_probability, self.hybrid_probability)
+
+    @property
+    def is_hybrid(self) -> bool:
+        """
+        Check if classified as hybrid
+        """
+        return self.hybrid_probability > max(self.synthetic_probability, self.authentic_probability)
+
+    @property
+    def predicted_class(self) -> str:
+        """
+        Get predicted class
+        """
+        if self.is_synthetic:
+            return "Synthetic"
+        elif self.is_authentic:
+            return "Authentic"
+        else:
+            return "Hybrid"
+
+
+@dataclass
+class EnsembleResult:
+    """
+    Result from ensemble classification
+    """
+    final_verdict : str  # "Synthetically-Generated-Text", "Authentically-Written-Text", or "Hybrid-Text"
+    synthetic_probability : float
+    authentic_probability : float
+    hybrid_probability : float
+    overall_confidence : float
+    domain : Domain
+    metric_results : Dict[str, MetricResult]
+    metric_weights : Dict[str, float]
+    weighted_scores : Dict[str, float]
+    reasoning : List[str]
+    uncertainty_score : float
+    consensus_level : float
+    execution_mode : str
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization
+        """
+        return {"final_verdict" : self.final_verdict,
+                "synthetic_probability" : round(self.synthetic_probability, 4),
+                "authentic_probability" : round(self.authentic_probability, 4),
+                "hybrid_probability" : round(self.hybrid_probability, 4),
+                "overall_confidence" : round(self.overall_confidence, 4),
+                "domain" : self.domain.value,
+                "uncertainty_score" : round(self.uncertainty_score, 4),
+                "consensus_level" : round(self.consensus_level, 4),
+                "metric_contributions" : {name: {"weight" : round(self.metric_weights.get(name, 0.0), 4),
+                                                 "weighted_score" : round(self.weighted_scores.get(name, 0.0), 4),
+                                                 "synthetic_prob" : round(result.synthetic_probability, 4),
+                                                 "confidence" : round(result.confidence, 4),
+                                                }
+                                          for name, result in self.metric_results.items()
+                                         },
+                "reasoning" : self.reasoning,
+                "execution_mode" : self.execution_mode,
+               }
+
+
+@dataclass
+class HighlightedSentenceResult:
+    """
+    A sentence with highlighting information
+    """
+    text : str
+    synthetic_probability : float
+    authentic_probability : float
+    hybrid_probability : float
+    confidence : float
+    confidence_level : ConfidenceLevel
+    color_class : str
+    tooltip : str
+    index : int
+    is_hybrid_content : bool
+    metric_breakdown : Optional[Dict[str, float]] = None
+
+
+@dataclass
+class DetectionResult:
+    """
+    Complete detection result with all metadata
+    """
+    # Final results
+    ensemble_result : EnsembleResult
+
+    # Input metadata
+    processed_text : ProcessedText
+    domain_prediction : DomainPrediction
+    language_result : Optional[LanguageDetectionResult]
+
+    # Metric details
+    metric_results : Dict[str, MetricResult]
+
+    # Performance metrics
+    processing_time : float
+    metrics_execution_time : Dict[str, float]
+
+    # Warnings and errors
+    warnings : List[str]
+    errors : List[str]
+
+    # File information
+    file_info : Optional[Dict[str, Any]] = None
+
+    # Execution mode
+    execution_mode : Optional[str] = "parallel"
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary for JSON serialization
+        """
+        result = {"prediction" : {"verdict" : self.ensemble_result.final_verdict,
+                                  "synthetic_probability" : round(self.ensemble_result.synthetic_probability, 4),
+                                  "authentic_probability" : round(self.ensemble_result.authentic_probability, 4),
+                                  "hybrid_probability" : round(self.ensemble_result.hybrid_probability, 4),
+                                  "confidence" : round(self.ensemble_result.overall_confidence, 4),
+                                 },
+                  "analysis" : {"domain" : self.domain_prediction.primary_domain.value,
+                                "domain_confidence" : round(self.domain_prediction.evidence_strength, 4),
+                                "language" : self.language_result.primary_language.value if self.language_result else "unknown",
+                                "language_confidence" : round(self.language_result.evidence_strength, 4) if self.language_result else 0.0,
+                                "text_length" : self.processed_text.word_count,
+                                "sentence_count" : self.processed_text.sentence_count,
+                               },
+                  "metrics" : {name: result.to_dict() for name, result in self.metric_results.items()},
+                  "ensemble" : self.ensemble_result.to_dict(),
+                  "performance" : {"total_time" : round(self.processing_time, 3),
+                                   "metrics_time" : {name: round(t, 3) for name, t in self.metrics_execution_time.items()},
+                                  },
+                  "warnings" : self.warnings,
+                  "errors" : self.errors,
+                 }
+
+        # Include file_info if available
+        if self.file_info:
+            result["file_info"] = self.file_info
+
+        return result
+
+
+@dataclass
+class DetailedReasoningResult:
+    """
+    Comprehensive reasoning for detection result with ensemble integration
+    """
+    summary : str
+    key_indicators : List[str]
+    metric_explanations : Dict[str, str]
+    supporting_evidence : List[str]
+    contradicting_evidence : List[str]
+    confidence_explanation : str
+    domain_analysis : str
+    ensemble_analysis : str
+    recommendations : List[str]
+    uncertainty_analysis : str
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert to dictionary
+        """
+        return {"summary" : self.summary,
+                "key_indicators" : self.key_indicators,
+                "metric_explanations" : self.metric_explanations,
+                "supporting_evidence" : self.supporting_evidence,
+                "contradicting_evidence" : self.contradicting_evidence,
+                "confidence_explanation" : self.confidence_explanation,
+                "domain_analysis" : self.domain_analysis,
+                "ensemble_analysis" : self.ensemble_analysis,
+                "recommendations" : self.recommendations,
+                "uncertainty_analysis" : self.uncertainty_analysis,
+               }
+
+
+@dataclass
+class DetailedMetricResult:
+    """
+    Metric data structure with sub-metrics
+    """
+    name : str
+    synthetic_probability : float
+    authentic_probability : float
+    confidence : float
+    verdict : str
+    description : str
+    detailed_metrics : Dict[str, float]
+    weight : float
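The clamp-then-renormalize step in `MetricResult.__init__` is worth calling out: each probability is first clipped to [0, 1], then the triple is rescaled to sum to 1, so individual metrics may return unnormalized or slightly out-of-range scores without breaking downstream ensemble math. A minimal demonstration, assuming the schema module above is importable:

```python
# Demonstrates the normalization behaviour of MetricResult as defined above.
from config.schemas import MetricResult

r = MetricResult(metric_name           = "perplexity",
                 synthetic_probability = 1.4,   # clamped to 1.0
                 authentic_probability = 0.6,
                 hybrid_probability    = -0.2,  # clamped to 0.0
                 confidence            = 0.8)

# After clamping: (1.0, 0.6, 0.0); after renormalization: (0.625, 0.375, 0.0)
assert abs(r.synthetic_probability - 0.625) < 1e-9
assert abs(r.authentic_probability - 0.375) < 1e-9
assert r.predicted_class == "Synthetic"
```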
config/settings.py
CHANGED
@@ -12,88 +12,87 @@ class Settings(BaseSettings):
     Main application settings
     """
     # Application Info
+    APP_NAME : str = "TEXT-AUTH"
+    APP_VERSION : str = "1.0.0"
+    APP_DESCRIPTION : str = "Text Authentication & Content Authenticity Platform"
 
     # Environment
+    ENVIRONMENT : str = Field(default = "development", env = "ENVIRONMENT")
+    DEBUG : bool = Field(default = True, env = "DEBUG")
 
     # Server Configuration
+    HOST : str = Field(default = "0.0.0.0", env = "HOST")
+    PORT : int = Field(default = 8000, env = "PORT")
+    WORKERS : int = Field(default = 4, env = "WORKERS")
 
     # Paths
+    BASE_DIR : Path = Path(__file__).parent.parent.resolve()
+    MODEL_CACHE_DIR : Path = Field(default = Path(__file__).parent.parent / "models" / "cache", env = "MODEL_CACHE_DIR")
+    LOG_DIR : Path = Field(default = Path(__file__).parent.parent / "logs", env = "LOG_DIR")
+    UPLOAD_DIR : Path = Field(default = Path(__file__).parent.parent / "data" / "uploads", env = "UPLOAD_DIR")
+    REPORT_DIR : Path = Field(default = Path(__file__).parent.parent / "data" / "reports", env = "REPORT_DIR")
 
     # File Upload Settings
+    MAX_UPLOAD_SIZE : int = 10 * 1024 * 1024  # 10MB
+    ALLOWED_EXTENSIONS : list = [".txt", ".pdf", ".docx", ".doc", ".md"]
 
     # Processing Settings
+    MAX_TEXT_LENGTH : int = 500000  # Maximum characters to process
+    MIN_TEXT_LENGTH : int = 200     # Minimum characters for analysis
+    CHUNK_SIZE : int = 512          # Tokens per chunk
+    CHUNK_OVERLAP : int = 50        # Overlap between chunks
 
     # Model Settings
+    DEVICE : str = Field(default = "cpu", env = "DEVICE")  # "cuda" or "cpu"
+    USE_QUANTIZATION : bool = Field(default = False, env = "USE_QUANTIZATION")
+    USE_ONNX : bool = Field(default = False, env = "USE_ONNX")
+    MODEL_LOAD_STRATEGY : str = "lazy"  # "lazy" or "eager"
+    MAX_CACHED_MODELS : int = 5
 
     # Detection Settings
+    AUTHENTICITY_CONFIDENCE_THRESHOLD : float = 0.7  # Minimum confidence for classification
+    ENSEMBLE_METHOD : str = "weighted_average"  # "weighted_average", "voting", "stacking"
+    USE_DOMAIN_CALIBRATION : bool = True
 
     # Rate Limiting
+    RATE_LIMIT_ENABLED : bool = True
+    RATE_LIMIT_REQUESTS : int = 100
+    RATE_LIMIT_WINDOW : int = 3600  # seconds (1 hour)
 
     # Logging
+    LOG_LEVEL : str = Field(default = "INFO", env = "LOG_LEVEL")
+    LOG_FORMAT : str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    LOG_ROTATION : str = "1 day"
+    LOG_RETENTION : str = "30 days"
 
     # API Settings
+    API_PREFIX : str = "/api/v1"
+    CORS_ORIGINS : list = ["*"]  # For production, specify exact origins
 
     # Database (Optional - for future)
+    DATABASE_URL : Optional[str] = Field(default = None, env = "DATABASE_URL")
 
     # Security
+    SECRET_KEY : str = Field(default = "your-secret-key-change-in-production", env = "SECRET_KEY")
+    API_KEY_ENABLED : bool = False
 
     # Feature Flags
+    ENABLE_HIGHLIGHTING : bool = True
+    ENABLE_PDF_REPORTS : bool = True
     ENABLE_BATCH_PROCESSING : bool = True
 
     # Performance
+    MAX_CONCURRENT_REQUESTS : int = 10
+    REQUEST_TIMEOUT : int = 300  # seconds (5 minutes)
 
     # Metrics Configuration
+    METRICS_ENABLED : dict = {"semantic_analysis" : True,
+                              "multi_perturbation_stability" : True,
+                              "perplexity" : True,
+                              "structural" : True,
+                              "entropy" : True,
+                              "linguistic" : True,
+                             }
 
     class Config:
         env_file = ".env"
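Since `Settings` extends pydantic's `BaseSettings` with `env_file = ".env"`, every `Field(..., env = ...)` entry can be overridden from the environment without code changes. A small sketch of that behaviour; the override values below are arbitrary examples, not project defaults:

```python
# Illustrative: environment variables override the Field defaults above.
import os

os.environ["DEBUG"] = "false"  # pydantic parses "false"/"0" to bool False
os.environ["PORT"]  = "7860"   # e.g. the default Hugging Face Spaces port

from config.settings import Settings

settings = Settings()
assert settings.DEBUG is False
assert settings.PORT == 7860
print(settings.MODEL_CACHE_DIR)  # path resolved relative to the repo root
```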
config/threshold_config.py
CHANGED
|
@@ -1,244 +1,186 @@
|
|
| 1 |
# DEPENDENCIES
|
| 2 |
-
from enum import Enum
|
| 3 |
from typing import Dict
|
| 4 |
from typing import Tuple
|
| 5 |
-
from
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
"""
|
| 10 |
-
Text domains for adaptive thresholding
|
| 11 |
-
"""
|
| 12 |
-
# Core domains
|
| 13 |
-
GENERAL = "general"
|
| 14 |
-
ACADEMIC = "academic"
|
| 15 |
-
CREATIVE = "creative"
|
| 16 |
-
AI_ML = "ai_ml"
|
| 17 |
-
SOFTWARE_DEV = "software_dev"
|
| 18 |
-
TECHNICAL_DOC = "technical_doc"
|
| 19 |
-
ENGINEERING = "engineering"
|
| 20 |
-
SCIENCE = "science"
|
| 21 |
-
BUSINESS = "business"
|
| 22 |
-
LEGAL = "legal"
|
| 23 |
-
MEDICAL = "medical"
|
| 24 |
-
JOURNALISM = "journalism"
|
| 25 |
-
MARKETING = "marketing"
|
| 26 |
-
SOCIAL_MEDIA = "social_media"
|
| 27 |
-
BLOG_PERSONAL = "blog_personal"
|
| 28 |
-
TUTORIAL = "tutorial"
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
class ConfidenceLevel(Enum):
|
| 32 |
-
"""
|
| 33 |
-
Confidence levels for classification
|
| 34 |
-
"""
|
| 35 |
-
VERY_LOW = "very_low"
|
| 36 |
-
LOW = "low"
|
| 37 |
-
MEDIUM = "medium"
|
| 38 |
-
HIGH = "high"
|
| 39 |
-
VERY_HIGH = "very_high"
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
@dataclass
|
| 43 |
-
class MetricThresholds:
|
| 44 |
-
"""
|
| 45 |
-
Thresholds for a single metric
|
| 46 |
-
"""
|
| 47 |
-
ai_threshold : float # Above this = likely AI
|
| 48 |
-
human_threshold : float # Below this = likely human
|
| 49 |
-
confidence_multiplier : float = 1.0
|
| 50 |
-
weight : float = 1.0
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
@dataclass
|
| 54 |
-
class DomainThresholds:
|
| 55 |
-
"""
|
| 56 |
-
Thresholds for 6 metrics in a specific domain
|
| 57 |
-
"""
|
| 58 |
-
domain : Domain
|
| 59 |
-
structural : MetricThresholds
|
| 60 |
-
perplexity : MetricThresholds
|
| 61 |
-
entropy : MetricThresholds
|
| 62 |
-
semantic_analysis : MetricThresholds
|
| 63 |
-
linguistic : MetricThresholds
|
| 64 |
-
multi_perturbation_stability : MetricThresholds
|
| 65 |
-
ensemble_threshold : float = 0.5
|
| 66 |
|
| 67 |
|
| 68 |
# ==================== DOMAIN-SPECIFIC THRESHOLDS ====================
|
| 69 |
# GENERAL (Default fallback)
|
| 70 |
DEFAULT_THRESHOLDS = DomainThresholds(domain = Domain.GENERAL,
|
| 71 |
-
structural = MetricThresholds(
|
| 72 |
-
perplexity = MetricThresholds(
|
| 73 |
-
entropy = MetricThresholds(
|
| 74 |
-
|
| 75 |
-
linguistic = MetricThresholds(
|
| 76 |
-
multi_perturbation_stability = MetricThresholds(
|
| 77 |
ensemble_threshold = 0.40,
|
| 78 |
)
|
| 79 |
|
| 80 |
# ACADEMIC
|
| 81 |
ACADEMIC_THRESHOLDS = DomainThresholds(domain = Domain.ACADEMIC,
|
| 82 |
-
structural = MetricThresholds(
|
| 83 |
-
perplexity = MetricThresholds(
|
| 84 |
-
entropy = MetricThresholds(
|
| 85 |
-
|
| 86 |
-
linguistic = MetricThresholds(
|
| 87 |
-
multi_perturbation_stability = MetricThresholds(
|
| 88 |
ensemble_threshold = 0.42,
|
| 89 |
)
|
| 90 |
|
| 91 |
# CREATIVE WRITING
|
| 92 |
CREATIVE_THRESHOLDS = DomainThresholds(domain = Domain.CREATIVE,
|
| 93 |
-
structural = MetricThresholds(
|
| 94 |
-
perplexity = MetricThresholds(
|
| 95 |
-
entropy = MetricThresholds(
|
| 96 |
-
|
| 97 |
-
linguistic = MetricThresholds(
|
| 98 |
-
multi_perturbation_stability = MetricThresholds(
|
| 99 |
ensemble_threshold = 0.38,
|
| 100 |
)
|
| 101 |
|
| 102 |
# AI/ML/DATA SCIENCE
|
| 103 |
AI_ML_THRESHOLDS = DomainThresholds(domain = Domain.AI_ML,
|
| 104 |
-
structural = MetricThresholds(
|
| 105 |
-
perplexity = MetricThresholds(
|
| 106 |
-
entropy = MetricThresholds(
|
| 107 |
-
|
| 108 |
-
linguistic = MetricThresholds(
|
| 109 |
-
multi_perturbation_stability = MetricThresholds(
|
| 110 |
ensemble_threshold = 0.41,
|
| 111 |
)
|
| 112 |
|
| 113 |
# SOFTWARE DEVELOPMENT
|
| 114 |
SOFTWARE_DEV_THRESHOLDS = DomainThresholds(domain = Domain.SOFTWARE_DEV,
|
| 115 |
-
structural = MetricThresholds(
|
| 116 |
-
perplexity = MetricThresholds(
|
| 117 |
-
entropy = MetricThresholds(
|
| 118 |
-
|
| 119 |
-
linguistic = MetricThresholds(
|
| 120 |
-
multi_perturbation_stability = MetricThresholds(
|
| 121 |
ensemble_threshold = 0.41,
|
| 122 |
)
|
| 123 |
|
| 124 |
# TECHNICAL DOCUMENTATION
|
| 125 |
TECHNICAL_DOC_THRESHOLDS = DomainThresholds(domain = Domain.TECHNICAL_DOC,
|
| 126 |
-
structural = MetricThresholds(
|
| 127 |
-
perplexity = MetricThresholds(
|
| 128 |
-
entropy = MetricThresholds(
|
| 129 |
-
|
| 130 |
-
linguistic = MetricThresholds(
|
| 131 |
-
multi_perturbation_stability = MetricThresholds(
|
| 132 |
ensemble_threshold = 0.42,
|
| 133 |
)
|
| 134 |
|
| 135 |
# ENGINEERING
|
| 136 |
ENGINEERING_THRESHOLDS = DomainThresholds(domain = Domain.ENGINEERING,
|
| 137 |
-
structural = MetricThresholds(
|
| 138 |
-
perplexity = MetricThresholds(
|
| 139 |
-
entropy = MetricThresholds(
|
| 140 |
-
|
| 141 |
-
linguistic = MetricThresholds(
|
| 142 |
-
multi_perturbation_stability = MetricThresholds(
|
| 143 |
ensemble_threshold = 0.41,
|
| 144 |
)
|
| 145 |
|
| 146 |
# SCIENCE (Physics, Chemistry, Biology)
|
| 147 |
SCIENCE_THRESHOLDS = DomainThresholds(domain = Domain.SCIENCE,
|
| 148 |
-
structural = MetricThresholds(
|
| 149 |
-
perplexity = MetricThresholds(
|
| 150 |
-
entropy = MetricThresholds(
|
| 151 |
-
|
| 152 |
-
linguistic = MetricThresholds(
|
| 153 |
-
multi_perturbation_stability = MetricThresholds(
|
| 154 |
ensemble_threshold = 0.42,
|
| 155 |
)
|
| 156 |
|
| 157 |
# BUSINESS
|
| 158 |
BUSINESS_THRESHOLDS = DomainThresholds(domain = Domain.BUSINESS,
|
| 159 |
-
structural = MetricThresholds(
|
| 160 |
-
perplexity = MetricThresholds(
|
| 161 |
-
entropy = MetricThresholds(
|
| 162 |
-
|
| 163 |
-
linguistic = MetricThresholds(
|
| 164 |
-
multi_perturbation_stability = MetricThresholds(
|
| 165 |
ensemble_threshold = 0.40,
|
| 166 |
)
|
| 167 |
|
| 168 |
# LEGAL
|
| 169 |
LEGAL_THRESHOLDS = DomainThresholds(domain = Domain.LEGAL,
|
| 170 |
-
structural = MetricThresholds(
|
| 171 |
-
perplexity = MetricThresholds(
|
| 172 |
-
entropy = MetricThresholds(
|
| 173 |
-
|
| 174 |
-
linguistic = MetricThresholds(
|
| 175 |
-
multi_perturbation_stability = MetricThresholds(
|
| 176 |
ensemble_threshold = 0.43,
|
| 177 |
)
|
| 178 |
|
| 179 |
# MEDICAL
|
| 180 |
MEDICAL_THRESHOLDS = DomainThresholds(domain = Domain.MEDICAL,
|
| 181 |
-
structural = MetricThresholds(
|
| 182 |
-
perplexity = MetricThresholds(
|
| 183 |
-
entropy = MetricThresholds(
|
| 184 |
-
|
| 185 |
-
linguistic = MetricThresholds(
|
| 186 |
-
multi_perturbation_stability = MetricThresholds(
|
| 187 |
ensemble_threshold = 0.43,
|
| 188 |
)
|
| 189 |
|
| 190 |
# JOURNALISM
|
| 191 |
JOURNALISM_THRESHOLDS = DomainThresholds(domain = Domain.JOURNALISM,
|
| 192 |
-
structural = MetricThresholds(
|
| 193 |
-
perplexity = MetricThresholds(
|
| 194 |
-
entropy = MetricThresholds(
|
| 195 |
-
|
| 196 |
-
linguistic = MetricThresholds(
|
| 197 |
-
multi_perturbation_stability = MetricThresholds(
|
| 198 |
ensemble_threshold = 0.40,
|
| 199 |
)
|
| 200 |
|
| 201 |
# MARKETING
|
| 202 |
MARKETING_THRESHOLDS = DomainThresholds(domain = Domain.MARKETING,
|
| 203 |
-
structural = MetricThresholds(
|
| 204 |
-
perplexity = MetricThresholds(
|
| 205 |
-
entropy = MetricThresholds(
|
| 206 |
-
|
| 207 |
-
linguistic = MetricThresholds(
|
| 208 |
-
multi_perturbation_stability = MetricThresholds(
|
| 209 |
ensemble_threshold = 0.39,
|
| 210 |
)
|
| 211 |
|
| 212 |
# SOCIAL MEDIA
|
| 213 |
SOCIAL_MEDIA_THRESHOLDS = DomainThresholds(domain = Domain.SOCIAL_MEDIA,
|
| 214 |
-
structural = MetricThresholds(
|
| 215 |
-
perplexity = MetricThresholds(
|
| 216 |
-
entropy = MetricThresholds(
|
| 217 |
-
|
| 218 |
-
linguistic = MetricThresholds(
|
| 219 |
-
multi_perturbation_stability = MetricThresholds(
|
| 220 |
ensemble_threshold = 0.36,
|
| 221 |
)
|
| 222 |
|
| 223 |
# PERSONAL BLOG
|
| 224 |
BLOG_PERSONAL_THRESHOLDS = DomainThresholds(domain = Domain.BLOG_PERSONAL,
|
| 225 |
-
structural = MetricThresholds(
|
| 226 |
-
perplexity = MetricThresholds(
|
| 227 |
-
entropy = MetricThresholds(
|
| 228 |
-
|
| 229 |
-
linguistic = MetricThresholds(
|
| 230 |
-
multi_perturbation_stability = MetricThresholds(
|
| 231 |
ensemble_threshold = 0.38,
|
| 232 |
)
|
| 233 |
|
| 234 |
# TUTORIAL/HOW-TO
|
| 235 |
TUTORIAL_THRESHOLDS = DomainThresholds(domain = Domain.TUTORIAL,
|
| 236 |
-
structural = MetricThresholds(
|
| 237 |
-
perplexity = MetricThresholds(
|
| 238 |
-
entropy = MetricThresholds(
|
| 239 |
-
|
| 240 |
-
linguistic = MetricThresholds(
|
| 241 |
-
multi_perturbation_stability = MetricThresholds(
|
| 242 |
ensemble_threshold = 0.40,
|
| 243 |
)
|
| 244 |
|
|
@@ -282,7 +224,8 @@ def get_threshold_for_domain(domain: Domain) -> DomainThresholds:
|
|
| 282 |
|
| 283 |
def get_confidence_level(score: float) -> ConfidenceLevel:
|
| 284 |
"""
|
| 285 |
-
Determine confidence level
|
|
|
|
| 286 |
"""
|
| 287 |
for level, (min_val, max_val) in CONFIDENCE_RANGES.items():
|
| 288 |
if (min_val <= score < max_val):
|
|
@@ -317,16 +260,16 @@ def interpolate_thresholds(domain1: Domain, domain2: Domain, weight1: float = 0.
     weight2 = 1 - weight1

     def interpolate_metric(m1: MetricThresholds, m2: MetricThresholds) -> MetricThresholds:
-        return MetricThresholds(
-
-        weight
         )

     return DomainThresholds(domain = domain1,
         structural = interpolate_metric(thresh1.structural, thresh2.structural),
         perplexity = interpolate_metric(thresh1.perplexity, thresh2.perplexity),
         entropy = interpolate_metric(thresh1.entropy, thresh2.entropy),
-
         linguistic = interpolate_metric(thresh1.linguistic, thresh2.linguistic),
         multi_perturbation_stability = interpolate_metric(thresh1.multi_perturbation_stability, thresh2.multi_perturbation_stability),
         ensemble_threshold = thresh1.ensemble_threshold * weight1 + thresh2.ensemble_threshold * weight2,
@@ -342,7 +285,7 @@ def get_active_metric_weights(domain: Domain, enabled_metrics: Dict[str, bool])
     metric_mapping = {"structural" : thresholds.structural,
                       "perplexity" : thresholds.perplexity,
                       "entropy" : thresholds.entropy,
-                      "
                       "linguistic" : thresholds.linguistic,
                       "multi_perturbation_stability" : thresholds.multi_perturbation_stability,
                       }
 # DEPENDENCIES
 from typing import Dict
 from typing import Tuple
+from config.enums import Domain
+from config.enums import ConfidenceLevel
+from config.schemas import MetricThresholds
+from config.schemas import DomainThresholds

 # ==================== DOMAIN-SPECIFIC THRESHOLDS ====================
 # GENERAL (Default fallback)
 DEFAULT_THRESHOLDS = DomainThresholds(domain = Domain.GENERAL,
+    structural = MetricThresholds(synthetic_threshold = 0.55, authentic_threshold = 0.45, weight = 0.20),
+    perplexity = MetricThresholds(synthetic_threshold = 0.52, authentic_threshold = 0.48, weight = 0.25),
+    entropy = MetricThresholds(synthetic_threshold = 0.48, authentic_threshold = 0.52, weight = 0.15),
+    semantic = MetricThresholds(synthetic_threshold = 0.55, authentic_threshold = 0.45, weight = 0.18),
+    linguistic = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.12),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.60, authentic_threshold = 0.40, weight = 0.10),
     ensemble_threshold = 0.40,
     )

 # ACADEMIC
 ACADEMIC_THRESHOLDS = DomainThresholds(domain = Domain.ACADEMIC,
+    structural = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.50, authentic_threshold = 0.45, weight = 0.26),
+    entropy = MetricThresholds(synthetic_threshold = 0.45, authentic_threshold = 0.50, weight = 0.14),
+    semantic = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.62, authentic_threshold = 0.38, weight = 0.14),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.65, authentic_threshold = 0.35, weight = 0.08),
     ensemble_threshold = 0.42,
     )

 # CREATIVE WRITING
 CREATIVE_THRESHOLDS = DomainThresholds(domain = Domain.CREATIVE,
+    structural = MetricThresholds(synthetic_threshold = 0.52, authentic_threshold = 0.48, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.55, authentic_threshold = 0.50, weight = 0.22),
+    entropy = MetricThresholds(synthetic_threshold = 0.50, authentic_threshold = 0.55, weight = 0.16),
+    semantic = MetricThresholds(synthetic_threshold = 0.52, authentic_threshold = 0.48, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.55, authentic_threshold = 0.45, weight = 0.16),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.08),
     ensemble_threshold = 0.38,
     )

 # AI/ML/DATA SCIENCE
 AI_ML_THRESHOLDS = DomainThresholds(domain = Domain.AI_ML,
+    structural = MetricThresholds(synthetic_threshold = 0.57, authentic_threshold = 0.43, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.51, authentic_threshold = 0.46, weight = 0.26),
+    entropy = MetricThresholds(synthetic_threshold = 0.47, authentic_threshold = 0.50, weight = 0.14),
+    semantic = MetricThresholds(synthetic_threshold = 0.57, authentic_threshold = 0.43, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.61, authentic_threshold = 0.39, weight = 0.14),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.64, authentic_threshold = 0.36, weight = 0.08),
     ensemble_threshold = 0.41,
     )

 # SOFTWARE DEVELOPMENT
 SOFTWARE_DEV_THRESHOLDS = DomainThresholds(domain = Domain.SOFTWARE_DEV,
+    structural = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.17),
+    perplexity = MetricThresholds(synthetic_threshold = 0.50, authentic_threshold = 0.45, weight = 0.27),
+    entropy = MetricThresholds(synthetic_threshold = 0.46, authentic_threshold = 0.50, weight = 0.14),
+    semantic = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.60, authentic_threshold = 0.40, weight = 0.14),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.63, authentic_threshold = 0.37, weight = 0.08),
     ensemble_threshold = 0.41,
     )

 # TECHNICAL DOCUMENTATION
 TECHNICAL_DOC_THRESHOLDS = DomainThresholds(domain = Domain.TECHNICAL_DOC,
+    structural = MetricThresholds(synthetic_threshold = 0.59, authentic_threshold = 0.41, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.49, authentic_threshold = 0.44, weight = 0.27),
+    entropy = MetricThresholds(synthetic_threshold = 0.45, authentic_threshold = 0.49, weight = 0.13),
+    semantic = MetricThresholds(synthetic_threshold = 0.59, authentic_threshold = 0.41, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.62, authentic_threshold = 0.38, weight = 0.14),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.65, authentic_threshold = 0.35, weight = 0.08),
     ensemble_threshold = 0.42,
     )

 # ENGINEERING
 ENGINEERING_THRESHOLDS = DomainThresholds(domain = Domain.ENGINEERING,
+    structural = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.50, authentic_threshold = 0.45, weight = 0.26),
+    entropy = MetricThresholds(synthetic_threshold = 0.46, authentic_threshold = 0.50, weight = 0.14),
+    semantic = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.61, authentic_threshold = 0.39, weight = 0.14),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.64, authentic_threshold = 0.36, weight = 0.08),
     ensemble_threshold = 0.41,
     )

 # SCIENCE (Physics, Chemistry, Biology)
 SCIENCE_THRESHOLDS = DomainThresholds(domain = Domain.SCIENCE,
+    structural = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.51, authentic_threshold = 0.46, weight = 0.26),
+    entropy = MetricThresholds(synthetic_threshold = 0.46, authentic_threshold = 0.50, weight = 0.14),
+    semantic = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.62, authentic_threshold = 0.38, weight = 0.14),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.64, authentic_threshold = 0.36, weight = 0.08),
     ensemble_threshold = 0.42,
     )

 # BUSINESS
 BUSINESS_THRESHOLDS = DomainThresholds(domain = Domain.BUSINESS,
+    structural = MetricThresholds(synthetic_threshold = 0.56, authentic_threshold = 0.44, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.52, authentic_threshold = 0.48, weight = 0.24),
+    entropy = MetricThresholds(synthetic_threshold = 0.48, authentic_threshold = 0.52, weight = 0.15),
+    semantic = MetricThresholds(synthetic_threshold = 0.56, authentic_threshold = 0.44, weight = 0.19),
+    linguistic = MetricThresholds(synthetic_threshold = 0.60, authentic_threshold = 0.40, weight = 0.15),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.62, authentic_threshold = 0.38, weight = 0.09),
     ensemble_threshold = 0.40,
     )

 # LEGAL
 LEGAL_THRESHOLDS = DomainThresholds(domain = Domain.LEGAL,
+    structural = MetricThresholds(synthetic_threshold = 0.60, authentic_threshold = 0.40, weight = 0.17),
+    perplexity = MetricThresholds(synthetic_threshold = 0.50, authentic_threshold = 0.44, weight = 0.27),
+    entropy = MetricThresholds(synthetic_threshold = 0.44, authentic_threshold = 0.48, weight = 0.13),
+    semantic = MetricThresholds(synthetic_threshold = 0.60, authentic_threshold = 0.40, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.63, authentic_threshold = 0.37, weight = 0.15),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.66, authentic_threshold = 0.34, weight = 0.08),
     ensemble_threshold = 0.43,
     )

 # MEDICAL
 MEDICAL_THRESHOLDS = DomainThresholds(domain = Domain.MEDICAL,
+    structural = MetricThresholds(synthetic_threshold = 0.59, authentic_threshold = 0.41, weight = 0.17),
+    perplexity = MetricThresholds(synthetic_threshold = 0.50, authentic_threshold = 0.45, weight = 0.27),
+    entropy = MetricThresholds(synthetic_threshold = 0.45, authentic_threshold = 0.49, weight = 0.13),
+    semantic = MetricThresholds(synthetic_threshold = 0.59, authentic_threshold = 0.41, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.62, authentic_threshold = 0.38, weight = 0.15),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.65, authentic_threshold = 0.35, weight = 0.08),
     ensemble_threshold = 0.43,
     )

 # JOURNALISM
 JOURNALISM_THRESHOLDS = DomainThresholds(domain = Domain.JOURNALISM,
+    structural = MetricThresholds(synthetic_threshold = 0.56, authentic_threshold = 0.44, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.52, authentic_threshold = 0.48, weight = 0.24),
+    entropy = MetricThresholds(synthetic_threshold = 0.48, authentic_threshold = 0.52, weight = 0.15),
+    semantic = MetricThresholds(synthetic_threshold = 0.56, authentic_threshold = 0.44, weight = 0.20),
+    linguistic = MetricThresholds(synthetic_threshold = 0.58, authentic_threshold = 0.42, weight = 0.15),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.62, authentic_threshold = 0.38, weight = 0.08),
     ensemble_threshold = 0.40,
     )

 # MARKETING
 MARKETING_THRESHOLDS = DomainThresholds(domain = Domain.MARKETING,
+    structural = MetricThresholds(synthetic_threshold = 0.54, authentic_threshold = 0.46, weight = 0.19),
+    perplexity = MetricThresholds(synthetic_threshold = 0.53, authentic_threshold = 0.49, weight = 0.23),
+    entropy = MetricThresholds(synthetic_threshold = 0.49, authentic_threshold = 0.53, weight = 0.15),
+    semantic = MetricThresholds(synthetic_threshold = 0.54, authentic_threshold = 0.46, weight = 0.19),
+    linguistic = MetricThresholds(synthetic_threshold = 0.57, authentic_threshold = 0.43, weight = 0.16),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.61, authentic_threshold = 0.39, weight = 0.08),
     ensemble_threshold = 0.39,
     )

 # SOCIAL MEDIA
 SOCIAL_MEDIA_THRESHOLDS = DomainThresholds(domain = Domain.SOCIAL_MEDIA,
+    structural = MetricThresholds(synthetic_threshold = 0.52, authentic_threshold = 0.48, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.54, authentic_threshold = 0.50, weight = 0.20),
+    entropy = MetricThresholds(synthetic_threshold = 0.50, authentic_threshold = 0.54, weight = 0.17),
+    semantic = MetricThresholds(synthetic_threshold = 0.52, authentic_threshold = 0.48, weight = 0.18),
+    linguistic = MetricThresholds(synthetic_threshold = 0.55, authentic_threshold = 0.45, weight = 0.18),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.60, authentic_threshold = 0.40, weight = 0.09),
     ensemble_threshold = 0.36,
     )

 # PERSONAL BLOG
 BLOG_PERSONAL_THRESHOLDS = DomainThresholds(domain = Domain.BLOG_PERSONAL,
+    structural = MetricThresholds(synthetic_threshold = 0.53, authentic_threshold = 0.47, weight = 0.19),
+    perplexity = MetricThresholds(synthetic_threshold = 0.54, authentic_threshold = 0.50, weight = 0.22),
+    entropy = MetricThresholds(synthetic_threshold = 0.50, authentic_threshold = 0.54, weight = 0.16),
+    semantic = MetricThresholds(synthetic_threshold = 0.53, authentic_threshold = 0.47, weight = 0.19),
+    linguistic = MetricThresholds(synthetic_threshold = 0.56, authentic_threshold = 0.44, weight = 0.16),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.59, authentic_threshold = 0.41, weight = 0.08),
     ensemble_threshold = 0.38,
     )

 # TUTORIAL/HOW-TO
 TUTORIAL_THRESHOLDS = DomainThresholds(domain = Domain.TUTORIAL,
+    structural = MetricThresholds(synthetic_threshold = 0.56, authentic_threshold = 0.44, weight = 0.18),
+    perplexity = MetricThresholds(synthetic_threshold = 0.52, authentic_threshold = 0.48, weight = 0.25),
+    entropy = MetricThresholds(synthetic_threshold = 0.48, authentic_threshold = 0.52, weight = 0.15),
+    semantic = MetricThresholds(synthetic_threshold = 0.56, authentic_threshold = 0.44, weight = 0.19),
+    linguistic = MetricThresholds(synthetic_threshold = 0.59, authentic_threshold = 0.41, weight = 0.15),
+    multi_perturbation_stability = MetricThresholds(synthetic_threshold = 0.62, authentic_threshold = 0.38, weight = 0.08),
     ensemble_threshold = 0.40,
     )

 def get_confidence_level(score: float) -> ConfidenceLevel:
     """
+    Determine confidence level for authenticity estimation
+    (score represents synthetic-likeness probability)
     """
     for level, (min_val, max_val) in CONFIDENCE_RANGES.items():
         if (min_val <= score < max_val):

     weight2 = 1 - weight1

     def interpolate_metric(m1: MetricThresholds, m2: MetricThresholds) -> MetricThresholds:
+        return MetricThresholds(synthetic_threshold = m1.synthetic_threshold * weight1 + m2.synthetic_threshold * weight2,
+                                authentic_threshold = m1.authentic_threshold * weight1 + m2.authentic_threshold * weight2,
+                                weight = m1.weight * weight1 + m2.weight * weight2,
         )

     return DomainThresholds(domain = domain1,
         structural = interpolate_metric(thresh1.structural, thresh2.structural),
         perplexity = interpolate_metric(thresh1.perplexity, thresh2.perplexity),
         entropy = interpolate_metric(thresh1.entropy, thresh2.entropy),
+        semantic = interpolate_metric(thresh1.semantic, thresh2.semantic),
         linguistic = interpolate_metric(thresh1.linguistic, thresh2.linguistic),
         multi_perturbation_stability = interpolate_metric(thresh1.multi_perturbation_stability, thresh2.multi_perturbation_stability),
         ensemble_threshold = thresh1.ensemble_threshold * weight1 + thresh2.ensemble_threshold * weight2,

     metric_mapping = {"structural" : thresholds.structural,
                       "perplexity" : thresholds.perplexity,
                       "entropy" : thresholds.entropy,
+                      "semantic" : thresholds.semantic,
                       "linguistic" : thresholds.linguistic,
                       "multi_perturbation_stability" : thresholds.multi_perturbation_stability,
                       }
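
The per-metric interpolation introduced above is a plain linear mix, and `interpolate_thresholds` keeps `domain1` as the reported domain, so a blend is labelled by its dominant side. A minimal runnable sketch of that rule follows; the `MetricThresholds` dataclass here is a stand-in for the real `config.schemas` class, and the 60/40 ACADEMIC/CREATIVE blend reuses the perplexity rows from the thresholds above (illustrative values, not part of the commit).

```python
# Sketch of the interpolate_metric rule from the hunk above.
# MetricThresholds is a stand-in for config.schemas.MetricThresholds.
from dataclasses import dataclass


@dataclass
class MetricThresholds:
    synthetic_threshold: float
    authentic_threshold: float
    weight: float


def interpolate_metric(m1: MetricThresholds, m2: MetricThresholds, weight1: float = 0.5) -> MetricThresholds:
    # Linear blend of two domains' thresholds, weighted toward domain 1
    weight2 = 1 - weight1
    return MetricThresholds(synthetic_threshold = m1.synthetic_threshold * weight1 + m2.synthetic_threshold * weight2,
                            authentic_threshold = m1.authentic_threshold * weight1 + m2.authentic_threshold * weight2,
                            weight = m1.weight * weight1 + m2.weight * weight2)


# Perplexity rows for ACADEMIC and CREATIVE from the table above, blended 60/40
academic_perplexity = MetricThresholds(0.50, 0.45, 0.26)
creative_perplexity = MetricThresholds(0.55, 0.50, 0.22)
print(interpolate_metric(academic_perplexity, creative_perplexity, weight1 = 0.6))
# -> synthetic_threshold ~ 0.52, authentic_threshold ~ 0.47, weight ~ 0.244
```
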
data/reports/file_1765557325979_20251212_220627.pdf
DELETED
@@ -1,181 +0,0 @@
-(binary content: ReportLab-generated PDF report, 6 pages; raw PDF stream data omitted)
detector/__init__.py
DELETED
@@ -1,20 +0,0 @@
-# DEPENDENCIES
-from detector.attribution import AIModel
-from detector.ensemble import EnsembleResult
-from detector.attribution import ModelAttributor
-from detector.ensemble import EnsembleClassifier
-from detector.orchestrator import DetectionResult
-from detector.attribution import AttributionResult
-from detector.orchestrator import DetectionOrchestrator
-
-
-
-__all__ = ["AIModel",
-           "EnsembleResult",
-           "DetectionResult",
-           "ModelAttributor",
-           "AttributionResult",
-           "EnsembleClassifier",
-           "DetectionOrchestrator",
-           ]
-
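
For orientation, here is a hypothetical usage sketch of the API that this `__init__.py` re-exported, reconstructed from the `__all__` list above and the `ModelAttributor.attribute()` signature in `detector/attribution.py` below; none of these import paths resolve after this commit.

```python
# Hypothetical pre-commit usage of the removed detector package.
# Reconstructed from the exports above; removed by this commit.
from detector import ModelAttributor
from config.threshold_config import Domain

attributor = ModelAttributor()
if attributor.initialize():
    result = attributor.attribute(text = "Sample passage to attribute ...",
                                  domain = Domain.GENERAL)
    # AttributionResult carries the predicted model, confidence and reasoning
    print(result.predicted_model.value, round(result.confidence, 3))
```
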
detector/attribution.py
DELETED
|
@@ -1,962 +0,0 @@
|
|
| 1 |
-
# DEPENDENCIES
|
| 2 |
-
import re
|
| 3 |
-
import numpy as np
|
| 4 |
-
from enum import Enum
|
| 5 |
-
from typing import Any
|
| 6 |
-
from typing import Dict
|
| 7 |
-
from typing import List
|
| 8 |
-
from typing import Tuple
|
| 9 |
-
from loguru import logger
|
| 10 |
-
from typing import Optional
|
| 11 |
-
from dataclasses import dataclass
|
| 12 |
-
from config.threshold_config import Domain
|
| 13 |
-
from metrics.base_metric import MetricResult
|
| 14 |
-
from processors.text_processor import ProcessedText
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
class AIModel(Enum):
|
| 19 |
-
"""
|
| 20 |
-
Supported AI models for attribution - ALIGNED WITH DOCUMENTATION
|
| 21 |
-
"""
|
| 22 |
-
GPT_3_5 = "gpt-3.5-turbo"
|
| 23 |
-
GPT_4 = "gpt-4"
|
| 24 |
-
GPT_4_TURBO = "gpt-4-turbo"
|
| 25 |
-
GPT_4o = "gpt-4o"
|
| 26 |
-
CLAUDE_3_OPUS = "claude-3-opus"
|
| 27 |
-
CLAUDE_3_SONNET = "claude-3-sonnet"
|
| 28 |
-
CLAUDE_3_HAIKU = "claude-3-haiku"
|
| 29 |
-
GEMINI_PRO = "gemini-pro"
|
| 30 |
-
GEMINI_ULTRA = "gemini-ultra"
|
| 31 |
-
GEMINI_FLASH = "gemini-flash"
|
| 32 |
-
LLAMA_2 = "llama-2"
|
| 33 |
-
LLAMA_3 = "llama-3"
|
| 34 |
-
MISTRAL = "mistral"
|
| 35 |
-
MIXTRAL = "mixtral"
|
| 36 |
-
DEEPSEEK_CHAT = "deepseek-chat"
|
| 37 |
-
DEEPSEEK_CODER = "deepseek-coder"
|
| 38 |
-
HUMAN = "human"
|
| 39 |
-
UNKNOWN = "unknown"
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
@dataclass
|
| 43 |
-
class AttributionResult:
|
| 44 |
-
"""
|
| 45 |
-
Result of AI model attribution
|
| 46 |
-
"""
|
| 47 |
-
predicted_model : AIModel
|
| 48 |
-
confidence : float
|
| 49 |
-
model_probabilities : Dict[str, float]
|
| 50 |
-
reasoning : List[str]
|
| 51 |
-
fingerprint_matches : Dict[str, int]
|
| 52 |
-
domain_used : Domain
|
| 53 |
-
metric_contributions: Dict[str, float]
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def to_dict(self) -> Dict[str, Any]:
|
| 57 |
-
"""
|
| 58 |
-
Convert to dictionary
|
| 59 |
-
"""
|
| 60 |
-
return {"predicted_model" : self.predicted_model.value,
|
| 61 |
-
"confidence" : round(self.confidence, 4),
|
| 62 |
-
"model_probabilities" : {model: round(prob, 4) for model, prob in self.model_probabilities.items()},
|
| 63 |
-
"reasoning" : self.reasoning,
|
| 64 |
-
"fingerprint_matches" : self.fingerprint_matches,
|
| 65 |
-
"domain_used" : self.domain_used.value,
|
| 66 |
-
"metric_contributions": {metric: round(contrib, 4) for metric, contrib in self.metric_contributions.items()},
|
| 67 |
-
}
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
class ModelAttributor:
|
| 71 |
-
"""
|
| 72 |
-
Model attribution
|
| 73 |
-
|
| 74 |
-
FEATURES:
|
| 75 |
-
- Domain-aware calibration
|
| 76 |
-
- 6-metric ensemble integration
|
| 77 |
-
- Confidence-weighted aggregation
|
| 78 |
-
- Explainable reasoning
|
| 79 |
-
"""
|
| 80 |
-
# Metric weights from technical specification
|
| 81 |
-
METRIC_WEIGHTS = {"perplexity" : 0.25,
|
| 82 |
-
"structural" : 0.15,
|
| 83 |
-
"semantic_analysis" : 0.15,
|
| 84 |
-
"entropy" : 0.20,
|
| 85 |
-
"linguistic" : 0.15,
|
| 86 |
-
"multi_perturbation_stability" : 0.10,
|
| 87 |
-
}
|
| 88 |
-
|
| 89 |
-
# Domain-aware model patterns for ALL 16 DOMAINS
|
| 90 |
-
DOMAIN_MODEL_PREFERENCES = {Domain.GENERAL : [AIModel.GPT_4, AIModel.CLAUDE_3_SONNET, AIModel.GEMINI_PRO, AIModel.GPT_3_5],
|
| 91 |
-
Domain.ACADEMIC : [AIModel.GPT_4, AIModel.CLAUDE_3_OPUS, AIModel.GEMINI_ULTRA, AIModel.GPT_4_TURBO],
|
| 92 |
-
Domain.TECHNICAL_DOC : [AIModel.GPT_4_TURBO, AIModel.CLAUDE_3_SONNET, AIModel.LLAMA_3, AIModel.GPT_4],
|
| 93 |
-
Domain.AI_ML : [AIModel.GPT_4_TURBO, AIModel.GPT_4, AIModel.CLAUDE_3_OPUS, AIModel.DEEPSEEK_CODER],
|
| 94 |
-
Domain.SOFTWARE_DEV : [AIModel.GPT_4_TURBO, AIModel.DEEPSEEK_CODER, AIModel.CLAUDE_3_SONNET, AIModel.GPT_4],
|
| 95 |
-
Domain.ENGINEERING : [AIModel.GPT_4, AIModel.CLAUDE_3_OPUS, AIModel.GPT_4_TURBO, AIModel.LLAMA_3],
|
| 96 |
-
Domain.SCIENCE : [AIModel.GPT_4, AIModel.CLAUDE_3_OPUS, AIModel.GEMINI_ULTRA, AIModel.GPT_4_TURBO],
|
| 97 |
-
Domain.BUSINESS : [AIModel.GPT_4, AIModel.CLAUDE_3_SONNET, AIModel.GEMINI_PRO, AIModel.GPT_3_5],
|
| 98 |
-
Domain.LEGAL : [AIModel.GPT_4, AIModel.CLAUDE_3_OPUS, AIModel.GPT_4_TURBO, AIModel.CLAUDE_3_SONNET],
|
| 99 |
-
Domain.MEDICAL : [AIModel.GPT_4, AIModel.CLAUDE_3_OPUS, AIModel.GEMINI_ULTRA, AIModel.GPT_4_TURBO],
|
| 100 |
-
Domain.JOURNALISM : [AIModel.GPT_4, AIModel.CLAUDE_3_SONNET, AIModel.GEMINI_PRO, AIModel.GPT_3_5],
|
| 101 |
-
Domain.CREATIVE : [AIModel.CLAUDE_3_OPUS, AIModel.GPT_4, AIModel.GEMINI_PRO, AIModel.CLAUDE_3_SONNET],
|
| 102 |
-
Domain.MARKETING : [AIModel.GPT_4, AIModel.CLAUDE_3_SONNET, AIModel.GEMINI_PRO, AIModel.GPT_3_5],
|
| 103 |
-
Domain.SOCIAL_MEDIA : [AIModel.GPT_3_5, AIModel.GEMINI_PRO, AIModel.DEEPSEEK_CHAT, AIModel.LLAMA_3],
|
| 104 |
-
Domain.BLOG_PERSONAL : [AIModel.CLAUDE_3_SONNET, AIModel.GPT_4, AIModel.GEMINI_PRO, AIModel.GPT_3_5],
|
| 105 |
-
Domain.TUTORIAL : [AIModel.GPT_4, AIModel.CLAUDE_3_SONNET, AIModel.GEMINI_PRO, AIModel.GPT_4_TURBO],
|
| 106 |
-
}
|
| 107 |
-
|
| 108 |
-
# Model-specific fingerprints with comprehensive patterns
|
| 109 |
-
MODEL_FINGERPRINTS = {AIModel.GPT_3_5 : {"phrases" : ["as an ai language model",
|
| 110 |
-
"i don't have personal opinions",
|
| 111 |
-
"it's important to note that",
|
| 112 |
-
"it's worth noting that",
|
| 113 |
-
"keep in mind that",
|
| 114 |
-
"bear in mind that",
|
| 115 |
-
"i should point out",
|
| 116 |
-
"it's also important to",
|
| 117 |
-
"additionally, it's worth",
|
| 118 |
-
"furthermore, it should be",
|
| 119 |
-
"i cannot provide",
|
| 120 |
-
"i'm unable to",
|
| 121 |
-
"i don't have the ability",
|
| 122 |
-
"based on the information",
|
| 123 |
-
"according to the context",
|
| 124 |
-
],
|
| 125 |
-
"sentence_starters" : ["however,",
|
| 126 |
-
"additionally,",
|
| 127 |
-
"furthermore,",
|
| 128 |
-
"moreover,",
|
| 129 |
-
"in conclusion,",
|
| 130 |
-
"therefore,",
|
| 131 |
-
"consequently,",
|
| 132 |
-
"as a result,",
|
| 133 |
-
"in summary,",
|
| 134 |
-
"ultimately,",
|
| 135 |
-
],
|
| 136 |
-
"structural_patterns" : ["firstly",
|
| 137 |
-
"secondly",
|
| 138 |
-
"thirdly",
|
| 139 |
-
"on one hand",
|
| 140 |
-
"on the other hand",
|
| 141 |
-
"in terms of",
|
| 142 |
-
"with regard to",
|
| 143 |
-
],
|
| 144 |
-
"punctuation_patterns" : {"em_dash_frequency" : (0.01, 0.03),
|
| 145 |
-
"semicolon_frequency" : (0.005, 0.015),
|
| 146 |
-
"parentheses_frequency" : (0.01, 0.04),
|
| 147 |
-
},
|
| 148 |
-
"style_markers" : {"avg_sentence_length" : (18, 25),
|
| 149 |
-
"transition_word_density" : (0.08, 0.15),
|
| 150 |
-
"formality_score" : (0.7, 0.9),
|
| 151 |
-
"hedging_language" : (0.05, 0.12),
|
| 152 |
-
}
|
| 153 |
-
},
|
| 154 |
-
AIModel.GPT_4 : {"phrases" : ["it's important to note that",
|
| 155 |
-
"it's worth mentioning that",
|
| 156 |
-
"to clarify this point",
|
| 157 |
-
"in other words,",
|
| 158 |
-
"that being said,",
|
| 159 |
-
"in essence,",
|
| 160 |
-
"fundamentally,",
|
| 161 |
-
"at its core,",
|
| 162 |
-
"from a broader perspective",
|
| 163 |
-
"when considering",
|
| 164 |
-
"this suggests that",
|
| 165 |
-
"this implies that",
|
| 166 |
-
"it follows that",
|
| 167 |
-
"consequently,",
|
| 168 |
-
"accordingly,",
|
| 169 |
-
],
|
| 170 |
-
"sentence_starters" : ["interestingly,",
|
| 171 |
-
"notably,",
|
| 172 |
-
"crucially,",
|
| 173 |
-
"essentially,",
|
| 174 |
-
"ultimately,",
|
| 175 |
-
"significantly,",
|
| 176 |
-
"importantly,",
|
| 177 |
-
"remarkably,",
|
| 178 |
-
"surprisingly,",
|
| 179 |
-
],
|
| 180 |
-
"structural_patterns" : ["in light of",
|
| 181 |
-
"with respect to",
|
| 182 |
-
"pertaining to",
|
| 183 |
-
"as evidenced by",
|
| 184 |
-
"as indicated by",
|
| 185 |
-
"as suggested by",
|
| 186 |
-
],
|
| 187 |
-
"punctuation_patterns" : {"em_dash_frequency" : (0.02, 0.05),
|
| 188 |
-
"colon_frequency" : (0.01, 0.03),
|
| 189 |
-
"semicolon_frequency" : (0.01, 0.02),
|
| 190 |
-
},
|
| 191 |
-
"style_markers" : {"avg_sentence_length" : (20, 28),
|
| 192 |
-
"vocabulary_sophistication" : (0.7, 0.9),
|
| 193 |
-
"conceptual_density" : (0.6, 0.85),
|
| 194 |
-
"analytical_depth" : (0.65, 0.9),
|
| 195 |
-
}
|
| 196 |
-
},
|
| 197 |
-
AIModel.CLAUDE_3_OPUS : {"phrases" : ["i'd be glad to",
|
| 198 |
-
"i'm happy to help",
|
| 199 |
-
"let me explain this",
|
| 200 |
-
"to clarify this further",
|
| 201 |
-
"in this context,",
|
| 202 |
-
"from this perspective,",
|
| 203 |
-
"building on that point",
|
| 204 |
-
"expanding on this idea",
|
| 205 |
-
"delving deeper into",
|
| 206 |
-
"to elaborate further",
|
| 207 |
-
"it's worth considering",
|
| 208 |
-
"this raises the question",
|
| 209 |
-
"this highlights the importance",
|
| 210 |
-
"this underscores the need",
|
| 211 |
-
],
|
| 212 |
-
"sentence_starters" : ["certainly,",
|
| 213 |
-
"indeed,",
|
| 214 |
-
"particularly,",
|
| 215 |
-
"specifically,",
|
| 216 |
-
"notably,",
|
| 217 |
-
"importantly,",
|
| 218 |
-
"interestingly,",
|
| 219 |
-
"crucially,",
|
| 220 |
-
],
|
| 221 |
-
"structural_patterns" : ["in other words",
|
| 222 |
-
"to put it differently",
|
| 223 |
-
"that is to say",
|
| 224 |
-
"for instance",
|
| 225 |
-
"for example",
|
| 226 |
-
"as an illustration",
|
| 227 |
-
],
|
| 228 |
-
"punctuation_patterns" : {"em_dash_frequency" : (0.015, 0.04),
|
| 229 |
-
"parenthetical_usage" : (0.02, 0.06),
|
| 230 |
-
"colon_frequency" : (0.008, 0.025),
|
| 231 |
-
},
|
| 232 |
-
"style_markers" : {"avg_sentence_length" : (17, 24),
|
| 233 |
-
"nuanced_language" : (0.6, 0.85),
|
| 234 |
-
"explanatory_depth" : (0.7, 0.95),
|
| 235 |
-
"conceptual_clarity" : (0.65, 0.9),
|
| 236 |
-
}
|
| 237 |
-
},
|
| 238 |
-
AIModel.GEMINI_PRO : {"phrases" : ["here's what you need to know",
|
| 239 |
-
"here's how it works",
|
| 240 |
-
"let's explore this",
|
| 241 |
-
"let's look at this",
|
| 242 |
-
"consider this example",
|
| 243 |
-
"think of it this way",
|
| 244 |
-
"imagine if you will",
|
| 245 |
-
"picture this scenario",
|
| 246 |
-
"to break it down",
|
| 247 |
-
"in simple terms",
|
| 248 |
-
"put simply,",
|
| 249 |
-
"basically,",
|
| 250 |
-
"the key point is",
|
| 251 |
-
"the main idea here",
|
| 252 |
-
],
|
| 253 |
-
"sentence_starters" : ["now,",
|
| 254 |
-
"so,",
|
| 255 |
-
"well,",
|
| 256 |
-
"basically,",
|
| 257 |
-
"essentially,",
|
| 258 |
-
"actually,",
|
| 259 |
-
"technically,",
|
| 260 |
-
"practically,",
|
| 261 |
-
],
|
| 262 |
-
"structural_patterns" : ["on that note",
|
| 263 |
-
"speaking of which",
|
| 264 |
-
"by the way",
|
| 265 |
-
"as a side note",
|
| 266 |
-
"incidentally",
|
| 267 |
-
"in any case",
|
| 268 |
-
],
|
| 269 |
-
"punctuation_patterns" : {"exclamation_frequency" : (0.01, 0.03),
|
| 270 |
-
"question_frequency" : (0.02, 0.05),
|
| 271 |
-
"ellipsis_frequency" : (0.005, 0.02),
|
| 272 |
-
},
|
| 273 |
-
"style_markers" : {"avg_sentence_length" : (15, 22),
|
| 274 |
-
"conversational_tone" : (0.5, 0.8),
|
| 275 |
-
"accessibility_score" : (0.6, 0.9),
|
| 276 |
-
"engagement_level" : (0.55, 0.85),
|
| 277 |
-
}
|
| 278 |
-
},
|
| 279 |
-
AIModel.LLAMA_3 : {"phrases" : ["it's worth noting",
|
| 280 |
-
"it's important to understand",
|
| 281 |
-
"this means that",
|
| 282 |
-
"this indicates that",
|
| 283 |
-
"this shows that",
|
| 284 |
-
"this demonstrates that",
|
| 285 |
-
"based on this,",
|
| 286 |
-
"given this context",
|
| 287 |
-
"in this case,",
|
| 288 |
-
"for this reason",
|
| 289 |
-
"as such,",
|
| 290 |
-
"therefore,",
|
| 291 |
-
],
|
| 292 |
-
"sentence_starters" : ["first,",
|
| 293 |
-
"second,",
|
| 294 |
-
"third,",
|
| 295 |
-
"next,",
|
| 296 |
-
"then,",
|
| 297 |
-
"finally,",
|
| 298 |
-
"overall,",
|
| 299 |
-
"in general,",
|
| 300 |
-
],
|
| 301 |
-
"structural_patterns" : ["in addition",
|
| 302 |
-
"moreover",
|
| 303 |
-
"furthermore",
|
| 304 |
-
"however",
|
| 305 |
-
"nevertheless",
|
| 306 |
-
"nonetheless",
|
| 307 |
-
],
|
| 308 |
-
"punctuation_patterns" : {"comma_frequency" : (0.08, 0.15),
|
| 309 |
-
"period_frequency" : (0.06, 0.12),
|
| 310 |
-
"conjunction_frequency" : (0.05, 0.1),
|
| 311 |
-
},
|
| 312 |
-
"style_markers" : {"avg_sentence_length" : (16, 23),
|
| 313 |
-
"directness_score" : (0.6, 0.85),
|
| 314 |
-
"clarity_score" : (0.65, 0.9),
|
| 315 |
-
"structural_consistency" : (0.7, 0.95),
|
| 316 |
-
}
|
| 317 |
-
},
|
| 318 |
-
AIModel.DEEPSEEK_CHAT : {"phrases" : ["i understand you're asking",
|
| 319 |
-
"let me help you with that",
|
| 320 |
-
"i can assist you with",
|
| 321 |
-
"regarding your question",
|
| 322 |
-
"to answer your question",
|
| 323 |
-
"in response to your query",
|
| 324 |
-
"based on your request",
|
| 325 |
-
"as per your question",
|
| 326 |
-
"concerning your inquiry",
|
| 327 |
-
"with respect to your question",
|
| 328 |
-
"i'll do my best to",
|
| 329 |
-
"i'll try to help you",
|
| 330 |
-
"allow me to explain",
|
| 331 |
-
"let me break it down",
|
| 332 |
-
],
|
| 333 |
-
"sentence_starters" : ["well,",
|
| 334 |
-
"okay,",
|
| 335 |
-
"so,",
|
| 336 |
-
"now,",
|
| 337 |
-
"first,",
|
| 338 |
-
"actually,",
|
| 339 |
-
"specifically,",
|
| 340 |
-
"generally,",
|
| 341 |
-
],
|
| 342 |
-
"structural_patterns" : ["in other words",
|
| 343 |
-
"to put it simply",
|
| 344 |
-
"that is",
|
| 345 |
-
"for example",
|
| 346 |
-
"for instance",
|
| 347 |
-
"such as",
|
| 348 |
-
],
|
| 349 |
-
"punctuation_patterns" : {"comma_frequency" : (0.07, 0.14),
|
| 350 |
-
"period_frequency" : (0.05, 0.11),
|
| 351 |
-
"question_frequency" : (0.01, 0.04),
|
| 352 |
-
},
|
| 353 |
-
"style_markers" : {"avg_sentence_length" : (14, 21),
|
| 354 |
-
"helpfulness_tone" : (0.6, 0.9),
|
| 355 |
-
"explanatory_style" : (0.55, 0.85),
|
| 356 |
-
"user_focus" : (0.65, 0.95),
|
| 357 |
-
}
|
| 358 |
-
},
|
| 359 |
-
AIModel.MIXTRAL : {"phrases" : ["it should be noted that",
|
| 360 |
-
"it is important to recognize",
|
| 361 |
-
"this suggests that",
|
| 362 |
-
"this implies that",
|
| 363 |
-
"this indicates that",
|
| 364 |
-
"from this we can see",
|
| 365 |
-
"based on this analysis",
|
| 366 |
-
"considering these points",
|
| 367 |
-
"taking into account",
|
| 368 |
-
"in light of these factors",
|
| 369 |
-
],
|
| 370 |
-
"sentence_starters" : ["however,",
|
| 371 |
-
"moreover,",
|
| 372 |
-
"furthermore,",
|
| 373 |
-
"additionally,",
|
| 374 |
-
"conversely,",
|
| 375 |
-
"similarly,",
|
| 376 |
-
"likewise,",
|
| 377 |
-
],
|
| 378 |
-
"structural_patterns" : ["on the one hand",
|
| 379 |
-
"on the other hand",
|
| 380 |
-
"in contrast",
|
| 381 |
-
"by comparison",
|
| 382 |
-
"as opposed to",
|
| 383 |
-
"rather than",
|
| 384 |
-
],
|
| 385 |
-
"punctuation_patterns" : {"semicolon_frequency" : (0.008, 0.02),
|
| 386 |
-
"colon_frequency" : (0.006, 0.018),
|
| 387 |
-
"parentheses_frequency" : (0.012, 0.035),
|
| 388 |
-
},
|
| 389 |
-
"style_markers" : {"avg_sentence_length" : (19, 26),
|
| 390 |
-
"analytical_tone" : (0.65, 0.9),
|
| 391 |
-
"comparative_language" : (0.5, 0.8),
|
| 392 |
-
"balanced_perspective" : (0.6, 0.85),
|
| 393 |
-
}
|
| 394 |
-
}
|
| 395 |
-
}
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
def __init__(self):
|
| 399 |
-
"""
|
| 400 |
-
Initialize model attributor with domain awareness
|
| 401 |
-
"""
|
| 402 |
-
self.is_initialized = False
|
| 403 |
-
logger.info("ModelAttributor initialized with domain-aware calibration")
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
def initialize(self) -> bool:
|
| 407 |
-
"""
|
| 408 |
-
Initialize attribution system
|
| 409 |
-
"""
|
| 410 |
-
try:
|
| 411 |
-
self.is_initialized = True
|
| 412 |
-
logger.success("Model attribution system initialized with metric ensemble")
|
| 413 |
-
return True
|
| 414 |
-
|
| 415 |
-
except Exception as e:
|
| 416 |
-
logger.error(f"Failed to initialize attribution system: {repr(e)}")
|
| 417 |
-
return False
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
def attribute(self, text: str, processed_text: Optional[ProcessedText] = None, metric_results: Optional[Dict[str, MetricResult]] = None,
|
| 421 |
-
domain: Domain = Domain.GENERAL) -> AttributionResult:
|
| 422 |
-
"""
|
| 423 |
-
Attribute text to specific AI model with domain awareness
|
| 424 |
-
|
| 425 |
-
Arguments:
|
| 426 |
-
----------
|
| 427 |
-
text { str } : Input text
|
| 428 |
-
|
| 429 |
-
processed_text { ProcessedText } : Processed text metadata
|
| 430 |
-
|
| 431 |
-
metric_results { dict } : Results from 6 core metrics
|
| 432 |
-
|
| 433 |
-
domain { Domain } : Text domain for calibration
|
| 434 |
-
|
| 435 |
-
Returns:
|
| 436 |
-
--------
|
| 437 |
-
{ AttributionResult } : Attribution result with domain context
|
| 438 |
-
"""
|
| 439 |
-
try:
|
| 440 |
-
# Get domain-specific model preferences
|
| 441 |
-
domain_preferences = self.DOMAIN_MODEL_PREFERENCES.get(domain, [AIModel.GPT_4, AIModel.CLAUDE_3_SONNET])
|
| 442 |
-
|
| 443 |
-
# Fingerprint analysis
|
| 444 |
-
fingerprint_scores = self._calculate_fingerprint_scores(text = text,
|
| 445 |
-
domain = domain,
|
| 446 |
-
)
|
| 447 |
-
|
| 448 |
-
# Statistical pattern analysis
|
| 449 |
-
statistical_scores = self._analyze_statistical_patterns(text = text,
|
| 450 |
-
domain = domain,
|
| 451 |
-
)
|
| 452 |
-
|
| 453 |
-
# Metric-based attribution using all 6 metrics
|
| 454 |
-
metric_scores = self._analyze_metric_patterns(metric_results = metric_results, domain = domain) if metric_results else {}
|
| 455 |
-
|
| 456 |
-
# Ensemble Combination
|
| 457 |
-
combined_scores, metric_contributions = self._combine_attribution_scores(fingerprint_scores = fingerprint_scores,
|
| 458 |
-
statistical_scores = statistical_scores,
|
| 459 |
-
metric_scores = metric_scores,
|
| 460 |
-
domain = domain,
|
| 461 |
-
)
|
| 462 |
-
|
| 463 |
-
# Domain-aware prediction : Always show the actual highest probability model
|
| 464 |
-
predicted_model, confidence = self._make_domain_aware_prediction(combined_scores = combined_scores,
|
| 465 |
-
domain = domain,
|
| 466 |
-
domain_preferences = domain_preferences,
|
| 467 |
-
)
|
| 468 |
-
|
| 469 |
-
# Reasoning with domain context
|
| 470 |
-
reasoning = self._generate_detailed_reasoning(predicted_model = predicted_model,
|
| 471 |
-
confidence = confidence,
|
| 472 |
-
domain = domain,
|
| 473 |
-
metric_contributions = metric_contributions,
|
| 474 |
-
combined_scores = combined_scores,
|
| 475 |
-
)
|
| 476 |
-
|
| 477 |
-
return AttributionResult(predicted_model = predicted_model,
|
| 478 |
-
confidence = confidence,
|
| 479 |
-
model_probabilities = combined_scores,
|
| 480 |
-
reasoning = reasoning,
|
| 481 |
-
fingerprint_matches = self._get_top_fingerprints(fingerprint_scores),
|
| 482 |
-
domain_used = domain,
|
| 483 |
-
metric_contributions = metric_contributions,
|
| 484 |
-
)
|
| 485 |
-
|
| 486 |
-
except Exception as e:
|
| 487 |
-
logger.error(f"Error in model attribution: {repr(e)}")
|
| 488 |
-
return self._create_unknown_result(domain)
|
| 489 |
-
|
| 490 |
-
|
    def _calculate_fingerprint_scores(self, text: str, domain: Domain) -> Dict[AIModel, float]:
        """
        Calculate fingerprint match scores with domain calibration - for all domains
        """
        scores = {model: 0.0 for model in AIModel if model not in [AIModel.HUMAN, AIModel.UNKNOWN]}

        # Adjust sensitivity based on all domains
        domain_sensitivity = {Domain.GENERAL       : 1.00,
                              Domain.ACADEMIC      : 1.20,
                              Domain.CREATIVE      : 0.90,
                              Domain.AI_ML         : 1.15,
                              Domain.SOFTWARE_DEV  : 1.15,
                              Domain.TECHNICAL_DOC : 1.10,
                              Domain.ENGINEERING   : 1.10,
                              Domain.SCIENCE       : 1.20,
                              Domain.BUSINESS      : 1.05,
                              Domain.LEGAL         : 1.25,
                              Domain.MEDICAL       : 1.20,
                              Domain.JOURNALISM    : 1.00,
                              Domain.MARKETING     : 0.95,
                              Domain.SOCIAL_MEDIA  : 0.80,
                              Domain.BLOG_PERSONAL : 0.90,
                              Domain.TUTORIAL      : 1.00,
                              }

        sensitivity = domain_sensitivity.get(domain, 1.0)
        text_lower  = text.lower()

        for model, fingerprints in self.MODEL_FINGERPRINTS.items():
            match_count  = 0
            total_checks = 0

            # Check phrase matches
            if ("phrases" in fingerprints):
                for phrase in fingerprints["phrases"]:
                    if (phrase in text_lower):
                        match_count += 3

                    total_checks += 1

            # Check sentence starters
            if ("sentence_starters" in fingerprints):
                sentences = re.split(r'[.!?]+', text)
                for sentence in sentences:
                    sentence = sentence.strip().lower()
                    for starter in fingerprints["sentence_starters"]:
                        if (sentence.startswith(starter)):
                            match_count += 2
                            break

                total_checks += len(sentences)

            # Check structural patterns
            if ("structural_patterns" in fingerprints):
                for pattern in fingerprints["structural_patterns"]:
                    if (pattern in text_lower):
                        match_count += 2

                    total_checks += 1

            # Calculate normalized score
            if (total_checks > 0):
                base_score = min(1.0, match_count / (total_checks * 0.5))
                # Apply domain calibration
                scores[model] = min(1.0, base_score * sensitivity)

        return scores

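For reference, the normalization at the end of this method caps the raw weighted match count before domain calibration. A minimal standalone sketch of that arithmetic, with illustrative numbers that are not taken from the real `MODEL_FINGERPRINTS` tables:

```python
# Standalone sketch of the fingerprint score normalization used above.
def fingerprint_score(match_count: int, total_checks: int, sensitivity: float) -> float:
    if total_checks == 0:
        return 0.0
    base_score = min(1.0, match_count / (total_checks * 0.5))   # cap raw evidence at 1.0
    return min(1.0, base_score * sensitivity)                   # then apply domain boost

# e.g. 2 weighted matches over 10 checks in an ACADEMIC text (sensitivity 1.20):
print(fingerprint_score(2, 10, 1.20))   # 0.48
```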
    def _analyze_statistical_patterns(self, text: str, domain: Domain) -> Dict[AIModel, float]:
        """
        Analyze statistical patterns to identify the model, with domain awareness
        """
        scores = {model: 0.3 for model in AIModel if model not in [AIModel.HUMAN, AIModel.UNKNOWN]}

        # Calculate text statistics
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        words     = text.split()

        if not sentences or not words:
            return scores

        # Basic statistics
        avg_sentence_length = len(words) / len(sentences)
        word_count          = len(words)
        sentence_count      = len(sentences)

        # Punctuation frequencies
        em_dash_freq     = text.count('—') / word_count if word_count else 0
        semicolon_freq   = text.count(';') / word_count if word_count else 0
        colon_freq       = text.count(':') / word_count if word_count else 0
        comma_freq       = text.count(',') / word_count if word_count else 0
        question_freq    = text.count('?') / sentence_count if sentence_count else 0
        exclamation_freq = text.count('!') / sentence_count if sentence_count else 0

        # DOMAIN-AWARE: Adjust expectations based on domains
        domain_adjustments = {Domain.GENERAL       : 1.00,
                              Domain.ACADEMIC      : 1.10,
                              Domain.CREATIVE      : 0.95,
                              Domain.AI_ML         : 1.05,
                              Domain.SOFTWARE_DEV  : 1.05,
                              Domain.TECHNICAL_DOC : 1.05,
                              Domain.ENGINEERING   : 1.05,
                              Domain.SCIENCE       : 1.08,
                              Domain.BUSINESS      : 1.00,
                              Domain.LEGAL         : 1.12,
                              Domain.MEDICAL       : 1.08,
                              Domain.JOURNALISM    : 0.95,
                              Domain.MARKETING     : 0.92,
                              Domain.SOCIAL_MEDIA  : 0.85,
                              Domain.BLOG_PERSONAL : 0.95,
                              Domain.TUTORIAL      : 1.00,
                              }

        domain_factor = domain_adjustments.get(domain, 1.0)

        # Compare against model fingerprints
        for model, fingerprints in self.MODEL_FINGERPRINTS.items():
            if ("style_markers" not in fingerprints) or ("punctuation_patterns" not in fingerprints):
                continue

            style       = fingerprints["style_markers"]
            punct       = fingerprints["punctuation_patterns"]
            match_score = 0.3

            # Check sentence length with domain adjustment
            if ("avg_sentence_length" in style):
                min_len, max_len = style["avg_sentence_length"]
                adjusted_min     = min_len * domain_factor
                adjusted_max     = max_len * domain_factor

                if (adjusted_min <= avg_sentence_length <= adjusted_max):
                    match_score += 0.25

            # Check punctuation patterns
            punctuation_checks = [("em_dash_frequency", em_dash_freq),
                                  ("semicolon_frequency", semicolon_freq),
                                  ("colon_frequency", colon_freq),
                                  ("comma_frequency", comma_freq),
                                  ("question_frequency", question_freq),
                                  ("exclamation_frequency", exclamation_freq),
                                  ]

            for pattern_name, observed_freq in punctuation_checks:
                if (pattern_name in punct):
                    min_freq, max_freq = punct[pattern_name]

                    if (min_freq <= observed_freq <= max_freq):
                        match_score += 0.08

            scores[model] = min(1.0, match_score)

        return scores

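The interval test above rewards a model whenever an observed frequency falls inside that model's expected band. A small self-contained illustration (the range below is made up, not a real fingerprint value):

```python
# Illustrative interval check mirroring the punctuation matching above.
punct      = {"semicolon_frequency": (0.005, 0.02)}   # hypothetical expected band
word_count = 400
semicolon_freq = 6 / word_count                       # 0.015 semicolons per word

match_score = 0.3                                     # same baseline as the method
min_freq, max_freq = punct["semicolon_frequency"]
if min_freq <= semicolon_freq <= max_freq:
    match_score += 0.08                               # observed value is in-band
print(round(match_score, 2))                          # 0.38
```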
    def _analyze_metric_patterns(self, metric_results: Dict[str, MetricResult], domain: Domain) -> Dict[AIModel, float]:
        """
        Use all 6 metrics with proper weights for attribution
        """
        scores = {model: 0.0 for model in AIModel if model not in [AIModel.HUMAN, AIModel.UNKNOWN]}

        if not metric_results:
            return scores

        # DOMAIN-AWARE: Adjust metric sensitivity based on domain
        domain_metric_weights = {Domain.GENERAL       : {"perplexity": 1.0, "structural": 1.0, "entropy": 1.0, "semantic_analysis": 1.0, "linguistic": 1.0, "multi_perturbation_stability": 1.0},
                                 Domain.ACADEMIC      : {"perplexity": 1.2, "structural": 1.0, "entropy": 0.9, "semantic_analysis": 1.1, "linguistic": 1.3, "multi_perturbation_stability": 0.8},
                                 Domain.TECHNICAL_DOC : {"perplexity": 1.2, "structural": 1.1, "entropy": 0.9, "semantic_analysis": 1.2, "linguistic": 1.1, "multi_perturbation_stability": 0.8},
                                 Domain.AI_ML         : {"perplexity": 1.3, "structural": 1.0, "entropy": 0.9, "semantic_analysis": 1.2, "linguistic": 1.2, "multi_perturbation_stability": 0.8},
                                 Domain.SOFTWARE_DEV  : {"perplexity": 1.2, "structural": 1.1, "entropy": 0.9, "semantic_analysis": 1.1, "linguistic": 1.0, "multi_perturbation_stability": 0.9},
                                 Domain.ENGINEERING   : {"perplexity": 1.2, "structural": 1.1, "entropy": 0.9, "semantic_analysis": 1.1, "linguistic": 1.2, "multi_perturbation_stability": 0.8},
                                 Domain.SCIENCE       : {"perplexity": 1.2, "structural": 1.0, "entropy": 0.9, "semantic_analysis": 1.2, "linguistic": 1.3, "multi_perturbation_stability": 0.8},
                                 Domain.BUSINESS      : {"perplexity": 1.1, "structural": 1.0, "entropy": 1.0, "semantic_analysis": 1.2, "linguistic": 1.1, "multi_perturbation_stability": 0.9},
                                 Domain.LEGAL         : {"perplexity": 1.2, "structural": 1.1, "entropy": 0.9, "semantic_analysis": 1.3, "linguistic": 1.3, "multi_perturbation_stability": 0.8},
                                 Domain.MEDICAL       : {"perplexity": 1.2, "structural": 1.0, "entropy": 0.9, "semantic_analysis": 1.2, "linguistic": 1.2, "multi_perturbation_stability": 0.8},
                                 Domain.JOURNALISM    : {"perplexity": 1.1, "structural": 1.0, "entropy": 1.0, "semantic_analysis": 1.1, "linguistic": 1.1, "multi_perturbation_stability": 0.9},
                                 Domain.CREATIVE      : {"perplexity": 0.9, "structural": 0.9, "entropy": 1.2, "semantic_analysis": 1.0, "linguistic": 1.3, "multi_perturbation_stability": 0.9},
                                 Domain.MARKETING     : {"perplexity": 1.0, "structural": 1.0, "entropy": 1.1, "semantic_analysis": 1.1, "linguistic": 1.2, "multi_perturbation_stability": 0.8},
                                 Domain.SOCIAL_MEDIA  : {"perplexity": 1.0, "structural": 0.8, "entropy": 1.3, "semantic_analysis": 0.9, "linguistic": 0.9, "multi_perturbation_stability": 0.9},
                                 Domain.BLOG_PERSONAL : {"perplexity": 1.0, "structural": 0.9, "entropy": 1.2, "semantic_analysis": 1.0, "linguistic": 1.1, "multi_perturbation_stability": 0.8},
                                 Domain.TUTORIAL      : {"perplexity": 1.1, "structural": 1.0, "entropy": 1.0, "semantic_analysis": 1.1, "linguistic": 1.1, "multi_perturbation_stability": 0.9},
                                 }

        domain_weights = domain_metric_weights.get(domain, domain_metric_weights[Domain.GENERAL])

        # PERPLEXITY ANALYSIS (25% weight)
        if ("perplexity" in metric_results):
            perplexity_result  = metric_results["perplexity"]
            overall_perplexity = perplexity_result.details.get("overall_perplexity", 50)
            domain_weight      = domain_weights.get("perplexity", 1.0)

            # GPT models typically have lower perplexity
            if (overall_perplexity < 25):
                scores[AIModel.GPT_4]       += 0.6 * self.METRIC_WEIGHTS["perplexity"] * domain_weight
                scores[AIModel.GPT_4_TURBO] += 0.5 * self.METRIC_WEIGHTS["perplexity"] * domain_weight

            elif (overall_perplexity < 35):
                scores[AIModel.GPT_3_5]    += 0.4 * self.METRIC_WEIGHTS["perplexity"] * domain_weight
                scores[AIModel.GEMINI_PRO] += 0.3 * self.METRIC_WEIGHTS["perplexity"] * domain_weight

        # STRUCTURAL ANALYSIS (15% weight)
        if ("structural" in metric_results):
            structural_result = metric_results["structural"]
            burstiness        = structural_result.details.get("burstiness_score", 0.5)
            uniformity        = structural_result.details.get("length_uniformity", 0.5)
            domain_weight     = domain_weights.get("structural", 1.0)

            # Claude models show more structural consistency
            if (uniformity > 0.7):
                scores[AIModel.CLAUDE_3_OPUS]   += 0.5 * self.METRIC_WEIGHTS["structural"] * domain_weight
                scores[AIModel.CLAUDE_3_SONNET] += 0.4 * self.METRIC_WEIGHTS["structural"] * domain_weight

        # SEMANTIC ANALYSIS (15% weight)
        if ("semantic_analysis" in metric_results):
            semantic_result = metric_results["semantic_analysis"]
            coherence       = semantic_result.details.get("coherence_score", 0.5)
            consistency     = semantic_result.details.get("consistency_score", 0.5)
            domain_weight   = domain_weights.get("semantic_analysis", 1.0)

            # GPT-4 shows exceptional semantic coherence
            if (coherence > 0.8):
                scores[AIModel.GPT_4]       += 0.7 * self.METRIC_WEIGHTS["semantic_analysis"] * domain_weight
                scores[AIModel.GPT_4_TURBO] += 0.6 * self.METRIC_WEIGHTS["semantic_analysis"] * domain_weight

        # ENTROPY ANALYSIS (20% weight)
        if ("entropy" in metric_results):
            entropy_result            = metric_results["entropy"]
            token_diversity           = entropy_result.details.get("token_diversity", 0.5)
            sequence_unpredictability = entropy_result.details.get("sequence_unpredictability", 0.5)
            domain_weight             = domain_weights.get("entropy", 1.0)

            # Higher entropy diversity suggests more sophisticated models
            if (token_diversity > 0.7):
                scores[AIModel.CLAUDE_3_OPUS] += 0.6 * self.METRIC_WEIGHTS["entropy"] * domain_weight
                scores[AIModel.GPT_4]         += 0.5 * self.METRIC_WEIGHTS["entropy"] * domain_weight

        # LINGUISTIC ANALYSIS (15% weight)
        if ("linguistic" in metric_results):
            linguistic_result    = metric_results["linguistic"]
            pos_diversity        = linguistic_result.details.get("pos_diversity", 0.5)
            syntactic_complexity = linguistic_result.details.get("syntactic_complexity", 2.5)
            domain_weight        = domain_weights.get("linguistic", 1.0)

            # Complex linguistic patterns suggest advanced models
            if (syntactic_complexity > 3.0):
                scores[AIModel.CLAUDE_3_OPUS] += 0.5 * self.METRIC_WEIGHTS["linguistic"] * domain_weight
                scores[AIModel.GPT_4]         += 0.4 * self.METRIC_WEIGHTS["linguistic"] * domain_weight

        # MULTI-PERTURBATION STABILITY ANALYSIS (10% weight)
        if ("multi_perturbation_stability" in metric_results):
            multi_perturbation_stability_result = metric_results["multi_perturbation_stability"]
            stability = multi_perturbation_stability_result.details.get("stability_score", 0.5)
            curvature = multi_perturbation_stability_result.details.get("curvature_score", 0.5)

            # Specific stability patterns for different model families
            if (0.4 <= stability <= 0.6):
                scores[AIModel.MIXTRAL] += 0.4 * self.METRIC_WEIGHTS["multi_perturbation_stability"]
                scores[AIModel.LLAMA_3] += 0.3 * self.METRIC_WEIGHTS["multi_perturbation_stability"]

        # Normalize scores
        for model in scores:
            scores[model] = min(1.0, scores[model])

        return scores

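Each branch adds a vote scaled by the metric's ensemble share and the domain boost. A worked instance of one increment above, assuming `METRIC_WEIGHTS["perplexity"]` is 0.25 to match the 25% noted in the comment:

```python
# One increment from the perplexity branch: GPT-4 credit in the AI_ML domain.
base_vote     = 0.6    # strength of the "low perplexity" signal for GPT-4
metric_weight = 0.25   # perplexity's assumed share of the metric ensemble
domain_weight = 1.3    # AI_ML boost for perplexity (from the table above)
print(base_vote * metric_weight * domain_weight)   # 0.195
```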
    def _combine_attribution_scores(self, fingerprint_scores: Dict[AIModel, float], statistical_scores: Dict[AIModel, float],
                                    metric_scores: Dict[AIModel, float], domain: Domain) -> Tuple[Dict[str, float], Dict[str, float]]:
        """
        ENSEMBLE COMBINATION using document-specified weights and domain awareness
        """
        # DOMAIN-AWARE weighting for ALL 16 DOMAINS
        domain_weights = {Domain.GENERAL       : {"fingerprint": 0.35, "statistical": 0.30, "metric": 0.35},
                          Domain.ACADEMIC      : {"fingerprint": 0.30, "statistical": 0.35, "metric": 0.35},
                          Domain.TECHNICAL_DOC : {"fingerprint": 0.25, "statistical": 0.40, "metric": 0.35},
                          Domain.AI_ML         : {"fingerprint": 0.28, "statistical": 0.37, "metric": 0.35},
                          Domain.SOFTWARE_DEV  : {"fingerprint": 0.27, "statistical": 0.38, "metric": 0.35},
                          Domain.ENGINEERING   : {"fingerprint": 0.28, "statistical": 0.37, "metric": 0.35},
                          Domain.SCIENCE       : {"fingerprint": 0.30, "statistical": 0.35, "metric": 0.35},
                          Domain.BUSINESS      : {"fingerprint": 0.33, "statistical": 0.35, "metric": 0.32},
                          Domain.LEGAL         : {"fingerprint": 0.28, "statistical": 0.40, "metric": 0.32},
                          Domain.MEDICAL       : {"fingerprint": 0.30, "statistical": 0.38, "metric": 0.32},
                          Domain.JOURNALISM    : {"fingerprint": 0.35, "statistical": 0.33, "metric": 0.32},
                          Domain.CREATIVE      : {"fingerprint": 0.40, "statistical": 0.30, "metric": 0.30},
                          Domain.MARKETING     : {"fingerprint": 0.38, "statistical": 0.32, "metric": 0.30},
                          Domain.SOCIAL_MEDIA  : {"fingerprint": 0.45, "statistical": 0.35, "metric": 0.20},
                          Domain.BLOG_PERSONAL : {"fingerprint": 0.42, "statistical": 0.32, "metric": 0.26},
                          Domain.TUTORIAL      : {"fingerprint": 0.33, "statistical": 0.35, "metric": 0.32},
                          }

        weights = domain_weights.get(domain, domain_weights[Domain.GENERAL])

        combined             = dict()
        metric_contributions = dict()

        all_models = set(fingerprint_scores.keys()) | set(statistical_scores.keys()) | set(metric_scores.keys())

        for model in all_models:
            score = (fingerprint_scores.get(model, 0.0) * weights["fingerprint"] +
                     statistical_scores.get(model, 0.0) * weights["statistical"] +
                     metric_scores.get(model, 0.0) * weights["metric"]
                     )

            combined[model.value] = score

        # Normalize scores to sum to 1.0 for proper probability distribution
        total_score = sum(combined.values())

        if (total_score > 0):
            combined = {model: score / total_score for model, score in combined.items()}

        # Calculate metric contributions for explainability
        if metric_scores:
            total_metric_impact = sum(metric_scores.values())
            if (total_metric_impact > 0):
                for model, score in metric_scores.items():
                    metric_contributions[model.value] = score / total_metric_impact

        return combined, metric_contributions

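A minimal sketch of this weighted combination with toy scores (all values below are hypothetical; the real weights come from the `domain_weights` table above):

```python
# Toy run of the ensemble combination: two candidate models, GENERAL weights.
weights     = {"fingerprint": 0.35, "statistical": 0.30, "metric": 0.35}
fingerprint = {"gpt-4": 0.6, "claude-3-opus": 0.2}
statistical = {"gpt-4": 0.5, "claude-3-opus": 0.4}
metric      = {"gpt-4": 0.3, "claude-3-opus": 0.1}

combined = {}
for model in fingerprint:
    combined[model] = (fingerprint[model] * weights["fingerprint"]
                       + statistical[model] * weights["statistical"]
                       + metric[model] * weights["metric"])

total    = sum(combined.values())
combined = {m: s / total for m, s in combined.items()}   # normalize to a distribution
print(combined)   # {'gpt-4': ~0.67, 'claude-3-opus': ~0.33}
```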
    def _make_domain_aware_prediction(self, combined_scores: Dict[str, float], domain: Domain, domain_preferences: List[AIModel]) -> Tuple[AIModel, float]:
        """
        Domain-aware prediction that considers domain-specific model preferences
        """
        if not combined_scores:
            return AIModel.UNKNOWN, 0.0

        # Find the model with the highest probability
        sorted_models = sorted(combined_scores.items(), key = lambda x: x[1], reverse = True)

        if not sorted_models:
            return AIModel.UNKNOWN, 0.0

        best_model_name, best_score = sorted_models[0]

        # Threshold: only name a model if its score is non-negligible
        if (best_score < 0.01):
            return AIModel.UNKNOWN, best_score

        try:
            best_model = AIModel(best_model_name)

        except ValueError:
            best_model = AIModel.UNKNOWN

        # Confidence combines the absolute score with the margin over the runner-up
        if (len(sorted_models) > 1):
            second_score = sorted_models[1][1]
            margin       = best_score - second_score
            confidence   = min(1.0, best_score * 0.8 + margin * 1.5)

        else:
            confidence = best_score * 0.9

        # Return the best model, with the confidence floored at 0.05
        return best_model, max(0.05, confidence)

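A worked instance of the confidence formula above, with illustrative scores:

```python
# Best candidate scores 0.45, runner-up 0.25 under the combined distribution.
best_score, second_score = 0.45, 0.25
margin     = best_score - second_score                       # 0.20
confidence = min(1.0, best_score * 0.8 + margin * 1.5)       # 0.36 + 0.30 = 0.66
print(max(0.05, confidence))                                 # 0.66
```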
    def _generate_detailed_reasoning(self, predicted_model: AIModel, confidence: float, domain: Domain, metric_contributions: Dict[str, float],
                                     combined_scores: Dict[str, float]) -> List[str]:
        """
        Generate explainable reasoning for the attribution result
        """
        reasoning = []

        reasoning.append("**AI Model Attribution Analysis**")
        reasoning.append("")

        # Show prediction with confidence
        if (predicted_model == AIModel.UNKNOWN):
            reasoning.append("**Most Likely**: Unable to determine with high confidence")

        else:
            model_name = predicted_model.value.replace("-", " ").replace("_", " ").title()
            reasoning.append(f"**Predicted Model**: {model_name}")
            reasoning.append(f"**Confidence**: {confidence*100:.1f}%")

        reasoning.append(f"**Domain**: {domain.value.replace('_', ' ').title()}")
        reasoning.append("")

        # Show model probability distribution
        reasoning.append("**Model Probability Distribution:**")
        reasoning.append("")

        if combined_scores:
            sorted_models = sorted(combined_scores.items(), key = lambda x: x[1], reverse = True)

            for i, (model_name, score) in enumerate(sorted_models[:6]):
                # Skip very low probabilities
                if (score < 0.01):
                    continue

                display_name = model_name.replace("-", " ").replace("_", " ").title()
                percentage   = score * 100

                # Use proper markdown formatting
                reasoning.append(f"• **{display_name}**: {percentage:.1f}%")

        reasoning.append("")

        # Add analysis insights
        reasoning.append("**Analysis Notes:**")

        if (confidence < 0.3):
            reasoning.append("• Low confidence attribution - text patterns are ambiguous")
            reasoning.append("• May be human-written or from multiple AI sources")

        else:
            reasoning.append(f"• Calibrated for {domain.value.replace('_', ' ')} domain")

        # Domain-specific insights
        domain_insights = {Domain.ACADEMIC      : "Academic writing patterns analyzed",
                           Domain.TECHNICAL_DOC : "Technical coherence and structure weighted",
                           Domain.CREATIVE      : "Stylistic and linguistic diversity emphasized",
                           Domain.SOCIAL_MEDIA  : "Casual language and engagement patterns considered",
                           Domain.AI_ML         : "Technical terminology and analytical patterns emphasized",
                           Domain.SOFTWARE_DEV  : "Code-like structures and technical precision weighted",
                           Domain.ENGINEERING   : "Technical specifications and formal language analyzed",
                           Domain.SCIENCE       : "Scientific terminology and methodological patterns considered",
                           Domain.BUSINESS      : "Professional communication and strategic language weighted",
                           Domain.LEGAL         : "Formal language and legal terminology emphasized",
                           Domain.MEDICAL       : "Medical terminology and clinical language analyzed",
                           Domain.JOURNALISM    : "News reporting style and factual presentation weighted",
                           Domain.MARKETING     : "Persuasive language and engagement patterns considered",
                           Domain.BLOG_PERSONAL : "Personal voice and conversational style analyzed",
                           Domain.TUTORIAL      : "Instructional clarity and step-by-step structure weighted",
                           }

        insight = domain_insights.get(domain, "Multiple attribution factors analyzed")

        reasoning.append(f"• {insight}")

        return reasoning

    def _get_top_fingerprints(self, fingerprint_scores: Dict[AIModel, float]) -> Dict[str, int]:
        """
        Get top fingerprint matches for display
        """
        top_matches   = dict()
        sorted_models = sorted(fingerprint_scores.items(), key = lambda x: x[1], reverse = True)[:5]

        for model, score in sorted_models:
            # Only show meaningful matches
            if (score > 0.1):
                top_matches[model.value] = int(score * 100)

        return top_matches


    def _create_unknown_result(self, domain: Domain) -> AttributionResult:
        """
        Create result for unknown attribution with domain context
        """
        return AttributionResult(predicted_model      = AIModel.UNKNOWN,
                                 confidence           = 0.0,
                                 model_probabilities  = {},
                                 reasoning            = [f"Model attribution inconclusive for {domain.value} content. Text may be human-written or from an unidentifiable model"],
                                 fingerprint_matches  = {},
                                 domain_used          = domain,
                                 metric_contributions = {},
                                 )


# Export
__all__ = ["AIModel",
           "ModelAttributor",
           "AttributionResult",
           ]

detector/orchestrator.py
DELETED
@@ -1,576 +0,0 @@
# DEPENDENCIES
import time
from typing import Any
from typing import Dict
from typing import List
from loguru import logger
from typing import Optional
from dataclasses import dataclass
from config.settings import settings
from metrics.entropy import EntropyMetric
from config.threshold_config import Domain
from metrics.base_metric import MetricResult
from detector.ensemble import EnsembleResult
from metrics.perplexity import PerplexityMetric
from metrics.linguistic import LinguisticMetric
from metrics.structural import StructuralMetric
from detector.ensemble import EnsembleClassifier
from processors.text_processor import TextProcessor
from processors.text_processor import ProcessedText
from processors.domain_classifier import DomainClassifier
from processors.domain_classifier import DomainPrediction
from processors.language_detector import LanguageDetector
from metrics.semantic_analysis import SemanticAnalysisMetric
from processors.language_detector import LanguageDetectionResult
from metrics.multi_perturbation_stability import MultiPerturbationStabilityMetric

@dataclass
class DetectionResult:
    """
    Complete detection result with all metadata
    """
    # Final results
    ensemble_result        : EnsembleResult

    # Input metadata
    processed_text         : ProcessedText
    domain_prediction      : DomainPrediction
    language_result        : Optional[LanguageDetectionResult]

    # Metric details
    metric_results         : Dict[str, MetricResult]

    # Performance metrics
    processing_time        : float
    metrics_execution_time : Dict[str, float]

    # Warnings and errors
    warnings               : List[str]
    errors                 : List[str]

    # File information
    file_info              : Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary for JSON serialization
        """
        result = {"prediction"  : {"verdict"           : self.ensemble_result.final_verdict,
                                   "ai_probability"    : round(self.ensemble_result.ai_probability, 4),
                                   "human_probability" : round(self.ensemble_result.human_probability, 4),
                                   "mixed_probability" : round(self.ensemble_result.mixed_probability, 4),
                                   "confidence"        : round(self.ensemble_result.overall_confidence, 4),
                                   },
                  "analysis"    : {"domain"              : self.domain_prediction.primary_domain.value,
                                   "domain_confidence"   : round(self.domain_prediction.confidence, 4),
                                   "language"            : self.language_result.primary_language.value if self.language_result else "unknown",
                                   "language_confidence" : round(self.language_result.confidence, 4) if self.language_result else 0.0,
                                   "text_length"         : self.processed_text.word_count,
                                   "sentence_count"      : self.processed_text.sentence_count,
                                   },
                  "metrics"     : {name: result.to_dict() for name, result in self.metric_results.items()},
                  "ensemble"    : self.ensemble_result.to_dict(),
                  "performance" : {"total_time"   : round(self.processing_time, 3),
                                   "metrics_time" : {name: round(t, 3) for name, t in self.metrics_execution_time.items()},
                                   },
                  "warnings"    : self.warnings,
                  "errors"      : self.errors,
                  }

        # Include file_info if available
        if self.file_info:
            result["file_info"] = self.file_info

        return result

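For orientation, the serialized payload produced by `to_dict()` has roughly the following shape; every value below is made up for illustration:

```python
# Illustrative DetectionResult.to_dict() output (values are invented).
example = {
    "prediction":  {"verdict": "AI-Generated", "ai_probability": 0.8731,
                    "human_probability": 0.1042, "mixed_probability": 0.0227,
                    "confidence": 0.7914},
    "analysis":    {"domain": "academic", "domain_confidence": 0.8123,
                    "language": "en", "language_confidence": 0.9901,
                    "text_length": 412, "sentence_count": 18},
    "metrics":     {},      # one MetricResult.to_dict() per executed metric
    "ensemble":    {},      # EnsembleResult.to_dict()
    "performance": {"total_time": 2.314,
                    "metrics_time": {"perplexity": 0.871, "entropy": 0.042}},
    "warnings":    [],
    "errors":      [],
}
```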
class DetectionOrchestrator:
    """
    Coordinates the entire detection pipeline from text input to final results.

    Pipeline:
    1. Text preprocessing
    2. Domain classification
    3. Language detection (optional)
    4. Metric execution (parallel/sequential)
    5. Ensemble aggregation
    6. Result generation
    """

    def __init__(self, enable_language_detection: bool = False, parallel_execution: bool = False, skip_expensive_metrics: bool = False):
        """
        Initialize detection orchestrator

        Arguments:
        ----------
        enable_language_detection { bool } : Enable language detection step

        parallel_execution { bool }        : Execute metrics in parallel (future feature)

        skip_expensive_metrics { bool }    : Skip computationally expensive metrics
        """
        self.enable_language_detection = enable_language_detection
        self.parallel_execution        = parallel_execution
        self.skip_expensive_metrics    = skip_expensive_metrics

        # Initialize processors
        self.text_processor = TextProcessor(min_text_length = settings.MIN_TEXT_LENGTH,
                                            max_text_length = settings.MAX_TEXT_LENGTH,
                                            )
        self.domain_classifier = DomainClassifier()

        if self.enable_language_detection:
            self.language_detector = LanguageDetector(use_model = True)

        else:
            self.language_detector = None

        # Initialize metrics
        self.metrics = self._initialize_metrics()

        # Initialize ensemble
        self.ensemble = EnsembleClassifier(primary_method       = "confidence_calibrated",
                                           fallback_method      = "domain_weighted",
                                           use_ml_ensemble      = False,
                                           min_metrics_required = 3,
                                           )

        logger.info(f"DetectionOrchestrator initialized (language_detection={enable_language_detection}, skip_expensive={skip_expensive_metrics})")

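A minimal usage sketch of this class as it was driven before this commit (this commit replaces it with `services/orchestrator.py`); the input text and option values are illustrative:

```python
# Sketch: typical pre-commit driver for the orchestrator.
from detector.orchestrator import DetectionOrchestrator

orchestrator = DetectionOrchestrator(enable_language_detection=False,
                                     skip_expensive_metrics=True)

if orchestrator.initialize():                 # loads models; needs >= 3 working metrics
    result = orchestrator.analyze("Text to inspect...")
    print(result.ensemble_result.final_verdict,
          f"{result.ensemble_result.ai_probability:.1%}")
    orchestrator.cleanup()
```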
    def _initialize_metrics(self) -> Dict[str, Any]:
        """
        Initialize all enabled metrics
        """
        metrics = dict()

        # Structural metric (statistical analysis)
        try:
            metrics["structural"] = StructuralMetric()
            logger.debug("Structural metric initialized")

        except Exception as e:
            logger.error(f"Failed to initialize structural metric: {repr(e)}")

        # Entropy metric
        try:
            metrics["entropy"] = EntropyMetric()
            logger.debug("Entropy metric initialized")

        except Exception as e:
            logger.error(f"Failed to initialize entropy metric: {repr(e)}")

        # Perplexity metric
        try:
            metrics["perplexity"] = PerplexityMetric()
            logger.debug("Perplexity metric initialized")

        except Exception as e:
            logger.error(f"Failed to initialize perplexity metric: {repr(e)}")

        # Semantic analysis metric
        try:
            metrics["semantic_analysis"] = SemanticAnalysisMetric()
            logger.debug("Semantic analysis metric initialized")

        except Exception as e:
            logger.error(f"Failed to initialize semantic analysis metric: {repr(e)}")

        # Linguistic metric
        try:
            metrics["linguistic"] = LinguisticMetric()
            logger.debug("Linguistic metric initialized")

        except Exception as e:
            logger.error(f"Failed to initialize linguistic metric: {repr(e)}")

        # MultiPerturbationStability metric (expensive)
        try:
            metrics["multi_perturbation_stability"] = MultiPerturbationStabilityMetric()
            logger.debug("MultiPerturbationStability metric initialized")

        except Exception as e:
            logger.error(f"Failed to initialize MultiPerturbationStability metric: {repr(e)}")

        logger.info(f"Initialized {len(metrics)} metrics: {list(metrics.keys())}")
        return metrics

    def initialize(self) -> bool:
        """
        Initialize all components (load models, etc.)

        Returns:
        --------
        { bool } : True if successful, False otherwise
        """
        try:
            logger.info("Initializing detection pipeline...")

            # Initialize domain classifier
            if not self.domain_classifier.initialize():
                logger.warning("Domain classifier initialization failed")

            # Initialize language detector
            if self.language_detector:
                if not self.language_detector.initialize():
                    logger.warning("Language detector initialization failed")

            # Initialize metrics
            successful_metrics = 0

            for name, metric in self.metrics.items():
                try:
                    if metric.initialize():
                        successful_metrics += 1
                        logger.debug(f"Metric {name} initialized successfully")

                    else:
                        logger.warning(f"Metric {name} initialization failed")

                except Exception as e:
                    logger.error(f"Error initializing metric {name}: {repr(e)}")

            # Need at least 3 metrics for reliable detection
            logger.success(f"Detection pipeline initialized: {successful_metrics}/{len(self.metrics)} metrics ready")
            return (successful_metrics >= 3)

        except Exception as e:
            logger.error(f"Failed to initialize detection pipeline: {repr(e)}")
            return False

    def analyze(self, text: str, domain: Optional[Domain] = None, **kwargs) -> DetectionResult:
        """
        Analyze text and detect if AI-generated

        Arguments:
        ----------
        text { str }      : Input text to analyze

        domain { Domain } : Override automatic domain detection

        **kwargs          : Additional options

        Returns:
        --------
        { DetectionResult } : DetectionResult with complete analysis
        """
        start_time = time.time()
        warnings   = list()
        errors     = list()

        try:
            # Preprocess text
            logger.info("Step 1: Preprocessing text...")
            processed_text = self.text_processor.process(text = text)

            if not processed_text.is_valid:
                logger.warning(f"Text validation failed: {processed_text.validation_errors}")
                warnings.extend(processed_text.validation_errors)
                # Continue anyway if text is present

            # Detect language
            language_result = None

            if self.language_detector:
                logger.info("Step 2: Detecting language...")

                try:
                    language_result = self.language_detector.detect(processed_text.cleaned_text)

                    if (language_result.primary_language.value != "en"):
                        warnings.append(f"Non-English text detected ({language_result.primary_language.value}). Detection accuracy may be reduced.")

                    if (language_result.is_multilingual):
                        warnings.append("Multilingual content detected")

                    if (language_result.confidence < 0.7):
                        warnings.append(f"Low language detection confidence ({language_result.confidence:.2f})")

                except Exception as e:
                    logger.warning(f"Language detection failed: {repr(e)}")
                    warnings.append("Language detection failed")

            # Classify domain
            logger.info("Step 3: Classifying domain...")
            if domain is None:
                try:
                    domain_prediction = self.domain_classifier.classify(processed_text.cleaned_text)
                    domain            = domain_prediction.primary_domain

                    if (domain_prediction.confidence < 0.5):
                        warnings.append(f"Low domain classification confidence ({domain_prediction.confidence:.2f})")

                except Exception as e:
                    logger.warning(f"Domain classification failed: {repr(e)}")
                    domain_prediction = DomainPrediction(primary_domain   = Domain.GENERAL,
                                                         secondary_domain = None,
                                                         confidence       = 0.5,
                                                         domain_scores    = {},
                                                         )
                    domain = Domain.GENERAL

                    warnings.append("Domain classification failed, using GENERAL")

            else:
                # Use provided domain
                domain_prediction = DomainPrediction(primary_domain   = domain,
                                                     secondary_domain = None,
                                                     confidence       = 1.0,
                                                     domain_scores    = {domain.value: 1.0},
                                                     )

            logger.info(f"Detected domain: {domain.value} (confidence: {domain_prediction.confidence:.2f})")

            # Execute metrics calculations
            logger.info("Step 4: Executing detection metrics calculations...")
            metric_results         = dict()
            metrics_execution_time = dict()

            for name, metric in self.metrics.items():
                metric_start = time.time()

                try:
                    # Check if we should skip expensive metrics
                    if (self.skip_expensive_metrics and (name == "multi_perturbation_stability")):
                        logger.info(f"Skipping expensive metric: {name}")
                        continue

                    logger.debug(f"Computing metric: {name}")

                    result = metric.compute(text           = processed_text.cleaned_text,
                                            domain         = domain,
                                            skip_expensive = self.skip_expensive_metrics,
                                            )

                    metric_results[name] = result

                    if result.error:
                        warnings.append(f"{name} metric error: {result.error}")

                except Exception as e:
                    logger.error(f"Error computing metric {name}: {repr(e)}")
                    errors.append(f"{name}: {repr(e)}")

                    # Create error result
                    metric_results[name] = MetricResult(metric_name       = name,
                                                        ai_probability    = 0.5,
                                                        human_probability = 0.5,
                                                        mixed_probability = 0.0,
                                                        confidence        = 0.0,
                                                        error             = repr(e),
                                                        )
                finally:
                    metrics_execution_time[name] = time.time() - metric_start

            logger.info(f"Executed {len(metric_results)} metrics successfully")

            # Ensemble aggregation
            logger.info("Step 5: Aggregating results with ensemble...")

            try:
                ensemble_result = self.ensemble.predict(metric_results = metric_results,
                                                        domain         = domain,
                                                        )

            except Exception as e:
                logger.error(f"Ensemble prediction failed: {repr(e)}")
                errors.append(f"Ensemble: {repr(e)}")

                # Create fallback result
                ensemble_result = EnsembleResult(final_verdict      = "Error",
                                                 ai_probability     = 0.5,
                                                 human_probability  = 0.5,
                                                 mixed_probability  = 0.0,
                                                 overall_confidence = 0.0,
                                                 domain             = domain,
                                                 metric_results     = metric_results,
                                                 metric_weights     = {},
                                                 weighted_scores    = {},
                                                 reasoning          = ["Ensemble aggregation failed"],
                                                 uncertainty_score  = 1.0,
                                                 consensus_level    = 0.0,
                                                 )

            # Calculate total processing time
            processing_time = time.time() - start_time

            logger.success(f"Analysis complete: {ensemble_result.final_verdict} "
                           f"(AI probability: {ensemble_result.ai_probability:.1%}, "
                           f"confidence: {ensemble_result.overall_confidence:.2f}) "
                           f"in {processing_time:.2f}s")

            return DetectionResult(ensemble_result        = ensemble_result,
                                   processed_text         = processed_text,
                                   domain_prediction      = domain_prediction,
                                   language_result        = language_result,
                                   metric_results         = metric_results,
                                   processing_time        = processing_time,
                                   metrics_execution_time = metrics_execution_time,
                                   warnings               = warnings,
                                   errors                 = errors,
                                   )

        except Exception as e:
            logger.error(f"Fatal error in detection pipeline: {repr(e)}")
            processing_time = time.time() - start_time

            # Return error result
            return DetectionResult(ensemble_result        = EnsembleResult(final_verdict      = "Error",
                                                                           ai_probability     = 0.5,
                                                                           human_probability  = 0.5,
                                                                           mixed_probability  = 0.0,
                                                                           overall_confidence = 0.0,
                                                                           domain             = Domain.GENERAL,
                                                                           metric_results     = {},
                                                                           metric_weights     = {},
                                                                           weighted_scores    = {},
                                                                           reasoning          = [f"Fatal error: {str(e)}"],
                                                                           uncertainty_score  = 1.0,
                                                                           consensus_level    = 0.0,
                                                                           ),
                                   processed_text         = ProcessedText(original_text       = text,
                                                                          cleaned_text        = "",
                                                                          sentences           = [],
                                                                          words               = [],
                                                                          paragraphs          = [],
                                                                          char_count          = 0,
                                                                          word_count          = 0,
                                                                          sentence_count      = 0,
                                                                          paragraph_count     = 0,
                                                                          avg_sentence_length = 0.0,
                                                                          avg_word_length     = 0.0,
                                                                          is_valid            = False,
                                                                          validation_errors   = ["Processing failed"],
                                                                          metadata            = {},
                                                                          ),
                                   domain_prediction      = DomainPrediction(primary_domain   = Domain.GENERAL,
                                                                             secondary_domain = None,
                                                                             confidence       = 0.0,
                                                                             domain_scores    = {},
                                                                             ),
                                   language_result        = None,
                                   metric_results         = {},
                                   processing_time        = processing_time,
                                   metrics_execution_time = {},
                                   warnings               = [],
                                   errors                 = [f"Fatal error: {repr(e)}"],
                                   )

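Note that `analyze()` is designed never to raise: every failure path returns a `DetectionResult` whose verdict is "Error", so callers can branch on the result instead of wrapping calls in try/except. Continuing the sketch above:

```python
# Failures surface in the result object, not as exceptions (illustrative input).
result = orchestrator.analyze("")                # invalid/empty input
if result.ensemble_result.final_verdict == "Error" or result.errors:
    print("analysis failed:", result.errors)
```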
    def batch_analyze(self, texts: List[str], domain: Optional[Domain] = None) -> List[DetectionResult]:
        """
        Analyze multiple texts

        Arguments:
        ----------
        texts { list }    : List of texts to analyze

        domain { Domain } : Override automatic domain detection

        Returns:
        --------
        { list } : List of DetectionResult objects
        """
        logger.info(f"Batch analyzing {len(texts)} texts...")

        results = list()

        for i, text in enumerate(texts):
            logger.info(f"Analyzing text {i+1}/{len(texts)}...")
            try:
                result = self.analyze(text   = text,
                                      domain = domain,
                                      )

                results.append(result)

            except Exception as e:
                logger.error(f"Error analyzing text {i+1}: {repr(e)}")
                # Create error result for this text
                error_result = DetectionResult(ensemble_result        = EnsembleResult(final_verdict      = "Error",
                                                                                       ai_probability     = 0.5,
                                                                                       human_probability  = 0.5,
                                                                                       mixed_probability  = 0.0,
                                                                                       overall_confidence = 0.0,
                                                                                       domain             = Domain.GENERAL,
                                                                                       metric_results     = {},
                                                                                       metric_weights     = {},
                                                                                       weighted_scores    = {},
                                                                                       reasoning          = [f"Analysis failed: {str(e)}"],
                                                                                       uncertainty_score  = 1.0,
                                                                                       consensus_level    = 0.0,
                                                                                       ),
                                               processed_text         = ProcessedText(original_text       = text,
                                                                                      cleaned_text        = "",
                                                                                      sentences           = [],
                                                                                      words               = [],
                                                                                      paragraphs          = [],
                                                                                      char_count          = 0,
                                                                                      word_count          = 0,
                                                                                      sentence_count      = 0,
                                                                                      paragraph_count     = 0,
                                                                                      avg_sentence_length = 0.0,
                                                                                      avg_word_length     = 0.0,
                                                                                      is_valid            = False,
                                                                                      validation_errors   = ["Processing failed"],
                                                                                      metadata            = {},
                                                                                      ),
                                               domain_prediction      = DomainPrediction(primary_domain   = Domain.GENERAL,
                                                                                         secondary_domain = None,
                                                                                         confidence       = 0.0,
                                                                                         domain_scores    = {},
                                                                                         ),
                                               language_result        = None,
                                               metric_results         = {},
                                               processing_time        = 0.0,
                                               metrics_execution_time = {},
                                               warnings               = [],
                                               errors                 = [f"Analysis failed: {repr(e)}"],
                                               )
                results.append(error_result)

        logger.info(f"Batch analysis complete: {len(results)}/{len(texts)} processed")
        return results

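Batch usage, continuing the same orchestrator sketch (inputs are illustrative):

```python
# Each input yields one DetectionResult, in order; per-text failures become
# "Error" results rather than aborting the batch.
texts   = ["First document...", "Second document..."]
results = orchestrator.batch_analyze(texts)
for r in results:
    print(r.ensemble_result.final_verdict, f"{r.processing_time:.2f}s")
```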
    def cleanup(self):
        """
        Clean up resources
        """
        logger.info("Cleaning up detection orchestrator...")

        for name, metric in self.metrics.items():
            try:
                metric.cleanup()
                logger.debug(f"Cleaned up metric: {name}")

            except Exception as e:
                logger.warning(f"Error cleaning up metric {name}: {repr(e)}")

        if self.domain_classifier:
            try:
                self.domain_classifier.cleanup()
                logger.debug("Cleaned up domain classifier")

            except Exception as e:
                logger.warning(f"Error cleaning up domain classifier: {repr(e)}")

        if self.language_detector:
            try:
                self.language_detector.cleanup()
                logger.debug("Cleaned up language detector")

            except Exception as e:
                logger.warning(f"Error cleaning up language detector: {repr(e)}")

        logger.info("Cleanup complete")


# Export
__all__ = ["DetectionResult",
           "DetectionOrchestrator",
           ]

docs/API_DOCUMENTATION.md
ADDED
@@ -0,0 +1,705 @@
# TEXT-AUTH API Documentation

## Overview

The TEXT-AUTH API provides evidence-based text forensics and statistical consistency assessment through a RESTful interface. This document covers all endpoints, request/response formats, authentication, rate limiting, and integration examples.

**API Version:** 1.0.0

---

## Table of Contents

1. [Authentication & Security](#authentication--security)
2. [Rate Limiting](#rate-limiting)
3. [Common Response Format](#common-response-format)
4. [Error Handling](#error-handling)
5. [Core Endpoints](#core-endpoints)
   - [Text Analysis](#text-analysis)
   - [File Analysis](#file-analysis)
   - [Batch Analysis](#batch-analysis)
6. [Report Endpoints](#report-endpoints)
7. [Utility Endpoints](#utility-endpoints)
8. [Best Practices](#best-practices)

---

## Authentication & Security

### API Key Authentication

*Authentication is not enforced in the current deployment. API key authentication may be added in future versions.*

---

## Rate Limiting

*Rate limiting is not enforced at the application level. Deployments should use an external gateway (NGINX, API Gateway, Cloudflare) to enforce rate limits if required.*

---
## Common Response Format

All successful responses follow this structure:

```json
{
  "status": "success",
  "analysis_id": "...",
  "detection_result": {...},
  "highlighted_html": "...",
  "reasoning": {...},
  "processing_time": 2.34,
  "timestamp": "..."
}
```

### HTTP Status Codes

| Code | Meaning | Description |
|------|---------|-------------|
| 200 | OK | Request succeeded |
| 201 | Created | Resource created successfully |
| 400 | Bad Request | Invalid request parameters |
| 404 | Not Found | Resource not found |
| 500 | Internal Server Error | Server error |
| 503 | Service Unavailable | Service temporarily unavailable |

---

## Error Handling

### Error Response Format

```json
{
  "status": "error",
  "error": "Invalid domain...",
  "timestamp": "..."
}
```

### Common Error Codes

| Code | Description | Resolution |
|------|-------------|------------|
| `TEXT_TOO_LONG` | Text exceeds maximum length (50,000 chars) | Split into multiple requests |
| `FILE_TOO_LARGE` | File exceeds size limit | Compress or split file |
| `UNSUPPORTED_FORMAT` | File format not supported | Use .txt, .pdf, .docx, .doc, or .md |
| `EXTRACTION_FAILED` | Document text extraction failed | Ensure file is not corrupted or password-protected |
| `MODEL_UNAVAILABLE` | Required model temporarily unavailable | Retry after a few minutes |
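
Both the HTTP status and the envelope `status` field should be checked in clients. Below is a minimal Python sketch of that pattern; the base URL is an assumption about your deployment, not a fixed endpoint.

```python
import requests

BASE_URL = "http://localhost:8000"  # assumption: adjust to your deployment

def call_api(path: str, json_body: dict) -> dict:
    """POST to the API and raise on both HTTP-level and envelope-level errors."""
    resp = requests.post(f"{BASE_URL}{path}", json=json_body, timeout=60)
    resp.raise_for_status()                 # 400 / 404 / 500 / 503 from the table above
    payload = resp.json()
    if payload.get("status") != "success":  # envelope-level error (see format above)
        raise RuntimeError(f"API error: {payload.get('error')}")
    return payload
```

---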

## Core Endpoints

### Text Analysis

**Endpoint:** `POST /api/analyze`

Analyze raw text for statistical consistency patterns and forensic signals.

#### Request

**Headers:**
```http
Content-Type: application/json
```

**Body:**
```json
{
  "text": "Your text content here...",
  "domain": "academic",
  "enable_highlighting": true,
  "skip_expensive_metrics": false,
  "use_sentence_level": true,
  "include_metrics_summary": true,
  "generate_report": false
}
```

**Parameters:**

| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `text` | string | **Yes** | - | Text to analyze (50-50,000 chars) |
| `domain` | string | No | `null` (auto-detect) | Content domain (see [Domains](#supported-domains)) |
| `enable_highlighting` | boolean | No | `true` | Generate sentence-level highlights |
| `skip_expensive_metrics` | boolean | No | `false` | Skip computationally expensive metrics for faster results |
| `use_sentence_level` | boolean | No | `true` | Use sentence-level granularity for highlighting |
| `include_metrics_summary` | boolean | No | `true` | Include metric summaries in highlights |
| `generate_report` | boolean | No | `false` | Generate downloadable PDF/JSON report |

#### Response

```json
{
  "status": "success",
  "analysis_id": "analysis_1735555800000",
  "detection_result": {
    "ensemble_result": {
      "final_verdict": "Synthetic",
      "overall_confidence": 0.89,
      "synthetic_probability": 0.92,
      "authentic_probability": 0.08,
      "uncertainty_score": 0.23,
      "decision_boundary_distance": 0.42
    },
    "metric_results": {
      "perplexity": {
        "synthetic_probability": 0.94,
        "confidence": 0.91,
        "raw_score": 15.23,
        "evidence_strength": "strong"
      },
      "entropy": {
        "synthetic_probability": 0.88,
        "confidence": 0.85,
        "raw_score": 4.67,
        "evidence_strength": "moderate"
      },
      "structural": {
        "synthetic_probability": 0.91,
        "confidence": 0.87,
        "burstiness": -0.12,
        "uniformity": 0.85,
        "evidence_strength": "strong"
      },
      "linguistic": {
        "synthetic_probability": 0.86,
        "confidence": 0.82,
        "pos_diversity": 0.42,
        "mean_tree_depth": 4.2,
        "evidence_strength": "moderate"
      },
      "semantic": {
        "synthetic_probability": 0.93,
        "confidence": 0.88,
        "coherence_mean": 0.91,
        "coherence_variance": 0.03,
        "evidence_strength": "strong"
      },
      "multi_perturbation_stability": {
        "synthetic_probability": 0.89,
        "confidence": 0.84,
        "stability_score": 0.12,
        "evidence_strength": "moderate"
      }
    },
    "domain_prediction": {
      "primary_domain": "academic",
      "confidence": 0.94,
      "alternative_domains": [
        {"domain": "technical_doc", "probability": 0.23},
        {"domain": "science", "probability": 0.18}
      ]
    },
    "processed_text": {
      "word_count": 487,
      "sentence_count": 23,
      "paragraph_count": 5,
      "avg_sentence_length": 21.2,
      "language": "en"
    }
  },
  "highlighted_html": "<div class=\"text-forensics-highlight\">...</div>",
  "reasoning": {
    "summary": "The text exhibits strong statistical consistency patterns typical of language model generation...",
    "key_indicators": [
      "Unusually uniform sentence structure (burstiness: -0.12)",
      "High semantic coherence across all sentences (mean: 0.91)",
      "Low perplexity variance indicating predictable token sequences"
    ],
    "confidence_factors": {
      "supporting_evidence": [
        "6/6 metrics indicate synthetic patterns",
        "Strong cross-metric agreement (correlation: 0.87)"
      ],
      "uncertainty_sources": [
        "Domain-specific terminology may affect baseline expectations"
      ]
    },
    "metric_contributions": {
      "perplexity": 0.28,
      "entropy": 0.19,
      "structural": 0.16,
      "semantic": 0.17,
      "linguistic": 0.12,
      "multi_perturbation_stability": 0.08
    }
  },
  "report_files": null,
  "processing_time": 2.34,
  "timestamp": "2025-12-30T10:30:00Z"
}
```
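
A minimal Python call for this endpoint, reusing the `call_api` helper sketched under Error Handling; the field accesses mirror the response structure above.

```python
result = call_api("/api/analyze", {
    "text": "Your text content here... " * 10,  # must be 50-50,000 characters
    "domain": "academic",                       # omit to let the service auto-detect
    "enable_highlighting": True,
})

ensemble = result["detection_result"]["ensemble_result"]
print(ensemble["final_verdict"], ensemble["overall_confidence"])
print("Keep for report regeneration:", result["analysis_id"])
```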

#### Verdict Interpretation

| Verdict | Probability Range | Interpretation |
|---------|-------------------|----------------|
| **Synthetic** | > 0.70 | High consistency with language model generation patterns |
| **Likely Synthetic** | 0.55 - 0.70 | Moderate consistency with synthetic patterns |
| **Inconclusive** | 0.45 - 0.55 | Insufficient evidence for confident assessment |
| **Likely Authentic** | 0.30 - 0.45 | Moderate consistency with human authorship patterns |
| **Authentic** | < 0.30 | High consistency with human authorship patterns |

**Important:** These verdicts represent statistical consistency assessments, not definitive authorship claims.

#### Highlighting Color Key

| Color | Meaning | Probability Range |
|-------|---------|-------------------|
| 🔴 Red | Strong synthetic signals | > 0.80 |
| 🟠 Orange | Moderate synthetic signals | 0.60 - 0.80 |
| 🟡 Yellow | Weak signals | 0.40 - 0.60 |
| 🟢 Green | Authentic signals | < 0.40 |
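
If you need the verdict bands client-side (for example, to re-label stored probabilities), the table translates directly into a small helper. This is a convenience sketch mirroring the documentation above, not an official SDK function; the API already returns `final_verdict`.

```python
def interpret(synthetic_probability: float) -> str:
    """Map synthetic_probability to the verdict bands documented above."""
    if synthetic_probability > 0.70:
        return "Synthetic"
    if synthetic_probability >= 0.55:
        return "Likely Synthetic"
    if synthetic_probability >= 0.45:
        return "Inconclusive"
    if synthetic_probability >= 0.30:
        return "Likely Authentic"
    return "Authentic"

assert interpret(0.92) == "Synthetic"
assert interpret(0.50) == "Inconclusive"
```

---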

### File Analysis

**Endpoint:** `POST /api/analyze/file`

Analyze uploaded documents (PDF, DOCX, DOC, TXT, MD).

#### Request

**Headers:**
```http
Content-Type: multipart/form-data
```

**Body (form-data):**
```
file: [binary file data]
domain: "academic"
skip_expensive_metrics: false
use_sentence_level: true
include_metrics_summary: true
generate_report: false
```

**Parameters:**

| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `file` | file | **Yes** | - | Document file (max 25MB) |
| `domain` | string | No | `null` | Content domain override |
| `skip_expensive_metrics` | boolean | No | `false` | Skip expensive metrics |
| `use_sentence_level` | boolean | No | `true` | Sentence-level highlighting |
| `include_metrics_summary` | boolean | No | `true` | Include metric summaries |
| `generate_report` | boolean | No | `false` | Generate report |

#### Supported File Formats

| Format | Extensions | Max Size | Notes |
|--------|-----------|----------|-------|
| Plain Text | .txt, .md | 25MB | UTF-8 encoding recommended |
| PDF | .pdf | 25MB | Text-based PDFs; OCR not supported |
| Word | .docx, .doc | 25MB | Modern and legacy formats |

#### Response

Same structure as [Text Analysis](#text-analysis) with additional `file_info`:

```json
{
  "status": "success",
  "analysis_id": "file_1735555800000",
  "file_info": {
    "filename": "research_paper.pdf",
    "file_type": ".pdf",
    "pages": 12,
    "extraction_method": "pdfplumber",
    "highlighted_html": true
  },
  "detection_result": { /* same as text analysis */ },
  "highlighted_html": "...",
  "reasoning": { /* same as text analysis */ },
  "processing_time": 4.12,
  "timestamp": "2025-12-30T10:30:00Z"
}
```

#### cURL Example

```bash
curl -X POST https://your-domain.com/api/analyze/file \
  -F "file=@/path/to/document.pdf" \
  -F "domain=academic" \
  -F "generate_report=true"
```

---

### Batch Analysis

**Endpoint:** `POST /api/analyze/batch`

Analyze multiple texts in a single request for efficiency.

#### Request

```json
{
  "texts": [
    "First text to analyze...",
    "Second text to analyze...",
    "Third text to analyze..."
  ],
  "domain": "academic",
  "skip_expensive_metrics": true,
  "generate_reports": false
}
```

**Parameters:**

| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `texts` | array[string] | **Yes** | - | 1-100 texts to analyze |
| `domain` | string | No | `null` | Apply same domain to all texts |
| `skip_expensive_metrics` | boolean | No | `true` | Skip expensive metrics (recommended for batch) |
| `generate_reports` | boolean | No | `false` | Generate reports for each text |

#### Response

```json
{
  "status": "success",
  "batch_id": "batch_1735555800000",
  "total": 3,
  "successful": 2,
  "failed": 1,
  "results": [
    {
      "index": 0,
      "status": "success",
      "detection": {
        "ensemble_result": { /* ... */ },
        "metric_results": { /* ... */ }
      },
      "reasoning": { /* ... */ },
      "report_files": null
    },
    {
      "index": 1,
      "status": "success",
      "detection": { /* ... */ }
    },
    {
      "index": 2,
      "status": "error",
      "error": "Text too short (minimum 50 characters)"
    }
  ],
  "processing_time": 8.92,
  "timestamp": "2025-12-30T10:30:00Z"
}
```

#### Performance Tips

- Set `skip_expensive_metrics: true` for faster batch processing
- Keep batch size under 50 texts for optimal performance (see the sketch below)
- Consider parallel API calls for batches > 100 texts
- Monitor `processing_time` to adjust batch sizes
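
A minimal client sketch that applies the sub-batching tip above; the base URL and chunk size are assumptions for illustration.

```python
import requests

BASE_URL = "http://localhost:8000"  # assumption: adjust to your deployment
CHUNK = 50                          # per the performance tips above

def analyze_batch(texts: list[str], domain: str | None = None) -> list[dict]:
    """Send texts in sub-batches of CHUNK and collect per-text results in order."""
    results: list[dict] = []
    for start in range(0, len(texts), CHUNK):
        body = {
            "texts": texts[start:start + CHUNK],
            "domain": domain,
            "skip_expensive_metrics": True,  # recommended for batch
        }
        resp = requests.post(f"{BASE_URL}/api/analyze/batch", json=body, timeout=300)
        resp.raise_for_status()
        results.extend(resp.json()["results"])
    return results
```

---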

## Report Endpoints

### Generate Report

**Endpoint:** `POST /api/report/generate`

Generate detailed PDF/JSON reports for cached analyses.

#### Request

**Headers:**
```http
Content-Type: application/x-www-form-urlencoded
```

**Body:**
```
analysis_id=analysis_1735555800000
formats=json,pdf
include_highlights=true
```

**Parameters:**

| Parameter | Type | Required | Default | Description |
|-----------|------|----------|---------|-------------|
| `analysis_id` | string | **Yes** | - | Analysis ID from previous request |
| `formats` | string | No | `"json,pdf"` | Comma-separated formats |
| `include_highlights` | boolean | No | `true` | Include sentence highlights in report |

#### Response

```json
{
  "status": "success",
  "analysis_id": "analysis_1735555800000",
  "reports": {
    "json": "analysis_1735555800000.json",
    "pdf": "analysis_1735555800000.pdf"
  },
  "timestamp": "2025-12-30T10:30:00Z"
}
```

### Download Report

**Endpoint:** `GET /api/report/download/{filename}`

Download a generated report file.

#### Request

```http
GET /api/report/download/analysis_1735555800000.pdf
```

#### Response

Binary file download with appropriate `Content-Type` header.

**Headers:**
```http
Content-Type: application/pdf
Content-Disposition: attachment; filename="analysis_1735555800000.pdf"
Content-Length: 524288
```
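
The two endpoints chain naturally: generate first, then download by the returned filename. A minimal sketch, again assuming a local deployment URL:

```python
import requests

BASE_URL = "http://localhost:8000"  # assumption: adjust to your deployment

def fetch_pdf_report(analysis_id: str, out_path: str) -> None:
    """Generate a PDF report for a cached analysis, then download it to disk."""
    gen = requests.post(
        f"{BASE_URL}/api/report/generate",
        data={"analysis_id": analysis_id, "formats": "pdf", "include_highlights": "true"},
    )
    gen.raise_for_status()
    filename = gen.json()["reports"]["pdf"]

    dl = requests.get(f"{BASE_URL}/api/report/download/{filename}", stream=True)
    dl.raise_for_status()
    with open(out_path, "wb") as f:
        for chunk in dl.iter_content(chunk_size=8192):
            f.write(chunk)

fetch_pdf_report("analysis_1735555800000", "report.pdf")
```

---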

## Utility Endpoints

### Health Check

**Endpoint:** `GET /health`

Check API health and model availability.

#### Response

```json
{
  "status": "healthy",
  "version": "1.0.0",
  "uptime": 86400.5,
  "models_loaded": {
    "orchestrator": true,
    "highlighter": true,
    "reporter": true,
    "reasoning_generator": true,
    "document_extractor": true,
    "analysis_cache": true,
    "parallel_executor": true
  }
}
```

### List Domains

**Endpoint:** `GET /api/domains`

Get all supported content domains with descriptions.

#### Response

```json
{
  "domains": [
    {
      "value": "general",
      "name": "General",
      "description": "General-purpose text without domain-specific structure"
    },
    {
      "value": "academic",
      "name": "Academic",
      "description": "Academic papers, essays, research"
    },
    {
      "value": "creative",
      "name": "Creative",
      "description": "Creative writing, fiction, poetry"
    },
    {
      "value": "technical_doc",
      "name": "Technical Doc",
      "description": "Technical documentation, manuals, specs"
    }
    // ... 12 more domains
  ]
}
```

### Supported Domains

| Domain | Use Cases | Threshold Adjustments |
|--------|-----------|-----------------------|
| `general` | Default fallback | Balanced weights |
| `academic` | Research papers, essays | Higher linguistic weight |
| `creative` | Fiction, poetry | Higher entropy/structural |
| `ai_ml` | ML papers, technical AI content | Semantic prioritized |
| `software_dev` | Code docs, READMEs | Structural relaxed |
| `technical_doc` | Manuals, specs | Higher semantic weight |
| `engineering` | Technical reports | Balanced technical focus |
| `science` | Scientific papers | Academic-like calibration |
| `business` | Reports, proposals | Formal structure emphasis |
| `legal` | Contracts, court filings | Strict structural patterns |
| `medical` | Clinical notes, research | Domain-specific terminology |
| `journalism` | News articles | Balanced, lower burstiness |
| `marketing` | Ad copy, campaigns | Creative elements |
| `social_media` | Posts, casual writing | Relaxed metrics, high perplexity weight |
| `blog_personal` | Personal blogs, diaries | Creative + casual mix |
| `tutorial` | How-to guides | Instructional patterns |

### Cache Statistics

**Endpoint:** `GET /api/cache/stats`

Get analysis cache statistics (intended for administrative use).

#### Response

```json
{
  "cache_size": 42,
  "max_size": 100,
  "ttl_seconds": 3600
}
```

### Clear Cache

**Endpoint:** `POST /api/cache/clear`

Clear the analysis cache (intended for administrative use).

#### Response

```json
{
  "status": "success",
  "message": "Cache cleared"
}
```
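
The `/health` endpoint above is useful for startup orchestration, since models load lazily. A minimal readiness-polling sketch (base URL and polling interval are assumptions):

```python
import time
import requests

BASE_URL = "http://localhost:8000"  # assumption: adjust to your deployment

def wait_until_ready(timeout_s: float = 120.0) -> bool:
    """Poll /health until the service reports healthy and all components are loaded."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            health = requests.get(f"{BASE_URL}/health", timeout=5).json()
            if health.get("status") == "healthy" and all(
                health.get("models_loaded", {}).values()
            ):
                return True
        except requests.RequestException:
            pass        # service may still be starting up
        time.sleep(2)
    return False
```

---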

## Best Practices

### Optimization Tips

1. **Domain Selection**
   - Always specify the domain when known for better accuracy
   - Use `/api/domains` to explore available options
   - Let the system auto-detect only when the domain is truly unknown

2. **Performance**
   - Set `skip_expensive_metrics: true` for faster results when speed matters
   - Use the batch API for multiple texts instead of sequential single requests
   - Cache `analysis_id` to regenerate reports without reanalysis

3. **Accuracy**
   - Provide clean, well-formatted text (remove excessive whitespace)
   - Minimum 100 words recommended for reliable results
   - Avoid mixing languages in a single analysis

4. **Rate Limiting** (applies when an external gateway enforces limits; see [Rate Limiting](#rate-limiting))
   - Implement exponential backoff on 429 responses
   - Monitor any rate-limit headers your gateway exposes (e.g., `X-RateLimit-Remaining`)
   - Adjust gateway limits if legitimate traffic consistently hits them

5. **Error Handling**
   - Always check the `status` field in the response
   - Log the `analysis_id` for support requests
   - Implement retry logic with jitter for transient errors (see the sketch after this section)

### Security Recommendations

1. **API Key Management** (once API key authentication is introduced)
   - Rotate keys every 90 days
   - Use separate keys for dev/staging/production
   - Revoke compromised keys immediately

2. **Data Privacy**
   - Never send PII unless absolutely necessary
   - Use client-side redaction before API calls
   - Apply data retention policies in your deployment

3. **Input Validation**
   - Sanitize user input before sending to the API
   - Validate file types client-side
   - Implement size limits before upload
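
A minimal sketch of retry with exponential backoff and full jitter, as recommended above. The status codes treated as transient are assumptions; 429 only applies if a gateway in front of the API enforces rate limits.

```python
import random
import time

import requests

def post_with_retry(url: str, json_body: dict, max_attempts: int = 5) -> requests.Response:
    """Retry transient failures (connection errors, 429/503) with backoff + jitter."""
    for attempt in range(max_attempts):
        try:
            resp = requests.post(url, json=json_body, timeout=60)
            if resp.status_code not in (429, 503):  # treat everything else as final
                return resp
        except requests.ConnectionError:
            pass
        # Full jitter: sleep a random amount up to the exponential cap.
        time.sleep(random.uniform(0, min(30, 2 ** attempt)))
    raise RuntimeError(f"Gave up after {max_attempts} attempts: {url}")
```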

---

## Version History

- **1.0.0** (2025-12-30): Initial release
  - 6 forensic metrics
  - Support for 16 domains
  - PDF/JSON reporting
  - Batch processing

---

## Appendix

### Complete Domain List with Aliases

```python
DOMAIN_ALIASES = {
    'general': ['default', 'generic'],
    'academic': ['education', 'research', 'scholarly', 'university'],
    'creative': ['fiction', 'literature', 'story', 'narrative'],
    'ai_ml': ['ai', 'ml', 'machinelearning', 'neural'],
    'software_dev': ['software', 'code', 'programming', 'dev'],
    'technical_doc': ['technical', 'tech', 'documentation', 'manual'],
    'engineering': ['engineer'],
    'science': ['scientific'],
    'business': ['corporate', 'commercial', 'enterprise'],
    'legal': ['law', 'contract', 'court'],
    'medical': ['healthcare', 'clinical', 'medicine', 'health'],
    'journalism': ['news', 'reporting', 'media', 'press'],
    'marketing': ['advertising', 'promotional', 'brand', 'sales'],
    'social_media': ['social', 'casual', 'informal', 'posts'],
    'blog_personal': ['blog', 'personal', 'diary', 'lifestyle'],
    'tutorial': ['guide', 'howto', 'instructional', 'walkthrough']
}
```
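
A hypothetical client-side helper (not part of the API) showing how the alias table above can normalize user-supplied labels before a request:

```python
def resolve_domain(label: str) -> str | None:
    """Resolve a user-supplied label to a canonical domain via DOMAIN_ALIASES."""
    label = label.strip().lower()
    if label in DOMAIN_ALIASES:
        return label                       # already canonical
    for domain, aliases in DOMAIN_ALIASES.items():
        if label in aliases:
            return domain
    return None                            # fall back to server-side auto-detection

assert resolve_domain("ML") == "ai_ml"
assert resolve_domain("news") == "journalism"
```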

### Metric Weight Defaults

```python
DEFAULT_WEIGHTS = {
    'perplexity': 0.25,
    'entropy': 0.20,
    'structural': 0.15,
    'semantic': 0.15,
    'linguistic': 0.15,
    'multi_perturbation_stability': 0.10
}
```

### Response Time Estimates

| Operation | Min | Avg | Max | P95 |
|-----------|-----|-----|-----|-----|
| Text Analysis (500 words) | 1.2s | 2.3s | 4.5s | 3.8s |
| File Analysis (PDF, 10 pages) | 2.5s | 4.1s | 8.2s | 6.9s |
| Batch (10 texts) | 5.8s | 9.2s | 15.3s | 13.1s |
| Report Generation | 0.3s | 0.8s | 2.1s | 1.5s |

---

*Last Updated: December 30, 2025*
*API Version: 1.0.0*
*Documentation Version: 1.0.0*
docs/ARCHITECTURE.md
ADDED
@@ -0,0 +1,821 @@
# TEXT-AUTH: System Architecture Documentation

> TEXT-AUTH is an evidence-first, domain-aware AI text detection system
> designed around independent signals, calibrated aggregation, and
> explainability rather than black-box classification.

---

## Table of Contents

1. [System Overview](#system-overview)
2. [High-Level Architecture](#high-level-architecture)
3. [Layer-by-Layer Architecture](#layer-by-layer-architecture)
4. [Data Flow](#data-flow)
5. [Technology Stack](#technology-stack)

---

## System Overview

**TEXT-AUTH** is a sophisticated AI text detection system that employs multiple machine learning metrics and ensemble methods to determine whether text is synthetically generated, authentically written, or hybrid content.

### Key Capabilities
- **Multi-Metric Analysis**: 6 independent detection metrics (Structural, Perplexity, Entropy, Semantic, Linguistic, Multi-Perturbation Stability)
- **Domain-Aware Calibration**: Adaptive thresholds for 16 text domains (Academic, Creative, Technical, etc.)
- **Ensemble Aggregation**: Confidence-weighted combination with uncertainty quantification
- **Sentence-Level Highlighting**: Visual feedback with probability scores
- **Comprehensive Reporting**: JSON and PDF reports with detailed analysis

### Design Principles
- **Modular Architecture**: Clean separation of concerns across layers
- **Fail-Safe Design**: Graceful degradation with fallback strategies
- **Parallel Processing**: Multi-threaded metric execution for performance
- **Domain Expertise**: Specialized thresholds calibrated per content type

## Why Multi-Metric Instead of a Single Classifier?

- Single classifiers overfit stylistic artifacts
- LLMs rapidly adapt to detectors
- Independent statistical signals decay slower
- Ensemble disagreement is itself evidence

---

## High-Level Architecture

```mermaid
graph TB
    subgraph "Presentation Layer"
        UI[Web Interface/API]
    end

    subgraph "Application Layer"
        ORCH[Detection Orchestrator]
        ORCH --> |coordinates| PIPE[Processing Pipeline]
    end

    subgraph "Service Layer"
        ENSEMBLE[Ensemble Classifier]
        HIGHLIGHT[Text Highlighter]
        REASON[Reasoning Generator]
        REPORT[Report Generator]
    end

    subgraph "Processing Layer"
        EXTRACT[Document Extractor]
        TEXTPROC[Text Processor]
        DOMAIN[Domain Classifier]
        LANG[Language Detector]
    end

    subgraph "Metrics Layer"
        STRUCT[Structural Metric]
        PERP[Perplexity Metric]
        ENT[Entropy Metric]
        SEM[Semantic Metric]
        LING[Linguistic Metric]
        MPS[Multi-Perturbation Stability]
    end

    subgraph "Model Layer"
        MANAGER[Model Manager]
        REGISTRY[Model Registry]
        CACHE[(Model Cache)]
    end

    subgraph "Configuration Layer"
        CONFIG[Settings]
        ENUMS[Enums]
        SCHEMAS[Data Schemas]
        CONSTANTS[Constants]
        THRESHOLDS[Domain Thresholds]
    end

    UI --> ORCH

    ORCH --> EXTRACT
    ORCH --> TEXTPROC
    ORCH --> DOMAIN
    ORCH --> LANG

    ORCH --> STRUCT
    ORCH --> PERP
    ORCH --> ENT
    ORCH --> SEM
    ORCH --> LING
    ORCH --> MPS

    ORCH --> ENSEMBLE
    ENSEMBLE --> HIGHLIGHT
    ENSEMBLE --> REASON
    ENSEMBLE --> REPORT

    STRUCT --> MANAGER
    PERP --> MANAGER
    ENT --> MANAGER
    SEM --> MANAGER
    LING --> MANAGER
    MPS --> MANAGER
    DOMAIN --> MANAGER
    LANG --> MANAGER

    MANAGER --> REGISTRY
    MANAGER --> CACHE

    ORCH --> CONFIG
    ENSEMBLE --> THRESHOLDS

    style UI fill:#e1f5ff
    style ORCH fill:#fff3e0
    style ENSEMBLE fill:#f3e5f5
    style MANAGER fill:#e8f5e9
    style CONFIG fill:#fce4ec
```

---

## Layer-by-Layer Architecture

### 1. Configuration Layer (`config/`)

The foundation layer providing enums, schemas, constants, and domain-specific thresholds.

```mermaid
graph LR
    subgraph "Configuration Layer"
        direction TB

        ENUMS["enums.py<br/>Domain, Language, Script,<br/>ModelType, ConfidenceLevel"]

        SCHEMAS["schemas.py<br/>ModelConfig, ProcessedText,<br/>MetricResult, EnsembleResult, DetectionResult"]

        CONSTANTS["constants.py<br/>TextProcessingParams, MetricParams,<br/>EnsembleParams"]

        THRESHOLDS["threshold_config.py<br/>DomainThresholds (16 domains),<br/>Domain Configs, MetricThresholds"]

        MODELCFG["model_config.py<br/>Model Registry, Model Groups, Default Weights"]

        SETTINGS["settings.py<br/>App Settings, Paths, Feature Flags"]
    end

    ENUMS -.->|used by| SCHEMAS
    ENUMS -.->|used by| THRESHOLDS
    SCHEMAS -.->|used by| CONSTANTS
    THRESHOLDS -.->|imports| ENUMS
    MODELCFG -.->|imports| ENUMS

    style ENUMS fill:#ffebee
    style SCHEMAS fill:#fff3e0
    style CONSTANTS fill:#e8f5e9
    style THRESHOLDS fill:#e1f5ff
    style MODELCFG fill:#f3e5f5
    style SETTINGS fill:#fce4ec
```

**Key Components:**
- **enums.py**: Core enumerations (Domain, Language, Script, ModelType, ConfidenceLevel)
- **schemas.py**: Data classes for structured data exchange
- **constants.py**: Frozen dataclasses with hyperparameters for each metric
- **threshold_config.py**: Domain-specific thresholds for 16 domains
- **model_config.py**: Model registry with download priorities and configurations
- **settings.py**: Application settings with Pydantic validation
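
To make the frozen-dataclass pattern concrete, here is a minimal sketch of what a per-domain calibration record could look like. The class and field names below are illustrative assumptions for exposition, not the actual contents of `config/threshold_config.py`.

```python
from dataclasses import dataclass, field

@dataclass(frozen=True)
class DomainThresholds:
    """Illustrative per-domain calibration record (names are assumptions)."""
    domain: str
    synthetic_cutoff: float = 0.60    # verdict boundary used downstream
    metric_weights: dict = field(default_factory=lambda: {
        "perplexity": 0.25, "entropy": 0.20, "structural": 0.15,
        "semantic": 0.15, "linguistic": 0.15,
        "multi_perturbation_stability": 0.10,
    })

# Frozen instances give each of the 16 domains an immutable configuration.
ACADEMIC = DomainThresholds(domain="academic")
```

---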

### 2. Model Abstraction Layer (`models/`)

Centralized model abstraction layer used by the metrics: it handles loading, caching, and unified access to the underlying models.

```mermaid
graph TB
    subgraph "Model Layer"
        direction TB

        MANAGER["Model Manager<br/>Singleton Pattern, Lazy Loading"]

        REGISTRY["Model Registry<br/>10 Model Configs, Priority Groups"]

        subgraph "Model Cache"
            direction LR
            GPT2["GPT-2<br/>548MB<br/>Perplexity/MPS"]
            MINILM["MiniLM-L6-v2<br/>80MB<br/>Semantic"]
            SPACY["spaCy sm<br/>13MB<br/>Linguistic"]
            ROBERTA["RoBERTa<br/>500MB<br/>Domain Classifier"]
            DISTIL["DistilRoBERTa<br/>330MB<br/>MPS Mask"]
            XLM["XLM-RoBERTa<br/>1100MB<br/>Language Detection"]
        end

        STATS["Usage Statistics<br/>Tracking, Performance Metrics"]
    end

    MANAGER -->|loads from| REGISTRY
    MANAGER -->|manages| GPT2
    MANAGER -->|manages| MINILM
    MANAGER -->|manages| SPACY
    MANAGER -->|manages| ROBERTA
    MANAGER -->|manages| DISTIL
    MANAGER -->|manages| XLM
    MANAGER -->|tracks| STATS

    REGISTRY -.->|defines| GPT2
    REGISTRY -.->|defines| MINILM
    REGISTRY -.->|defines| SPACY

    style MANAGER fill:#e3f2fd
    style REGISTRY fill:#f3e5f5
    style STATS fill:#fff3e0
```

**Key Features:**
- **Lazy Loading**: Models loaded on-demand
- **Caching Strategy**: LRU cache with max 5 models
- **Usage Tracking**: Statistics for optimization
- **Priority Groups**: Essential, Extended, Optional
- **Total Size**: ~2.8GB for all models
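
A minimal sketch of the singleton plus lazy-loading pattern described above. Class and method names are illustrative assumptions, not the actual `models/model_manager.py` API; a real implementation would also add the LRU eviction (max 5 models) noted in the key features.

```python
import threading

class ModelManager:
    """Process-wide singleton that loads models on first use and caches them."""
    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._cache = {}   # name -> loaded model
        return cls._instance

    def get(self, name: str, loader):
        """Return the cached model, invoking `loader()` only on the first request."""
        if name not in self._cache:
            self._cache[name] = loader()
        return self._cache[name]
```

---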

### 3. Processing Layer (`processors/`)

Handles document extraction, text preprocessing, domain classification, and language detection.

```mermaid
graph TB
    subgraph "Processing Layer"
        direction TB

        subgraph "Document Extraction"
            EXTRACT[Document Extractor]
            EXTRACT -->|PDF| PYPDF[PyMuPDF Primary]
            EXTRACT -->|PDF| PDFPLUMB[pdfplumber Fallback]
            EXTRACT -->|PDF| PYPDF2[PyPDF2 Fallback]
            EXTRACT -->|DOCX| DOCX[python-docx]
            EXTRACT -->|HTML| BS4[BeautifulSoup4]
            EXTRACT -->|RTF| RTF[Basic Parser]
            EXTRACT -->|TXT| TXT[Chardet Encoding]
        end

        subgraph "Text Processing"
            TEXTPROC[Text Processor]
            TEXTPROC --> CLEAN["Unicode Normalization<br/>URL/Email Removal<br/>Whitespace Cleaning"]
            TEXTPROC --> SPLIT["Smart Sentence Splitting<br/>Abbreviation Handling<br/>Word Tokenization"]
            TEXTPROC --> VALIDATE["Length Validation<br/>Quality Checks<br/>Statistics"]
        end

        subgraph "Domain Classification"
            DOMAIN[Domain Classifier]
            DOMAIN --> ZERO["Heuristic + optional model-assisted<br/>domain inference (RoBERTa/DeBERTa)"]
            DOMAIN --> LABELS["16 Domain Labels<br/>Multi-Label Candidates"]
            DOMAIN --> THRESH["Domain-Specific<br/>Threshold Selection"]
        end

        subgraph "Language Detection"
            LANG[Language Detector]
            LANG --> MODEL["XLM-RoBERTa<br/>Chunk-Based Analysis"]
            LANG --> FALLBACK[langdetect Library]
            LANG --> HEURISTIC["Script Detection<br/>Character Analysis"]
        end
    end

    EXTRACT -->|ProcessedText| TEXTPROC
    TEXTPROC -->|Cleaned Text| DOMAIN
    TEXTPROC -->|Cleaned Text| LANG

    style EXTRACT fill:#e8f5e9
    style TEXTPROC fill:#fff3e0
    style DOMAIN fill:#e1f5ff
    style LANG fill:#f3e5f5
```

**Processing Pipeline:**
1. **Document Extraction**: Multi-format support with fallback strategies
2. **Text Cleaning**: Unicode normalization, noise removal, validation
3. **Domain Classification**: Heuristic plus optional model-assisted classification with confidence scores
4. **Language Detection**: Multi-strategy approach with script analysis
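
A minimal sketch of the PDF fallback chain described in the diagram, using the three libraries named there; the function name and structure are assumptions, not the actual `processors/document_extractor.py` interface.

```python
import fitz                      # PyMuPDF (primary)
import pdfplumber                # fallback 1
from PyPDF2 import PdfReader     # fallback 2

def extract_pdf_text(path: str) -> str:
    """Try PyMuPDF first, then pdfplumber, then PyPDF2 (fail-safe extraction)."""
    try:
        with fitz.open(path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception:
        pass
    try:
        with pdfplumber.open(path) as pdf:
            return "\n".join((page.extract_text() or "") for page in pdf.pages)
    except Exception:
        pass
    return "\n".join((page.extract_text() or "") for page in PdfReader(path).pages)
```

---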

### 4. Metrics Layer (`metrics/`)

Six independent detection metrics analyzing different text characteristics.

```mermaid
graph TB
    subgraph "Metrics Layer"
        direction TB

        BASE["Base Metric<br/>Abstract Class<br/>Common Interface"]

        subgraph "Statistical Metrics"
            STRUCT["Structural Metric<br/>No ML Model<br/>Statistical Features"]
            STRUCT --> SF1["Sentence Length Distribution<br/>Burstiness Score<br/>Readability"]
            STRUCT --> SF2["N-gram Diversity<br/>Type-Token Ratio<br/>Repetition Patterns"]
        end

        subgraph "ML-Based Metrics"
            PERP["Perplexity Metric<br/>GPT-2 Model<br/>Text Predictability"]
            PERP --> PF1["Overall Perplexity<br/>Sentence-Level Perplexity<br/>Cross-Entropy"]
            PERP --> PF2["Chunk Analysis<br/>Variance Scoring<br/>Normalization"]

            ENT["Entropy Metric<br/>GPT-2 Tokenizer<br/>Randomness Analysis"]
            ENT --> EF1["Character Entropy<br/>Word Entropy<br/>Token Entropy"]
            ENT --> EF2["Token Diversity<br/>Sequence Unpredictability<br/>Pattern Detection"]

            SEM["Semantic Metric<br/>MiniLM Embeddings<br/>Coherence Analysis"]
            SEM --> SF3["Sentence Similarity<br/>Topic Consistency<br/>Coherence Score"]
            SEM --> SF4["Repetition Detection<br/>Topic Drift<br/>Contextual Consistency"]

            LING["Linguistic Metric<br/>spaCy NLP<br/>Grammar Analysis"]
            LING --> LF1["POS Diversity<br/>POS Entropy<br/>Syntactic Complexity"]
            LING --> LF2["Grammatical Patterns<br/>Writing Style<br/>Pattern Detection"]

            MPS["Multi-Perturbation<br/>GPT-2 + DistilRoBERTa<br/>Stability Analysis"]
            MPS --> MF1["Text Perturbation<br/>Likelihood Calculation<br/>Stability Score"]
            MPS --> MF2["Curvature Analysis<br/>Chunk Stability<br/>Variance Scoring"]
        end
    end

    BASE -.->|inherited by| STRUCT
    BASE -.->|inherited by| PERP
    BASE -.->|inherited by| ENT
    BASE -.->|inherited by| SEM
    BASE -.->|inherited by| LING
    BASE -.->|inherited by| MPS

    style BASE fill:#ffebee
    style STRUCT fill:#e8f5e9
    style PERP fill:#fff3e0
    style ENT fill:#e1f5ff
    style SEM fill:#f3e5f5
    style LING fill:#fce4ec
    style MPS fill:#fff9c4
```

**Metric Characteristics:**

| Metric | Model Required | Complexity | Typical Influence Range (Indicative) |
|--------|---------------|------------|--------------------------------------|
| Structural | ❌ | Low | 15-20% |
| Perplexity | GPT-2 | Medium | 20-27% |
| Entropy | GPT-2 Tokenizer | Medium | 13-17% |
| Semantic | MiniLM | Medium | 18-20% |
| Linguistic | spaCy | Medium | 12-16% |
| MPS | GPT-2 + DistilRoBERTa | High | 8-10% |

> *Actual weights are dynamically calibrated per domain and configuration.*
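
The common interface implied by the diagram can be sketched as an abstract base class. The names and the toy structural heuristic below are illustrative assumptions, not the actual `metrics/base_metric.py` implementation.

```python
from abc import ABC, abstractmethod
from dataclasses import dataclass

@dataclass
class MetricResult:
    name: str
    synthetic_probability: float   # 0.0 (authentic) .. 1.0 (synthetic)
    confidence: float              # how much the ensemble should trust this signal
    details: dict

class BaseMetric(ABC):
    name: str = "base"

    @abstractmethod
    def compute(self, text: str) -> MetricResult:
        """Analyze text and return a calibrated probability with confidence."""

class ToyStructuralMetric(BaseMetric):
    """Toy example: very uniform sentence lengths (low variance) lean synthetic."""
    name = "structural"

    def compute(self, text: str) -> MetricResult:
        sentences = [s for s in text.split(".") if s.strip()]
        lengths = [len(s.split()) for s in sentences] or [0]
        mean = sum(lengths) / len(lengths)
        variance = sum((l - mean) ** 2 for l in lengths) / len(lengths)
        prob = 0.8 if variance < 4 and len(lengths) > 3 else 0.4
        return MetricResult(self.name, prob, 0.5, {"mean_len": mean, "var": variance})
```

---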

### 5. Service Layer (`services/`)

Coordinates ensemble aggregation, highlighting, reasoning generation, and orchestration.

```mermaid
graph TB
    subgraph "Service Layer"
        direction TB

        subgraph "Orchestrator"
            ORCH["Detection Orchestrator<br/>Pipeline Coordinator"]
            ORCH --> PIPE["Processing Pipeline<br/>6-Step Execution"]
            PIPE --> STEP1[1. Text Preprocessing]
            PIPE --> STEP2[2. Language Detection]
            PIPE --> STEP3[3. Domain Classification]
            PIPE --> STEP4["4. Metric Execution<br/>Parallel/Sequential"]
            PIPE --> STEP5[5. Ensemble Aggregation]
            PIPE --> STEP6[6. Result Compilation]
        end

        subgraph "Ensemble Classifier"
            ENSEMBLE["Ensemble Classifier<br/>Multi-Strategy Aggregation"]
            ENSEMBLE --> METHOD1["Confidence Calibrated<br/>Sigmoid Weighting"]
            ENSEMBLE --> METHOD2["Consensus Based<br/>Agreement Rewards"]
            ENSEMBLE --> METHOD3["Domain Weighted<br/>Static Weights"]
            ENSEMBLE --> METHOD4["Simple Average<br/>Fallback"]
            ENSEMBLE --> CALC["Uncertainty Quantification<br/>Consensus Analysis<br/>Confidence Scoring"]
        end

        subgraph "Highlighter"
            HIGHLIGHT["Text Highlighter<br/>Sentence-Level Analysis"]
            HIGHLIGHT --> COLORS["4-Color System<br/>Authentic/Uncertain<br/>Hybrid/Synthetic"]
            HIGHLIGHT --> SENTENCE["Sentence Ensemble<br/>Domain Adjustments<br/>Tooltip Generation"]
        end

        subgraph "Reasoning"
            REASON["Reasoning Generator<br/>Explainable AI"]
            REASON --> SUMMARY["Executive Summary<br/>Verdict Explanation"]
            REASON --> INDICATORS["Key Indicators<br/>Metric Breakdown"]
            REASON --> EVIDENCE["Supporting Evidence<br/>Contradicting Evidence"]
            REASON --> RECOM["Recommendations<br/>Uncertainty Analysis"]
        end
    end

    ORCH -->|coordinates| ENSEMBLE
    ORCH -->|uses| HIGHLIGHT
    ORCH -->|uses| REASON
    ENSEMBLE -->|provides| HIGHLIGHT
    ENSEMBLE -->|provides| REASON

    style ORCH fill:#fff3e0
    style ENSEMBLE fill:#e3f2fd
    style HIGHLIGHT fill:#f3e5f5
    style REASON fill:#e8f5e9
```

**Service Features:**
- **Parallel Execution**: ThreadPoolExecutor for metric computation (see the sketch below)
- **Ensemble Methods**: 4 aggregation strategies with fallbacks
- **Sentence Highlighting**: 4-category color system (Authentic/Uncertain/Hybrid/Synthetic)
- **Explainable AI**: Detailed reasoning with metric contributions
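
A minimal sketch of parallel metric execution with `ThreadPoolExecutor`, matching the fail-safe behavior described above (one failed metric should not abort the run). The metric objects are assumed to follow the `BaseMetric` sketch earlier; this is not the actual orchestrator code.

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_metrics_parallel(metrics, text: str, timeout_s: float = 60.0) -> dict:
    """Run each metric in its own worker thread; failed metrics are skipped
    so the ensemble can degrade gracefully instead of aborting."""
    results, errors = {}, {}
    with ThreadPoolExecutor(max_workers=len(metrics)) as pool:
        futures = {pool.submit(m.compute, text): m.name for m in metrics}
        for fut in as_completed(futures, timeout=timeout_s):
            name = futures[fut]
            try:
                results[name] = fut.result()
            except Exception as exc:   # fail-safe: one bad metric != no verdict
                errors[name] = str(exc)
    return {"results": results, "errors": errors}
```

---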

### 6. Reporter Layer (`reporter/`)

Generates comprehensive reports in multiple formats.

```mermaid
graph TB
    subgraph "Reporter Layer"
        direction TB

        REPORT[Report Generator]

        subgraph "JSON Report"
            JSON[Structured JSON]
            JSON --> META["Report Metadata<br/>Timestamp<br/>Version"]
            JSON --> RESULTS["Overall Results<br/>Probabilities<br/>Confidence"]
            JSON --> METRICS["Detailed Metrics<br/>Sub-metrics<br/>Weights"]
            JSON --> REASONING["Detection Reasoning<br/>Evidence<br/>Recommendations"]
            JSON --> HIGHLIGHT["Highlighted Sentences<br/>Color Classes<br/>Probabilities"]
            JSON --> PERF["Performance Metrics<br/>Execution Times<br/>Warnings/Errors"]
        end

        subgraph "PDF Report"
            PDF[Professional PDF]
            PDF --> PAGE1["Page 1: Executive Summary<br/>Verdict, Stats, Reasoning"]
            PDF --> PAGE2["Page 2: Content Analysis<br/>Domain, Metrics, Weights"]
            PDF --> PAGE3[Page 3: Structural & Entropy]
            PDF --> PAGE4[Page 4: Perplexity & Semantic]
            PDF --> PAGE5[Page 5: Linguistic & MPS]
            PDF --> PAGE6[Page 6: Recommendations]

            STYLE[Premium Styling]
            STYLE --> COLORS["Color Scheme<br/>Blue/Green/Red/Purple"]
            STYLE --> TABLES["Professional Tables<br/>Charts, Metrics"]
            STYLE --> LAYOUT["Multi-Page Layout<br/>Headers, Footers"]
        end
    end

    REPORT -->|generates| JSON
    REPORT -->|generates| PDF
    PDF -->|uses| STYLE

    style REPORT fill:#fff3e0
    style JSON fill:#e8f5e9
    style PDF fill:#e3f2fd
    style STYLE fill:#f3e5f5
```

**Report Formats:**
- **JSON**: Machine-readable with complete data
- **PDF**: Human-readable with professional formatting
- **Charts**: Pie charts for probability distribution
- **Tables**: Metric contributions, detailed sub-metrics
- **Styling**: Color-coded, multi-page layout with branding

---

## Data Flow

### Complete Detection Pipeline

```mermaid
sequenceDiagram
    participant User
    participant Orchestrator
    participant Processors
    participant Metrics
    participant Ensemble
    participant Services
    participant Reporter

    User->>Orchestrator: analyze(text)

    Note over Orchestrator: Step 1: Preprocessing
    Orchestrator->>Processors: TextProcessor.process()
    Processors-->>Orchestrator: ProcessedText

    Note over Orchestrator: Step 2: Language Detection
    Orchestrator->>Processors: LanguageDetector.detect()
    Processors-->>Orchestrator: LanguageResult

    Note over Orchestrator: Step 3: Domain Classification
    Orchestrator->>Processors: DomainClassifier.classify()
    Processors-->>Orchestrator: DomainPrediction

    Note over Orchestrator: Step 4: Parallel Metric Execution
    par Structural
        Orchestrator->>Metrics: Structural.compute()
        Metrics-->>Orchestrator: MetricResult
    and Perplexity
        Orchestrator->>Metrics: Perplexity.compute()
        Metrics-->>Orchestrator: MetricResult
    and Entropy
        Orchestrator->>Metrics: Entropy.compute()
        Metrics-->>Orchestrator: MetricResult
    and Semantic
        Orchestrator->>Metrics: Semantic.compute()
        Metrics-->>Orchestrator: MetricResult
    and Linguistic
        Orchestrator->>Metrics: Linguistic.compute()
        Metrics-->>Orchestrator: MetricResult
    and MPS
        Orchestrator->>Metrics: MPS.compute()
        Metrics-->>Orchestrator: MetricResult
    end

    Note over Orchestrator: Step 5: Ensemble Aggregation
    Orchestrator->>Ensemble: predict(metric_results, domain)
    Ensemble-->>Orchestrator: EnsembleResult

    Note over Orchestrator: Step 6: Services
    Orchestrator->>Services: generate_highlights()
    Services-->>Orchestrator: HighlightedSentences

    Orchestrator->>Services: generate_reasoning()
    Services-->>Orchestrator: DetailedReasoning

    Orchestrator->>Reporter: generate_report()
    Reporter-->>Orchestrator: Report Files

    Orchestrator-->>User: DetectionResult
```

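The six steps condense into a short, runnable sketch where each component is a stub injected into the pipeline; all interfaces here are assumptions layered on the earlier sketches, not the real `services/orchestrator.py`.

```python
from dataclasses import dataclass

@dataclass
class Pipeline:
    process: callable          # 1. text preprocessing
    detect_language: callable  # 2. language detection
    classify_domain: callable  # 3. domain classification
    metrics: list              # 4. metric objects exposing .compute(text)
    ensemble: callable         # 5. aggregation over metric results

    def analyze(self, text: str) -> dict:
        cleaned = self.process(text)
        lang = self.detect_language(cleaned)
        domain = self.classify_domain(cleaned)
        results = {m.name: m.compute(cleaned) for m in self.metrics}  # (4) may be parallel
        verdict = self.ensemble(results, domain)
        return {  # 6. result compilation
            "language": lang, "domain": domain,
            "metrics": results, "verdict": verdict,
        }
```
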
### Ensemble Aggregation Flow

```mermaid
graph TD
    START[Metric Results] --> FILTER["Filter Valid Metrics<br/>Remove Errors"]
    FILTER --> WEIGHTS["Get Domain Weights<br/>Base Weights"]

    WEIGHTS --> METHOD{Primary Method?}

    METHOD -->|Confidence Calibrated| CONF["Sigmoid Confidence<br/>Adjustment"]
    METHOD -->|Consensus Based| CONS["Agreement<br/>Calculation"]
    METHOD -->|Domain Weighted| DOMAIN["Static Domain<br/>Weights"]

    CONF --> AGGREGATE[Weighted Aggregation]
    CONS --> AGGREGATE
    DOMAIN --> AGGREGATE

    AGGREGATE --> NORMALIZE[Normalize to 1.0]

    NORMALIZE --> CALC[Calculate Metrics]
    CALC --> CONFIDENCE["Overall Confidence<br/>Base + Agreement<br/>+ Certainty + Quality"]
    CALC --> UNCERTAINTY["Uncertainty Score<br/>Variance + Confidence<br/>+ Decision"]
    CALC --> CONSENSUS["Consensus Level<br/>Std Dev Analysis"]

    CONFIDENCE --> THRESHOLD["Apply Adaptive<br/>Threshold"]
    UNCERTAINTY --> THRESHOLD

    THRESHOLD --> VERDICT{Verdict}
    VERDICT -->|Synthetic >= 0.6| SYNTH[Synthetically-Generated]
    VERDICT -->|Authentic >= 0.6| AUTH[Authentically-Written]
    VERDICT -->|Hybrid > 0.25| HYBRID[Hybrid]
    VERDICT -->|Uncertain| UNC[Uncertain]

    SYNTH --> REASON[Generate Reasoning]
    AUTH --> REASON
    HYBRID --> REASON
    UNC --> REASON

    REASON --> RESULT[EnsembleResult]

    style START fill:#e8f5e9
    style RESULT fill:#e3f2fd
    style SYNTH fill:#ffebee
    style AUTH fill:#e8f5e9
    style HYBRID fill:#fff3e0
    style UNC fill:#f5f5f5
```
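
A minimal sketch of the confidence-calibrated path through this flow: sigmoid confidence adjustment, weight normalization, and the 0.6 verdict cutoffs. The exact formulas and the sigmoid steepness are assumptions; `services/ensemble_classifier.py` may differ, and hybrid detection is omitted for brevity.

```python
import math

def sigmoid(x: float) -> float:
    return 1.0 / (1.0 + math.exp(-x))

def aggregate(metric_results: dict, base_weights: dict) -> dict:
    """metric_results: name -> (synthetic_probability, confidence)."""
    # Scale each metric's domain weight by a sigmoid of its confidence,
    # then renormalize so the effective weights sum to 1.0.
    raw = {
        name: base_weights.get(name, 0.0) * sigmoid(4.0 * (conf - 0.5))
        for name, (_, conf) in metric_results.items()
    }
    total = sum(raw.values()) or 1.0
    weights = {name: w / total for name, w in raw.items()}

    p_syn = sum(weights[n] * p for n, (p, _) in metric_results.items())
    if p_syn >= 0.6:
        verdict = "Synthetically-Generated"
    elif (1.0 - p_syn) >= 0.6:
        verdict = "Authentically-Written"
    else:
        verdict = "Uncertain"
    return {"synthetic_probability": p_syn, "verdict": verdict, "weights": weights}
```

---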
| 617 |
+
|
| 618 |
+
## Technology Stack
|
| 619 |
+
|
| 620 |
+
### Core Technologies
|
| 621 |
+
|
| 622 |
+
```mermaid
|
| 623 |
+
graph LR
|
| 624 |
+
subgraph "Language & Runtime"
|
| 625 |
+
PYTHON[Python 3.10+]
|
| 626 |
+
CONDA[Conda Environment]
|
| 627 |
+
end
|
| 628 |
+
|
| 629 |
+
subgraph "ML Frameworks"
|
| 630 |
+
TORCH[PyTorch]
|
| 631 |
+
HF[HuggingFace Transformers]
|
| 632 |
+
SPACY[spaCy]
|
| 633 |
+
SKLEARN[scikit-learn]
|
| 634 |
+
end
|
| 635 |
+
|
| 636 |
+
subgraph "NLP Models"
|
| 637 |
+
GPT2[GPT-2Perplexity/MPS]
|
| 638 |
+
MINILM[MiniLM-L6-v2Semantic]
|
| 639 |
+
ROBERTA[RoBERTaDomain Classify]
|
| 640 |
+
DISTIL[DistilRoBERTaMPS Mask]
|
| 641 |
+
XLM[XLM-RoBERTaLanguage Detect]
|
| 642 |
+
SPACYMODEL[en_core_web_smLinguistic]
|
| 643 |
+
end
|
| 644 |
+
|
| 645 |
+
subgraph "Document Processing"
|
| 646 |
+
PYMUPDF[PyMuPDF]
|
| 647 |
+
PDFPLUMBER[pdfplumber]
|
| 648 |
+
PYPDF2[PyPDF2]
|
| 649 |
+
DOCX[python-docx]
|
| 650 |
+
BS4[BeautifulSoup4]
|
| 651 |
+
end
|
| 652 |
+
|
| 653 |
+
subgraph "Utilities"
|
| 654 |
+
NUMPY[NumPy]
|
| 655 |
+
PYDANTIC[Pydantic]
|
| 656 |
+
LOGURU[Loguru]
|
| 657 |
+
REPORTLAB[ReportLab]
|
| 658 |
+
end
|
| 659 |
+
|
| 660 |
+
PYTHON --> TORCH
|
| 661 |
+
TORCH --> HF
|
| 662 |
+
HF --> GPT2
|
| 663 |
+
HF --> MINILM
|
| 664 |
+
HF --> ROBERTA
|
| 665 |
+
HF --> DISTIL
|
| 666 |
+
HF --> XLM
|
| 667 |
+
PYTHON --> SPACY
|
| 668 |
+
SPACY --> SPACYMODEL
|
| 669 |
+
|
| 670 |
+
style PYTHON fill:#306998
|
| 671 |
+
style TORCH fill:#ee4c2c
|
| 672 |
+
style HF fill:#ff6f00
|
| 673 |
+
style SPACY fill:#09a3d5
|
| 674 |
+
```

### Dependencies Summary

| Category | Libraries | Purpose |
|----------|-----------|---------|
| **ML Core** | PyTorch, Transformers, spaCy | Model execution, NLP |
| **Document** | PyMuPDF, pdfplumber, python-docx | Multi-format extraction |
| **Analysis** | NumPy, scikit-learn | Numerical computation |
| **Validation** | Pydantic | Data validation |
| **Logging** | Loguru | Structured logging |
| **Reporting** | ReportLab | PDF generation |

---

## Deployment Architecture

```mermaid
graph TB
    subgraph "Deployment Options"
        direction TB

        subgraph "Standalone Application"
            SCRIPT[Python Scripts]
        end

        subgraph "Web Application"
            FASTAPI[FastAPI Server]
        end

        subgraph "API Service"
            REST[REST API Endpoints]
            BATCH[Batch Processing]
            ASYNC[Async Workers]
        end

        subgraph "Infrastructure"
            DOCKER[Docker Container]
            GPU[GPU Support<br/>Optional]
            STORAGE[Model Cache<br/>2.8GB]
        end
    end

    FASTAPI --> DOCKER
    REST --> DOCKER

    DOCKER --> GPU
    DOCKER --> STORAGE

    style FASTAPI fill:#e3f2fd
    style DOCKER fill:#2496ed
    style GPU fill:#76b900
```

### System Requirements

- **Python**: 3.10+
- **RAM**: 8GB minimum, 16GB recommended
- **Storage**: 5GB (models + data)
- **GPU**: Optional (CUDA/MPS for faster inference)
- **CPU**: 4+ cores for parallel execution

---

## Performance Characteristics

### Execution Modes

```mermaid
graph LR
    subgraph "Sequential Mode"
        S1[Metric 1] --> S2[Metric 2]
        S2 --> S3[Metric 3]
        S3 --> S4[Metric 4]
        S4 --> S5[Metric 5]
        S5 --> S6[Metric 6]
        S6 --> SRESULT[~15-30s]
    end

    subgraph "Parallel Mode"
        P1[Metric 1]
        P2[Metric 2]
        P3[Metric 3]
        P4[Metric 4]
        P5[Metric 5]
        P6[Metric 6]

        P1 --> PRESULT[~8-12s]
        P2 --> PRESULT
        P3 --> PRESULT
        P4 --> PRESULT
        P5 --> PRESULT
        P6 --> PRESULT
    end

    style SRESULT fill:#ffebee
    style PRESULT fill:#e8f5e9
```

### Metric Execution Times

| Metric | Avg Time | Complexity | Model Size |
|--------|----------|------------|------------|
| Structural | 0.5-1s | Low | 0MB |
| Perplexity | 2-4s | Medium | 548MB |
| Entropy | 1-2s | Medium | ~50MB (shared) |
| Semantic | 3-5s | Medium | 80MB |
| Linguistic | 2-3s | Medium | 13MB |
| MPS | 5-10s | High | 878MB (GPT-2 + DistilRoBERTa) |

**Total Sequential**: ~15-25 seconds
**Total Parallel**: ~8-12 seconds (limited by slowest metric)
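
Because each metric is an independent module, parallel mode maps directly onto a thread pool, and total latency collapses to roughly that of the slowest metric (MPS). A minimal sketch, assuming each metric object exposes `name` and `compute(text)` as in the `BaseMetric` interface elsewhere in this commit:

```python
# Sketch of parallel metric execution; assumes the BaseMetric-style interface
# described in this commit, exact names may differ in the shipped orchestrator.
from concurrent.futures import ThreadPoolExecutor

def run_metrics_parallel(metrics: list, text: str) -> dict:
    """Run independent metrics concurrently and collect results by name."""
    with ThreadPoolExecutor(max_workers=len(metrics)) as pool:
        futures = {pool.submit(m.compute, text): m.name for m in metrics}
        return {name: future.result() for future, name in futures.items()}
```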

---

## Security & Privacy

### Data Handling

```mermaid
graph TD
    INPUT[Text Input] --> PROCESS[Processing]
    PROCESS --> MEMORY[In-Memory Only]
    MEMORY --> ANALYSIS[Analysis]
    ANALYSIS --> CLEANUP[Auto Cleanup]

    MODELS[Model Cache] -.->|Read Only| ANALYSIS

    REPORTS[Optional Reports] --> STORAGE[Local Storage Only]

    CLEANUP --> DISCARD[Data Discarded]

    style INPUT fill:#e3f2fd
    style MEMORY fill:#fff3e0
    style CLEANUP fill:#e8f5e9
    style DISCARD fill:#ffebee
```

### Security Features

- ✅ **No External Data Transmission**: All processing is local
- ✅ **No Data Persistence**: Text data is not stored by default
- ✅ **Model Integrity**: Checksums for downloaded models
- ✅ **Input Validation**: Pydantic schemas for all inputs
- ✅ **Error Isolation**: Graceful degradation, no information leakage

---

> This system does not claim ground-truth authorship. It estimates probabilistic authenticity signals based on measurable text properties.

docs/BLOGPOST.md
CHANGED

@@ -1,398 +1,448 @@

Removed lines are truncated in this rendering. See raw diff for the previous version of the post; the rewritten version follows in full.

# Building TEXT-AUTH: An Evidence-First System for Forensic Text Analysis

> *How a multi-metric, domain-aware forensic platform was implemented for probabilistic text authenticity assessment — without authorship claims.*

---

## Introduction: Why Text Forensics Needs a Rethink

The widespread availability of high-quality generative language systems has fundamentally altered the landscape of written communication. In education, publishing, journalism, and enterprise domains, stakeholders increasingly confront a complex forensic question:

> Does this text exhibit statistical, linguistic, and semantic patterns consistent with organically composed language, or does it display measurable characteristics associated with algorithmically regularized generation?

Traditional detection systems attempt to answer this with binary classifications: "Human" or "AI." This approach fails in practice because real-world text exists along a continuum—often hybrid, domain-specific, edited, paraphrased, or collaboratively produced.

TEXT-AUTH was conceived not as another detector, but as a forensic analysis system that evaluates observable textual properties and reports probabilistic consistency signals with explicit reasoning and uncertainty quantification. It provides evidence, not verdicts.

---

## Design Philosophy: Evidence, Not Attribution

At its core, TEXT-AUTH operates under a strict methodological constraint:

> The system does not determine who wrote a text.
> It evaluates measurable linguistic and statistical patterns present in the text.

This distinction is both technical and ethical. By focusing on patterns rather than provenance, the system avoids the philosophical quagmire of authorship attribution while providing actionable forensic intelligence.

All outputs are framed as probabilistic assessments accompanied by:

- Explicit confidence intervals
- Quantified uncertainty scores
- Domain-specific calibration context
- Sentence-level evidence annotation

This architecture makes TEXT-AUTH suitable for high-stakes workflows where explainability, auditability, and human judgment remain essential components of decision-making.

---

## Core Architectural Principles

TEXT-AUTH implements five foundational principles that differentiate it from conventional detection systems:

### 1. Multi-Dimensional Analysis

Instead of relying on a single metric (typically perplexity), the system evaluates six orthogonal forensic signals, each capturing distinct aspects of textual consistency. This multi-dimensional approach provides robustness against adversarial manipulation—while individual metrics can be gamed, simultaneously gaming all six requires sophisticated effort that often produces other detectable anomalies.

### 2. Domain-Aware Calibration

The system recognizes that different writing genres exhibit different baseline characteristics. Academic papers naturally demonstrate lower perplexity than creative fiction. Legal documents show different structural patterns than social media posts. TEXT-AUTH implements sixteen domain-specific configurations, each with calibrated thresholds and metric weights, reducing false positives by 15–20% compared to generic detection approaches.

### 3. Explicit Uncertainty Modeling

Rather than forcing certainty, the system explicitly quantifies and reports uncertainty through a composite score combining:

- Inter-metric disagreement (variance)
- Individual metric confidence levels
- Distance from decision boundaries

High uncertainty triggers explicit recommendations for human review rather than automated decisions.

### 4. Granular Sentence-Level Analysis

Instead of providing a single document-level score, the system performs sentence-by-sentence forensic evaluation, producing color-coded visualizations that highlight where statistical anomalies occur. This granular approach provides actionable insights for editing, revision, and targeted review.

### 5. Transparent, Explainable Reasoning

Every analysis includes human-readable explanations detailing:

- Which metrics contributed most to the assessment
- Specific text patterns that triggered detection
- Domain context considerations
- Uncertainty sources and confidence factors

This transparency builds trust and enables informed decision-making.

---

## The Forensic Model: Six Orthogonal Signals

TEXT-AUTH evaluates text through six independent analytical lenses, each examining different dimensions of linguistic behavior. These metrics were selected based on their statistical independence, computational feasibility, and demonstrated discriminative power across text genres.

### 1. Statistical Predictability Analysis (Perplexity)

**What it measures**: The average negative log-likelihood of tokens given their preceding context, quantifying how "surprised" a reference language model is by the text sequence.

**Mathematical Foundation**:

$$
\text{Perplexity}(T) = \exp\left(-\frac{1}{N}\sum_{i=1}^{N} \log P(w_i \mid w_{<i})\right)
$$

**Forensic Insight**: Language models generate text by selecting tokens with high conditional probabilities, creating sequences that occupy high-probability regions of the language distribution. Human writing, in contrast, includes unexpected lexical choices, creative expressions, and domain-specific jargon that models find statistically "surprising."

**Domain Calibration**: Expected perplexity ranges differ significantly by genre. Academic writing naturally exhibits lower perplexity due to formal structure and technical terminology. Creative writing shows higher baseline perplexity due to stylistic variation. Social media content displays the highest natural perplexity due to informal language and idiosyncratic expression.
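
As a concrete illustration, the formula above can be computed directly with the GPT-2 reference model named in this commit's technology stack. This is a self-contained sketch, not the repo's perplexity metric:

```python
# Perplexity = exp(mean token negative log-likelihood) under a reference model.
# A sketch using GPT-2; production use should chunk texts past 1024 tokens.
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def perplexity(text: str) -> float:
    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    with torch.no_grad():
        # With labels=input_ids, the model returns the mean cross-entropy loss.
        loss = model(**enc, labels=enc["input_ids"]).loss
    return torch.exp(loss).item()
```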

---

### 2. Information Diversity Measurement (Entropy)

**What it measures**: The dispersion and unpredictability of token usage throughout the text, quantifying lexical richness and variation.

**Mathematical Foundation**:

$$
H(X) = -\sum_{i=1}^{n} p(x_i) \log_2 p(x_i)
$$

**Forensic Insight**: Human-authored text typically exhibits higher lexical entropy due to expressive variation, nuanced vocabulary selection, and contextual adaptation. Algorithmically regularized text often shows more concentrated token distributions, with certain words and phrases appearing with unnatural frequency.

**Visual Representation**:

- Authentic Writing: ██░░░░░░░░ (High entropy, diverse distribution)
- Synthetic Generation: ██████░░░░ (Lower entropy, concentrated distribution)
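
The word-level case of $H(X)$ takes only a few lines. A self-contained sketch (the shipped entropy metric additionally covers character, token, and chunk levels):

```python
# Shannon entropy over the word-frequency distribution, as defined above.
import math
from collections import Counter

def word_entropy(text: str) -> float:
    words = text.lower().split()
    if len(words) < 2:
        return 0.0
    total = len(words)
    counts = Counter(words)
    # H(X) = -sum p(x) * log2 p(x)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())
```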

---

### 3. Structural Rhythm Analysis (Burstiness and Uniformity)

**What it measures**: Sentence-level variation patterns through two complementary metrics.

**Burstiness Coefficient**:

$$
B = \frac{\sigma_L - \mu_L}{\sigma_L + \mu_L} \quad \text{where } B \in [-1, 1]
$$

Positive burstiness indicates varied sentence lengths; negative values indicate uniformity.

**Uniformity Metric**:

$$
U = 1 - \frac{\sigma_L}{\mu_L} \quad \text{for } \mu_L > 0
$$

**Forensic Insight**: Human writing exhibits natural rhythm—short, punchy sentences for emphasis followed by longer, complex sentences for elaboration. This creates characteristic "burstiness." Language model outputs tend toward more uniform sentence structures, creating a metronome-like consistency that lacks natural rhythmic variation.
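
Both statistics fall straight out of the sentence-length distribution. A sketch with naive punctuation-based splitting (the system itself uses spaCy sentence segmentation):

```python
# Burstiness B = (sigma - mu) / (sigma + mu); Uniformity U = 1 - sigma / mu.
import re
import statistics

def rhythm_scores(text: str) -> tuple:
    sentences = [s for s in re.split(r"[.!?]+", text) if s.strip()]
    lengths = [len(s.split()) for s in sentences]
    if len(lengths) < 2:
        return 0.0, 1.0
    mu = statistics.mean(lengths)
    sigma = statistics.pstdev(lengths)
    burstiness = (sigma - mu) / (sigma + mu) if (sigma + mu) > 0 else 0.0
    uniformity = 1.0 - (sigma / mu) if mu > 0 else 0.0
    return burstiness, uniformity
```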

---

### 4. Semantic Flow Evaluation (Coherence)

**What it measures**: The consistency of meaning between consecutive sentences using semantic embedding similarity.

**Mathematical Foundation**:

$$
\text{Coherence}(D) = \frac{1}{N_s-1} \sum_{i=1}^{N_s-1} \frac{\mathbf{e}_i \cdot \mathbf{e}_{i+1}}{\|\mathbf{e}_i\|\,\|\mathbf{e}_{i+1}\|}
$$

where $\mathbf{e}_i$ represents the embedding vector for sentence $i$.

**Forensic Insight**: Ironically, excessively high coherence can indicate algorithmic generation. Language models maintain remarkably consistent semantic flow through attention mechanisms. Human writing includes natural digressions, associative leaps, topic shifts, and rhetorical devices that create more variable coherence patterns.

**The Coherence Paradox**: In many contexts, better coherence actually provides evidence toward synthetic generation rather than organic composition.
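
Computed with MiniLM, the embedding model this commit lists for semantic analysis, the coherence score reduces to a short loop. A sketch, not the shipped semantic-analysis metric:

```python
# Mean cosine similarity between consecutive sentence embeddings.
import numpy as np
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def coherence(sentences: list) -> float:
    if len(sentences) < 2:
        return 1.0
    emb = encoder.encode(sentences)
    sims = [float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
            for a, b in zip(emb[:-1], emb[1:])]
    return float(np.mean(sims))
```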

---

### 5. Linguistic Pattern Analysis (Syntactic Complexity)

**What it measures**: Grammatical sophistication and syntactic variation through multiple sub-metrics:

**Part-of-Speech Diversity**:

$$
\text{POS}_{\text{diversity}} = \frac{|\{\text{POS tags}\}|}{N_{\text{tokens}}}
$$

**Parse Tree Depth Distribution**:

$$
D_{\text{syntactic}} = \frac{1}{N_{\text{sentences}}} \sum_{i=1}^{N_{\text{sentences}}} \max_{t \in s_i} \operatorname{depth}(t)
$$

**Forensic Insight**: Different writing styles exhibit characteristic syntactic fingerprints. Language models demonstrate systematic preferences for certain grammatical constructions, clause embeddings, and transitional patterns. Human writing shows greater syntactic irregularity, especially in longer passages where stylistic variation becomes more pronounced.
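
Both sub-metrics are directly computable with `en_core_web_sm`, the spaCy model named in this commit. A sketch of the idea:

```python
# POS diversity and mean max dependency-tree depth per sentence.
import spacy

nlp = spacy.load("en_core_web_sm")

def tree_depth(token) -> int:
    """Steps from a token up to the root of its dependency tree."""
    depth = 0
    while token.head is not token:
        token = token.head
        depth += 1
    return depth

def syntactic_features(text: str) -> dict:
    doc = nlp(text)
    tokens = [t for t in doc if not t.is_space]
    pos_diversity = len({t.pos_ for t in tokens}) / max(1, len(tokens))
    depths = [max(tree_depth(t) for t in sent) for sent in doc.sents]
    return {"pos_diversity": pos_diversity,
            "mean_max_depth": sum(depths) / max(1, len(depths))}
```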

---

### 6. Stability Under Perturbation

**What it measures**: How text probability changes under meaning-preserving modifications, based on DetectGPT principles.

**Mathematical Foundation**:

$$
\Delta_{\text{logp}} = \frac{1}{k} \sum_{j=1}^k \left| \log P(T) - \log P(T'_{\epsilon_j}) \right|
$$

**Forensic Insight**: Text generated by language models occupies characteristic "curvature" regions in probability space—local maxima where small perturbations cause predictable probability decreases. Human-written text, not originating from these probability distributions, shows different perturbation sensitivity patterns.

**Computational Consideration**: This is the most resource-intensive metric, so TEXT-AUTH implements conditional execution, reserving it for cases where other metrics provide insufficient confidence.
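
A sketch of the perturb-and-compare loop, using a fill-mask model for meaning-preserving edits (DistilRoBERTa plays this role in the commit) and a `log_prob` scorer such as the GPT-2 wrapper sketched earlier. All names here are illustrative assumptions, not the shipped multi-perturbation-stability metric:

```python
# DetectGPT-style stability probe: mask one word, let a fill-mask model
# replace it, and measure how much the sequence log-probability moves.
import random
from transformers import pipeline

fill = pipeline("fill-mask", model="distilroberta-base")

def perturb(text: str) -> str:
    words = text.split()
    i = random.randrange(len(words))
    words[i] = fill.tokenizer.mask_token
    best = fill(" ".join(words))[0]          # top mask-fill candidate
    words[i] = best["token_str"].strip()
    return " ".join(words)

def stability(text: str, log_prob, k: int = 5) -> float:
    """Mean |log P(T) - log P(T'_j)| over k perturbations."""
    base = log_prob(text)
    return sum(abs(base - log_prob(perturb(text))) for _ in range(k)) / k
```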

---

## Ensemble Aggregation Methodology

Each of the six metrics produces:

- A synthetic probability estimate $p_i \in [0,1]$
- An internal confidence score $c_i \in [0,1]$
- An evidence strength classification (weak/moderate/strong)

The aggregation process follows a sophisticated multi-stage approach:

### Stage 1: Domain-Specific Base Weighting

Each of the sixteen supported domains has pre-calibrated base weights reflecting metric importance for that genre:

**Academic Domain Weights**:

- Perplexity: 22%
- Entropy: 18%
- Structural: 15%
- Semantic: 15%
- Linguistic: 20%
- Stability: 10%

---

### Stage 2: Confidence-Adjusted Dynamic Weighting

Base weights are dynamically adjusted based on each metric's confidence using a sigmoid scaling function:

$$
w_i^{\text{(adjusted)}} = w_i^{\text{(base)}} \cdot \left( \frac{1}{1 + e^{-\gamma(c_i - 0.5)}} \right)
$$

where $\gamma = 10$ controls adjustment sensitivity.

---

### Stage 3: Normalization and Aggregation

Adjusted weights are normalized to sum to 1.0, then used for weighted probability calculation:

$$
P_{\text{synthetic}} = \sum_{i=1}^6 w_i^{\text{(final)}} \cdot p_i
$$
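
Stages 1 through 3 compose into a few lines. A sketch under the formulas above, with $\gamma = 10$ as stated; the dictionary shapes and names are illustrative:

```python
# Confidence-calibrated, domain-weighted aggregation (Stages 1-3).
import math

def aggregate(probs: dict, confs: dict, base_weights: dict,
              gamma: float = 10.0) -> float:
    # Stage 2: sigmoid confidence adjustment of the domain base weights
    adjusted = {m: base_weights[m] / (1.0 + math.exp(-gamma * (confs[m] - 0.5)))
                for m in probs}
    # Stage 3: normalize weights to 1.0, then take the weighted probability
    total = sum(adjusted.values())
    return sum((w / total) * probs[m] for m, w in adjusted.items())
```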

---

### Stage 4: Consensus Analysis

The system evaluates inter-metric agreement:

- High consensus increases overall confidence
- Low consensus triggers uncertainty flags
- Extreme disagreement may indicate adversarial manipulation or domain misclassification

---

## Uncertainty Quantification Framework

TEXT-AUTH explicitly models uncertainty through a three-component composite score:

### 1. Metric Disagreement Uncertainty

$$
U_{\text{variance}} = \min(1.0, \sigma_P \cdot 2)
$$

where $\sigma_P$ is the standard deviation of the six metric probabilities.

### 2. Confidence-Based Uncertainty

$$
U_{\text{confidence}} = 1 - \frac{1}{6} \sum_{i=1}^6 c_i
$$

### 3. Decision Boundary Uncertainty

$$
U_{\text{decision}} = 1 - 2 \cdot |P_{\text{synthetic}} - 0.5|
$$

This component captures how close the final probability is to the maximally uncertain point (0.5).

### Composite Uncertainty Score

$$
U_{\text{total}} = 0.4\,U_{\text{variance}} + 0.3\,U_{\text{confidence}} + 0.3\,U_{\text{decision}}
$$

**Interpretation Guidelines**:

- **< 0.20**: High confidence, reliable assessment
- **0.20 – 0.40**: Moderate confidence, use with appropriate caution
- **> 0.40**: Low confidence, inconclusive—recommend human review
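
The composite score is a direct weighted sum of the three components. A sketch mirroring the definitions above:

```python
# U_total = 0.4*U_variance + 0.3*U_confidence + 0.3*U_decision
import numpy as np

def uncertainty(probs: list, confs: list, p_synthetic: float) -> float:
    u_variance = min(1.0, float(np.std(probs)) * 2.0)
    u_confidence = 1.0 - float(np.mean(confs))
    u_decision = 1.0 - 2.0 * abs(p_synthetic - 0.5)
    return 0.4 * u_variance + 0.3 * u_confidence + 0.3 * u_decision
```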

---

## Domain-Aware Calibration System

The system recognizes that different writing genres have different normative characteristics. Sixteen domains are supported, each with specialized configurations.

### Domain Classification Process

1. **Feature Extraction**: Analyze text for domain indicators including formality, technical terminology, citation patterns, punctuation usage, and structural complexity
2. **Probabilistic Classification**: Use heuristic and optional pre-trained model-assisted inference to estimate domain probabilities
3. **Threshold Selection**: Apply domain-specific detection thresholds and metric weights

### Example Domain Configurations

**Academic Domain (Conservative thresholds)**:
- Higher linguistic complexity expectations
- Reduced sensitivity to low perplexity
- Elevated synthetic probability threshold (0.75)
- Priority on minimizing false positives

**Creative Domain (Adaptive thresholds)**:
- Enhanced entropy and structural analysis
- Tolerance for high perplexity variation
- Balanced synthetic threshold (0.70)
- Focus on stylistic pattern detection

**Social Media Domain (Lenient thresholds)**:
- Perplexity as primary signal
- Relaxed linguistic requirements
- Lower synthetic threshold (0.65)
- Emphasis on conversational authenticity

**Technical Documentation (Strict thresholds)**:
- Semantic coherence prioritization
- Highest synthetic threshold (0.80)
- Structural pattern analysis
- Maximum emphasis on minimizing false accusations
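
In configuration form, each domain reduces to a detection threshold plus a metric weight vector. An illustrative sketch using the academic values quoted earlier; the actual structure lives in `config/threshold_config.py` and may differ:

```python
# Illustrative domain configuration; only "academic" is spelled out here.
DOMAIN_CONFIGS = {
    "academic": {
        "synthetic_threshold": 0.75,   # conservative: minimize false positives
        "weights": {
            "perplexity": 0.22, "entropy": 0.18, "structural": 0.15,
            "semantic": 0.15, "linguistic": 0.20, "stability": 0.10,
        },
    },
    # creative: 0.70, social_media: 0.65, technical_documentation: 0.80, ...
}
```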

### Calibration Methodology

Thresholds were optimized using ROC curve analysis on curated datasets of 10,000+ verified texts per domain, with cross-validation to ensure generalization. The optimization objective balanced precision and recall while prioritizing false positive minimization in high-stakes domains.

---

## Interpretability and Explainability

### Sentence-Level Forensic Highlighting

Text is analyzed at the sentence level, with each sentence receiving a color-coded classification:

- 🔴 **Deep Red**: Strong synthetic consistency signals (> 80% probability)
- 🟠 **Light Red**: Moderate synthetic signals (60–80% probability)
- 🟡 **Yellow**: Inconclusive or mixed signals (40–60% probability)
- 🟢 **Green**: Strong authentic consistency signals (< 40% probability)

Hover interactions reveal detailed forensic data for each sentence, including individual metric scores and contributing factors.
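
The color assignment itself is a simple bucketing of each sentence's synthetic probability. A sketch matching the bands above (bucket names illustrative):

```python
# Map a sentence-level synthetic probability to a highlight bucket.
def highlight_color(p_synthetic: float) -> str:
    if p_synthetic > 0.8:
        return "deep_red"    # strong synthetic consistency signals
    if p_synthetic >= 0.6:
        return "light_red"   # moderate synthetic signals
    if p_synthetic >= 0.4:
        return "yellow"      # inconclusive or mixed signals
    return "green"           # strong authentic consistency signals
```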

### Natural Language Reasoning Generation

Every analysis includes comprehensive human-readable explanations structured as:

#### Executive Summary
A concise overview of the forensic assessment, including final probability, confidence level, and primary findings.

#### Key Forensic Indicators
Specific text characteristics that contributed to the assessment, such as:
- "Unusually uniform sentence structure (burstiness: -0.12)"
- "Exceptionally high semantic coherence (mean: 0.91)"
- "Low perplexity variance indicating predictable token sequences"

#### Confidence Factors Analysis
Explicit discussion of:
- Supporting evidence (metrics showing strong signals)
- Contradicting evidence (metrics showing conflicting signals)
- Uncertainty sources (domain ambiguity, text length limitations, etc.)

#### Metric Contribution Breakdown
Percentage attribution showing how much each forensic signal contributed to the final assessment, helping users understand the analytical weighting.

#### Domain Context Considerations
Explanation of how the text's genre affected the analysis, including any domain-specific adjustments applied to thresholds or interpretations.

---

## Ethical Framework and Implementation Principles

### Core Ethical Commitments

- **Transparency Over Certainty**: The system explicitly acknowledges uncertainty rather than feigning omniscience. All outputs include confidence intervals and uncertainty quantification.
- **Evidence Over Attribution**: TEXT-AUTH reports statistical patterns, not authorship claims. This distinction is maintained throughout the user interface, documentation, and API responses.
- **Contextual Awareness**: Analyses consider domain, genre, language, and cultural factors that might affect interpretation. The system includes bias mitigation measures for protected writing styles.
- **Human-in-the-Loop Design**: Automated analysis supports rather than replaces human judgment. High-uncertainty cases explicitly recommend human review, and all high-stakes applications require human oversight.
- **Continuous Auditing**: The system implements regular fairness evaluations, performance monitoring, and bias detection to identify and address emerging issues.

### Responsible Use Guidelines

**Appropriate Applications**
- Academic integrity screening (with human review processes)
- Content verification in editorial workflows
- Resume authenticity checking (as part of holistic review)
- Research on text generation patterns
- Writing assistance tool calibration

**Inappropriate Applications**
- Sole determinant for academic penalties
- Automated rejection without appeal mechanisms
- Surveillance without consent or disclosure
- Cross-cultural comparison without proper calibration
- Real-time monitoring without transparency

### Bias Mitigation Strategies

The system implements multiple bias reduction techniques:

- **Domain normalization**: Genre-specific baselines reduce false positives against formal writing styles
- **Confidence thresholding**: Higher uncertainty triggers human review for edge cases
- **Protected style detection**: Identification of non-native, neurodivergent, or regional writing patterns with adjusted interpretation
- **Regular fairness auditing**: Scheduled evaluation of performance across demographic and stylistic subgroups

### Computational Performance

- Short texts (100–500 words): 1.2 seconds average processing
- Medium texts (500–2000 words): 3.5 seconds average
- Long texts (2000+ words): 7.8 seconds average
- Parallel execution: 2.9× speedup over sequential processing
- Memory footprint: 1.5–3.0 GB depending on configuration

---

## Conclusion: Toward Responsible Text Forensics

TEXT-AUTH represents a paradigm shift in text authenticity analysis—from binary classification to evidence-based forensic assessment. By combining orthogonal statistical signals with domain-aware calibration and transparent reasoning, the system provides actionable intelligence while acknowledging the inherent complexity and uncertainty of the problem.

### Key Contributions

- **Methodological Innovation**: A multi-metric, domain-calibrated approach that recognizes genre diversity in writing patterns
- **Uncertainty Quantification**: Explicit modeling of confidence and uncertainty prevents overconfident errors
- **Transparent Reasoning**: Comprehensive explainability builds trust and enables informed decision-making
- **Ethical Foundation**: Clear boundaries around appropriate use and acknowledgment of limitations
- **Production Engineering**: Parallel processing, efficient caching, and scalable architecture enable real-world deployment

---

### The Path Forward

Text authenticity assessment remains an evolving challenge in the age of generative AI. TEXT-AUTH provides a foundation for responsible forensic analysis, but continued development is essential:

- Multilingual expansion to support diverse linguistic contexts
- Real-time analysis capabilities for interactive writing environments
- Enhanced adversarial robustness against evolving evasion techniques
- Institutional calibration frameworks for organization-specific needs
- Collaborative research initiatives to advance the field collectively

Ultimately, the goal is not perfect detection—an unrealistic standard in an adversarial environment—but rather the development of tools that make authenticity analysis more transparent, more nuanced, and more accountable than previous approaches.

By focusing on evidence rather than attribution, uncertainty rather than false certainty, and support rather than replacement of human judgment, TEXT-AUTH contributes to building trust in written communication in the generative AI era.

---

**TEXT-AUTH Forensic Text Analysis Platform**
Version 1.0 — December 2025
Author: Satyaki Mitra
_Evidence-based assessment, transparent reasoning, responsible implementation_

---
docs/WHITE_PAPER.md
CHANGED

The diff for this file is too large to render.
See raw diff

example.py
DELETED

@@ -1,45 +0,0 @@
# Complete detection + reporting pipeline

from detector.orchestrator import DetectionOrchestrator
from detector.attribution import ModelAttributor
from reporter.report_generator import ReportGenerator

# 1. Initialize components
orchestrator = DetectionOrchestrator()
orchestrator.initialize()

attributor = ModelAttributor()
attributor.initialize()

reporter = ReportGenerator()

# 2. Analyze text
text = """Perplexity measures how well a language model predicts a sample; lower perplexity indicates better predictive accuracy. In AI detection, models often exhibit unnaturally low perplexity because their outputs are statistically optimized rather than organically generated. Human writing tends to have higher variability and “burstiness”—irregular patterns of word choice and sentence structure. By combining perplexity with burstiness analysis and fine-tuned classifiers like RoBERTa, detectors can identify AI-generated text with greater confidence. Ensemble methods further improve reliability by aggregating multiple signals. This multi-layered approach reduces false positives and adapts to evolving AI models. Understanding these metrics helps users interpret detection scores meaningfully."""

detection_result = orchestrator.analyze(text)

# 3. Attribute model
attribution_result = attributor.attribute(
    text=text,
    processed_text=detection_result.processed_text,
    metric_results=detection_result.metric_results,
)

# 4. Generate reports
report_files = reporter.generate_complete_report(
    detection_result=detection_result,
    attribution_result=attribution_result,
    formats=["json", "pdf", "txt"],
    filename_prefix="my_analysis",
)

print("Generated reports:")
for format_type, filepath in report_files.items():
    print(f"  {format_type.upper()}: {filepath}")

# Output (one entry per requested format):
# Generated reports:
#   JSON: reports/output/my_analysis_20250101_143022.json
#   PDF: reports/output/my_analysis_20250101_143022.pdf
#   TXT: reports/output/my_analysis_20250101_143022.txt
logs/application/app_2025-11-07.log
DELETED

The diff for this file is too large to render.
See raw diff

metrics/base_metric.py
CHANGED

@@ -1,89 +1,12 @@
 # DEPENDENCIES
 from abc import ABC
-from enum import Enum
 from typing import Any
 from typing import Dict
-from typing import Tuple
 from loguru import logger
 from typing import Optional
 from abc import abstractmethod
-from
+from config.schemas import MetricResult
+from config.constants import base_metric_params
-
-
-class MetricResult:
-    """
-    Result from a metric calculation
-    """
-    def __init__(self, metric_name: str, ai_probability: float, human_probability: float, mixed_probability: float, confidence: float, details: Optional[Dict[str, Any]] = None, error: Optional[str] = None):
-        self.metric_name       = metric_name
-        self.ai_probability    = max(0.0, min(1.0, ai_probability))
-        self.human_probability = max(0.0, min(1.0, human_probability))
-        self.mixed_probability = max(0.0, min(1.0, mixed_probability))
-        self.confidence        = max(0.0, min(1.0, confidence))
-        self.details           = details or {}
-        self.error             = error
-
-        # Normalize probabilities to sum to 1
-        total = self.ai_probability + self.human_probability + self.mixed_probability
-
-        if (total > 0):
-            self.ai_probability    /= total
-            self.human_probability /= total
-            self.mixed_probability /= total
-
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Convert to dictionary
-        """
-        return {"metric_name"       : self.metric_name,
-                "ai_probability"    : round(self.ai_probability, 4),
-                "human_probability" : round(self.human_probability, 4),
-                "mixed_probability" : round(self.mixed_probability, 4),
-                "confidence"        : round(self.confidence, 4),
-                "details"           : self.details,
-                "error"             : self.error,
-                "success"           : self.error is None,
-                }
-
-    @property
-    def is_ai(self) -> bool:
-        """
-        Check if classified as AI
-        """
-        return self.ai_probability > max(self.human_probability, self.mixed_probability)
-
-    @property
-    def is_human(self) -> bool:
-        """
-        Check if classified as human
-        """
-        return self.human_probability > max(self.ai_probability, self.mixed_probability)
-
-    @property
-    def is_mixed(self) -> bool:
-        """
-        Check if classified as mixed
-        """
-        return self.mixed_probability > max(self.ai_probability, self.human_probability)
-
-    @property
-    def predicted_class(self) -> str:
-        """
-        Get predicted class
-        """
-        if self.is_ai:
-            return "AI"
-        elif self.is_human:
-            return "Human"
-        else:
-            return "Mixed"


 class BaseMetric(ABC):

@@ -91,11 +14,15 @@ class BaseMetric(ABC):
     Abstract base class for all detection metrics
     """
     def __init__(self, name: str, description: str):
-        self.name
-        self.description
-        self.is_initialized
-        self._model
-        self._tokenizer
+        self.name                          = name
+        self.description                   = description
+        self.is_initialized                = False
+        self._model                        = None
+        self._tokenizer                    = None
+        self.default_synthetic_probability = base_metric_params.DEFAULT_SYNTHETIC_PROBABILITY
+        self.default_authentic_probability = base_metric_params.DEFAULT_AUTHENTIC_PROBABILITY
+        self.default_hybrid_probability    = base_metric_params.DEFAULT_HYBRID_PROBABILITY
+        self.default_confidence            = base_metric_params.DEFAULT_CONFIDENCE

     @abstractmethod

@@ -178,13 +105,7 @@
             if not self.is_initialized:
                 logger.warning(f"{self.name}: Not initialized, initializing now...")
                 if not self.initialize():
-                    return
-                        ai_probability    = 0.5,
-                        human_probability = 0.5,
-                        mixed_probability = 0.0,
-                        confidence        = 0.0,
-                        error             = "Failed to initialize metric",
-                    )
+                    return self._default_result(error = "Failed to initialize metric")

             result = self.compute(text, **kwargs)
             return result

@@ -192,13 +113,7 @@
         except Exception as e:
             logger.error(f"{self.name}: Error computing metric: {e}")
-            return
-                ai_probability    = 0.5,
-                human_probability = 0.5,
-                mixed_probability = 0.0,
-                confidence        = 0.0,
-                error             = str(e),
-            )
+            return self._default_result(error = str(e))

     def batch_compute(self, texts: list, **kwargs) -> list:

@@ -237,6 +152,19 @@
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}(name='{self.name}', initialized={self.is_initialized})"

+    def _default_result(self, error: Optional[str] = None) -> MetricResult:
+        """
+        Default metric result for exception cases
+        """
+        return MetricResult(metric_name           = self.name,
+                            synthetic_probability = self.default_synthetic_probability,
+                            authentic_probability = self.default_authentic_probability,
+                            hybrid_probability    = self.default_hybrid_probability,
+                            confidence            = self.default_confidence,
+                            error                 = error,
+                            )


 class StatisticalMetric(BaseMetric):

@@ -255,6 +183,5 @@
 # Export
 __all__ = ["BaseMetric",
-           "MetricResult",
            "StatisticalMetric",
           ]
metrics/entropy.py
CHANGED

@@ -6,10 +6,11 @@ from typing import Dict
 from typing import List
 from loguru import logger
 from collections import Counter
 from metrics.base_metric import BaseMetric
-from config.threshold_config import Domain
-from metrics.base_metric import MetricResult
 from models.model_manager import get_model_manager
 from config.threshold_config import get_threshold_for_domain

@@ -22,13 +23,14 @@ class EntropyMetric(BaseMetric):
     - Word-level entropy and burstiness
     - Token-level diversity and unpredictability in sequences
     - Entropy distribution across text chunks
-    -
     """
     def __init__(self):
         super().__init__(name        = "entropy",
                          description = "Token-level diversity and unpredictability in text sequences",
                          )
         self.tokenizer = None

     def initialize(self) -> bool:

@@ -40,7 +42,7 @@
         # Load tokenizer for token-level analysis
         model_manager = get_model_manager()
-        gpt_model = model_manager.load_model("
         if isinstance(gpt_model, tuple):
             self.tokenizer = gpt_model[1]

@@ -62,108 +64,105 @@
         Compute enhanced entropy measures for text with FULL DOMAIN THRESHOLD INTEGRATION
         """
         try:
-            if (not text or (len(text.strip()) <
-                return MetricResult(metric_name
-                                    confidence
-                                    error
                                     )

             # Get domain-specific thresholds
-            domain
-            domain_thresholds
-            entropy_thresholds

             # Calculate comprehensive entropy features
-            features

             # Calculate raw entropy score (0-1 scale)
-            raw_entropy_score, confidence

             # Apply domain-specific thresholds to convert raw score to probabilities
-

             # Apply confidence multiplier from domain thresholds
-            confidence
-            confidence

-            return MetricResult(metric_name
-                                confidence
-                                details
                                 )

         except Exception as e:
             logger.error(f"Error in entropy computation: {repr(e)}")
-            return
-                ai_probability    = 0.5,
-                human_probability = 0.5,
-                mixed_probability = 0.0,
-                confidence        = 0.0,
-                error             = str(e),
-            )

     def _apply_domain_thresholds(self, raw_score: float, thresholds: Any, features: Dict[str, Any]) -> tuple:
         """
         Apply domain-specific thresholds to convert raw score to probabilities
         """
-
-

         # Calculate probabilities based on threshold distances
-        if (raw_score >=
-            # Above
-            distance_from_threshold = raw_score -
-        elif (raw_score <=
-            # Below
-            distance_from_threshold =

         else:
             # Between thresholds - uncertain zone
-            range_width =
-            if (range_width >
-                position_in_range = (raw_score -

             else:
-

         # Ensure probabilities are valid
-

-        # Calculate

         # Normalize to sum to 1.0
-        total

-        if (total >

-        return


-    def
         """
         Calculate comprehensive entropy measures including document-required features
         """

@@ -179,15 +178,15 @@
         sequence_unpredictability = self._calculate_sequence_unpredictability(text)

         # Chunk-based analysis for whole-text understanding
-        chunk_entropies = self._calculate_chunk_entropy(text
         entropy_variance  = np.var(chunk_entropies) if chunk_entropies else 0.0
         avg_chunk_entropy = np.mean(chunk_entropies) if chunk_entropies else 0.0

-        #

         # Predictability measures
-        predictability = 1.0 - min(1.0, char_entropy /

         return {"char_entropy" : round(char_entropy, 4),
                 "word_entropy" : round(word_entropy, 4),

@@ -197,7 +196,7 @@
                 "entropy_variance"     : round(entropy_variance, 4),
                 "avg_chunk_entropy"    : round(avg_chunk_entropy, 4),
                 "predictability_score" : round(predictability, 4),
-                "
                 "num_chunks_analyzed"  : len(chunk_entropies),
                 }

@@ -221,7 +220,8 @@
         for count in char_counts.values():
             probability = count / total_chars
-

         return entropy

@@ -231,7 +231,7 @@
         Calculate word-level entropy
         """
         words = text.lower().split()
-        if (len(words) <
             return 0.0

         word_counts = Counter(words)

@@ -241,7 +241,8 @@
         for count in word_counts.values():
             probability = count / total_words
-

         return entropy

@@ -255,7 +256,7 @@
             return 0.0

         # Length check before tokenization
-        if (len(text.strip()) <
             return 0.0

         # Tokenize text

@@ -264,7 +265,7 @@
                  truncation = True,
                  )

-        if (len(tokens) <
             return 0.0

         token_counts = Counter(tokens)

@@ -274,7 +275,8 @@
         for count in token_counts.values():
             probability = count / total_tokens
-

         return entropy

@@ -285,14 +287,14 @@
     def _calculate_token_diversity(self, text: str) -> float:
         """
-        Calculate token-level diversity : Higher diversity = more
         """
         if not self.tokenizer:
             return 0.0

         try:
             tokens = self.tokenizer.encode(text, add_special_tokens=False)
-            if (len(tokens) <
                 return 0.0

             unique_tokens = len(set(tokens))

@@ -317,7 +319,7 @@
         try:
             tokens = self.tokenizer.encode(text, add_special_tokens=False)
-            if (len(tokens) <
                 return 0.0

             # Calculate bigram unpredictability

@@ -329,11 +331,12 @@
         sequence_entropy = 0.0

         for count in bigram_counts.values():
-            probability
-

-        # Normalize to 0-1 scale
-        normalized_entropy = min(1.0, sequence_entropy /

         return normalized_entropy

@@ -342,28 +345,32 @@
         return 0.0

-    def _calculate_chunk_entropy(self, text: str
         """
         Calculate entropy distribution across text chunks
         """
-        chunks
-        words

         # Create overlapping chunks for better analysis
-        for i in range(0, len(words),
             chunk = ' '.join(words[i:i + chunk_size])

             # Minimum chunk size
-            if (len(chunk) >
                 entropy = self._calculate_character_entropy(chunk)
-

         return chunks

-    def
         """
-        Detect
         """
         patterns_detected = 0
         total_patterns    = 4

@@ -371,30 +378,30 @@
|
|
| 371 |
# Overly consistent character distribution
|
| 372 |
char_entropy = self._calculate_character_entropy(text)
|
| 373 |
|
| 374 |
-
#
|
| 375 |
-
if (char_entropy <
|
| 376 |
patterns_detected += 1
|
| 377 |
|
| 378 |
# Low token diversity
|
| 379 |
token_diversity = self._calculate_token_diversity(text)
|
| 380 |
|
| 381 |
-
#
|
| 382 |
-
if (token_diversity <
|
| 383 |
patterns_detected += 1
|
| 384 |
|
| 385 |
# Predictable sequences
|
| 386 |
sequence_unpredictability = self._calculate_sequence_unpredictability(text)
|
| 387 |
|
| 388 |
-
#
|
| 389 |
-
if (sequence_unpredictability <
|
| 390 |
patterns_detected += 1
|
| 391 |
|
| 392 |
# Low entropy variance across chunks
|
| 393 |
-
chunk_entropies = self._calculate_chunk_entropy(text
|
| 394 |
entropy_variance = np.var(chunk_entropies) if chunk_entropies else 0.0
|
| 395 |
|
| 396 |
-
#
|
| 397 |
-
if (entropy_variance <
|
| 398 |
patterns_detected += 1
|
| 399 |
|
| 400 |
return patterns_detected / total_patterns
|
|
@@ -407,120 +414,129 @@ class EntropyMetric(BaseMetric):
|
|
| 407 |
"""
|
| 408 |
# Check feature validity
|
| 409 |
valid_features = [score for score in [features.get('char_entropy', 0),
|
|
|
|
| 410 |
features.get('token_diversity', 0),
|
| 411 |
features.get('sequence_unpredictability', 0),
|
| 412 |
-
features.get('
|
| 413 |
-
] if score >
|
| 414 |
]
|
| 415 |
|
| 416 |
-
if (len(valid_features) <
|
| 417 |
# Low confidence if insufficient features
|
| 418 |
-
return
|
| 419 |
|
| 420 |
-
|
| 421 |
|
| 422 |
-
#
|
| 423 |
-
if (features['char_entropy'] <
|
| 424 |
-
# Strong
|
| 425 |
-
|
| 426 |
|
| 427 |
-
elif (features['char_entropy'] <
|
| 428 |
-
# Moderate
|
| 429 |
-
|
| 430 |
|
| 431 |
else:
|
| 432 |
-
# Weak
|
| 433 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
|
| 440 |
-
elif (features['entropy_variance'] <
|
| 441 |
# Neutral
|
| 442 |
-
|
| 443 |
|
| 444 |
else:
|
| 445 |
-
# Strong
|
| 446 |
-
|
| 447 |
|
| 448 |
-
# Low token diversity suggests
|
| 449 |
-
if (features['token_diversity'] <
|
| 450 |
-
|
| 451 |
|
| 452 |
-
elif (features['token_diversity'] <
|
| 453 |
-
|
| 454 |
|
| 455 |
else:
|
| 456 |
-
|
| 457 |
|
| 458 |
-
# Low sequence unpredictability suggests
|
| 459 |
-
if (features['sequence_unpredictability'] <
|
| 460 |
-
|
| 461 |
|
| 462 |
-
elif (features['sequence_unpredictability'] <
|
| 463 |
-
|
| 464 |
|
| 465 |
else:
|
| 466 |
-
|
| 467 |
|
| 468 |
-
# High
|
| 469 |
-
if (features['
|
| 470 |
-
|
| 471 |
|
| 472 |
-
elif (features['
|
| 473 |
-
|
| 474 |
|
| 475 |
else:
|
| 476 |
-
|
| 477 |
|
| 478 |
# Calculate raw score and confidence
|
| 479 |
-
raw_score = np.mean(
|
| 480 |
-
confidence = 1.0 - (np.std(
|
| 481 |
-
confidence = max(
|
| 482 |
|
| 483 |
return raw_score, confidence
|
| 484 |
|
| 485 |
|
| 486 |
-
def
|
| 487 |
"""
|
| 488 |
-
Calculate probability of
|
| 489 |
"""
|
| 490 |
-
|
| 491 |
|
| 492 |
# High entropy variance suggests mixed content
|
| 493 |
entropy_variance = features.get('entropy_variance', 0)
|
| 494 |
|
| 495 |
-
if (entropy_variance >
|
| 496 |
# Strong mixed indicator
|
| 497 |
-
|
| 498 |
|
| 499 |
-
elif (entropy_variance >
|
| 500 |
-
|
| 501 |
|
| 502 |
else:
|
| 503 |
-
|
| 504 |
|
| 505 |
# Inconsistent patterns across different entropy measures
|
| 506 |
char_entropy = features.get('char_entropy', 0)
|
| 507 |
word_entropy = features.get('word_entropy', 0)
|
| 508 |
|
| 509 |
-
if ((char_entropy >
|
| 510 |
entropy_discrepancy = abs(char_entropy - word_entropy)
|
| 511 |
|
| 512 |
# Large discrepancy suggests mixing
|
| 513 |
-
if (entropy_discrepancy >
|
| 514 |
-
|
| 515 |
|
| 516 |
-
# Moderate
|
| 517 |
-
|
| 518 |
-
if (
|
| 519 |
-
|
| 520 |
|
| 521 |
-
|
| 522 |
|
| 523 |
-
return
|
| 524 |
|
| 525 |
|
| 526 |
def cleanup(self):
|
|
@@ -533,4 +549,4 @@ class EntropyMetric(BaseMetric):
|
|
| 533 |
|
| 534 |
|
| 535 |
# Export
|
| 536 |
-
__all__ = ["EntropyMetric"]
|
|
|
|
  from typing import List
  from loguru import logger
  from collections import Counter
+ from config.enums import Domain
+ from config.schemas import MetricResult
  from metrics.base_metric import BaseMetric
  from models.model_manager import get_model_manager
+ from config.constants import entropy_metric_params
  from config.threshold_config import get_threshold_for_domain

      - Word-level entropy and burstiness
      - Token-level diversity and unpredictability in sequences
      - Entropy distribution across text chunks
+     - Synthetic-specific pattern detection
      """
      def __init__(self):
          super().__init__(name        = "entropy",
                           description = "Token-level diversity and unpredictability in text sequences",
                          )
          self.tokenizer = None
+         self.params    = entropy_metric_params


      def initialize(self) -> bool:

          # Load tokenizer for token-level analysis
          model_manager = get_model_manager()
+         gpt_model     = model_manager.load_model("perplexity_reference_lm")

          if isinstance(gpt_model, tuple):
              self.tokenizer = gpt_model[1]

          Compute enhanced entropy measures for text with FULL DOMAIN THRESHOLD INTEGRATION
          """
          try:
+             if (not text or (len(text.strip()) < self.params.MIN_TEXT_LENGTH_FOR_ANALYSIS)):
+                 return MetricResult(metric_name           = self.name,
+                                     synthetic_probability = self.params.NEUTRAL_PROBABILITY,
+                                     authentic_probability = self.params.NEUTRAL_PROBABILITY,
+                                     hybrid_probability    = self.params.MIN_PROBABILITY,
+                                     confidence            = self.params.MIN_CONFIDENCE,
+                                     error                 = "Text too short for entropy analysis",
+                                    )

              # Get domain-specific thresholds
+             domain             = kwargs.get('domain', Domain.GENERAL)
+             domain_thresholds  = get_threshold_for_domain(domain)
+             entropy_thresholds = domain_thresholds.entropy

              # Calculate comprehensive entropy features
+             features = self._calculate_entropy_features(text = text)

              # Calculate raw entropy score (0-1 scale)
+             raw_entropy_score, confidence = self._analyze_entropy_patterns(features = features)

              # Apply domain-specific thresholds to convert raw score to probabilities
+             synthetic_prob, authentic_prob, hybrid_prob = self._apply_domain_thresholds(raw_score  = raw_entropy_score,
+                                                                                         thresholds = entropy_thresholds,
+                                                                                         features   = features,
+                                                                                        )

              # Apply confidence multiplier from domain thresholds
+             confidence *= entropy_thresholds.confidence_multiplier
+             confidence  = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, confidence))

+             return MetricResult(metric_name           = self.name,
+                                 synthetic_probability = synthetic_prob,
+                                 authentic_probability = authentic_prob,
+                                 hybrid_probability    = hybrid_prob,
+                                 confidence            = confidence,
+                                 details               = {**features,
+                                                          'domain_used'        : domain.value,
+                                                          'synthetic_threshold': entropy_thresholds.synthetic_threshold,
+                                                          'authentic_threshold': entropy_thresholds.authentic_threshold,
+                                                          'raw_score'          : raw_entropy_score,
+                                                         },
+                                )

          except Exception as e:
              logger.error(f"Error in entropy computation: {repr(e)}")
+             return self._default_result(error = str(e))


      def _apply_domain_thresholds(self, raw_score: float, thresholds: Any, features: Dict[str, Any]) -> tuple:
          """
          Apply domain-specific thresholds to convert raw score to probabilities
          """
+         synthetic_threshold = thresholds.synthetic_threshold
+         authentic_threshold = thresholds.authentic_threshold

          # Calculate probabilities based on threshold distances
+         if (raw_score >= synthetic_threshold):
+             # Above synthetic threshold - strongly synthetic
+             distance_from_threshold = raw_score - synthetic_threshold
+             synthetic_prob = self.params.STRONG_SYNTHETIC_BASE_PROB + (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
+             authentic_prob = self.params.UNCERTAIN_AUTHENTIC_RANGE_START - (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)

+         elif (raw_score <= authentic_threshold):
+             # Below authentic threshold - strongly authentic
+             distance_from_threshold = authentic_threshold - raw_score
+             synthetic_prob = self.params.UNCERTAIN_SYNTHETIC_RANGE_START - (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
+             authentic_prob = self.params.STRONG_AUTHENTIC_BASE_PROB + (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)

          else:
              # Between thresholds - uncertain zone
+             range_width = synthetic_threshold - authentic_threshold
+             if (range_width > self.params.ZERO_TOLERANCE):
+                 position_in_range = (raw_score - authentic_threshold) / range_width
+                 synthetic_prob = self.params.UNCERTAIN_SYNTHETIC_RANGE_START + (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)
+                 authentic_prob = self.params.UNCERTAIN_AUTHENTIC_RANGE_START - (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)

              else:
+                 synthetic_prob = self.params.NEUTRAL_PROBABILITY
+                 authentic_prob = self.params.NEUTRAL_PROBABILITY

          # Ensure probabilities are valid
+         synthetic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, synthetic_prob))
+         authentic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, authentic_prob))

+         # Calculate hybrid probability based on entropy variance
+         hybrid_prob = self._calculate_hybrid_probability(features)

          # Normalize to sum to 1.0
+         total = synthetic_prob + authentic_prob + hybrid_prob

+         if (total > self.params.ZERO_TOLERANCE):
+             synthetic_prob /= total
+             authentic_prob /= total
+             hybrid_prob    /= total

+         return synthetic_prob, authentic_prob, hybrid_prob
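
The branching above implements a piecewise-linear mapping from a raw 0-1 score to class probabilities. A minimal sketch of the same idea, with illustrative constants standing in for the `entropy_metric_params` values (which are not shown in this diff):

```python
# Sketch only: these constants are assumptions, not the configured values.
STRONG_BASE    = 0.75   # assumed base probability beyond a threshold
ADJUST         = 0.50   # assumed scaling of distance-from-threshold
UNC_SYN_START  = 0.35   # assumed uncertain-zone start (synthetic side)
UNC_AUTH_START = 0.65   # assumed uncertain-zone start (authentic side)
UNC_WIDTH      = 0.30   # assumed uncertain-zone width

def map_raw_score(raw: float, syn_t: float = 0.70, auth_t: float = 0.40) -> tuple:
    """Map a raw 0-1 score to (synthetic, authentic) before normalization."""
    if raw >= syn_t:                       # strongly synthetic region
        d = raw - syn_t
        syn, auth = STRONG_BASE + d * ADJUST, UNC_AUTH_START - d * ADJUST
    elif raw <= auth_t:                    # strongly authentic region
        d = auth_t - raw
        syn, auth = UNC_SYN_START - d * ADJUST, STRONG_BASE + d * ADJUST
    else:                                  # uncertain zone between thresholds
        pos = (raw - auth_t) / (syn_t - auth_t)
        syn, auth = UNC_SYN_START + pos * UNC_WIDTH, UNC_AUTH_START - pos * UNC_WIDTH
    clamp = lambda p: max(0.01, min(0.99, p))
    return clamp(syn), clamp(auth)
```

Under these assumed constants, `map_raw_score(0.9)` gives roughly `(0.85, 0.55)`; the subsequent normalization step is what brings the three class probabilities back to a proper distribution.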
+     def _calculate_entropy_features(self, text: str) -> Dict[str, Any]:
          """
          Calculate comprehensive entropy measures including document-required features
          """

          sequence_unpredictability = self._calculate_sequence_unpredictability(text)

          # Chunk-based analysis for whole-text understanding
+         chunk_entropies   = self._calculate_chunk_entropy(text)
          entropy_variance  = np.var(chunk_entropies) if chunk_entropies else 0.0
          avg_chunk_entropy = np.mean(chunk_entropies) if chunk_entropies else 0.0

+         # Synthetic-specific pattern detection
+         synthetic_pattern_score = self._detect_synthetic_entropy_patterns(text)

          # Predictability measures
+         predictability = 1.0 - min(1.0, char_entropy / self.params.MAX_CHAR_ENTROPY)

          return {"char_entropy" : round(char_entropy, 4),
                  "word_entropy" : round(word_entropy, 4),

                  "entropy_variance" : round(entropy_variance, 4),
                  "avg_chunk_entropy" : round(avg_chunk_entropy, 4),
                  "predictability_score" : round(predictability, 4),
+                 "synthetic_pattern_score" : round(synthetic_pattern_score, 4),
                  "num_chunks_analyzed" : len(chunk_entropies),
                 }

          for count in char_counts.values():
              probability = count / total_chars
+             if probability > self.params.ZERO_TOLERANCE:
+                 entropy -= probability * math.log2(probability)

          return entropy
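
The character, word, and token loops in this file all compute the same Shannon entropy over an empirical symbol distribution. A self-contained reference version, using only the standard library:

```python
import math
from collections import Counter

def shannon_entropy(symbols) -> float:
    """H(X) = -sum(p * log2(p)) over the empirical symbol distribution."""
    counts = Counter(symbols)
    total  = sum(counts.values())
    return -sum((c / total) * math.log2(c / total) for c in counts.values())

# shannon_entropy("aaaa") -> 0.0 (fully predictable)
# shannon_entropy("abcd") -> 2.0 (four equiprobable symbols)
```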
          Calculate word-level entropy
          """
          words = text.lower().split()
+         if (len(words) < self.params.MIN_WORDS_FOR_ANALYSIS):
              return 0.0

          word_counts = Counter(words)

          for count in word_counts.values():
              probability = count / total_words
+             if probability > self.params.ZERO_TOLERANCE:
+                 entropy -= probability * math.log2(probability)

          return entropy

              return 0.0

          # Length check before tokenization
+         if (len(text.strip()) < self.params.MIN_SENTENCE_LENGTH):
              return 0.0

          # Tokenize text

                                        truncation = True,
                                       )

+         if (len(tokens) < self.params.MIN_TOKENS_FOR_ANALYSIS):
              return 0.0

          token_counts = Counter(tokens)

          for count in token_counts.values():
              probability = count / total_tokens
+             if probability > self.params.ZERO_TOLERANCE:
+                 entropy -= probability * math.log2(probability)

          return entropy

      def _calculate_token_diversity(self, text: str) -> float:
          """
+         Calculate token-level diversity : Higher diversity = more authentic-like
          """
          if not self.tokenizer:
              return 0.0

          try:
              tokens = self.tokenizer.encode(text, add_special_tokens=False)
+             if (len(tokens) < self.params.MIN_TOKENS_FOR_ANALYSIS):
                  return 0.0

              unique_tokens = len(set(tokens))

          try:
              tokens = self.tokenizer.encode(text, add_special_tokens=False)
+             if (len(tokens) < self.params.MIN_TOKENS_FOR_SEQUENCE):
                  return 0.0

              # Calculate bigram unpredictability

              sequence_entropy = 0.0

              for count in bigram_counts.values():
+                 probability = count / total_bigrams
+                 if probability > self.params.ZERO_TOLERANCE:
+                     sequence_entropy -= probability * math.log2(probability)

+             # Normalize to 0-1 scale
+             normalized_entropy = min(1.0, sequence_entropy / self.params.MAX_BIGRAM_ENTROPY)

              return normalized_entropy

              return 0.0

+     def _calculate_chunk_entropy(self, text: str) -> List[float]:
          """
          Calculate entropy distribution across text chunks
          """
+         chunks     = list()
+         words      = text.split()
+         chunk_size = self.params.CHUNK_SIZE_WORDS
+         overlap    = int(chunk_size * self.params.CHUNK_OVERLAP_RATIO)
+         step       = max(1, chunk_size - overlap)

          # Create overlapping chunks for better analysis
+         for i in range(0, len(words), step):
              chunk = ' '.join(words[i:i + chunk_size])

              # Minimum chunk size
+             if (len(chunk) > self.params.MIN_CHUNK_LENGTH):
                  entropy = self._calculate_character_entropy(chunk)
+                 if entropy > self.params.ZERO_TOLERANCE:
+                     chunks.append(entropy)

          return chunks
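
The chunking above slides a fixed word window across the text with partial overlap. A minimal stand-alone equivalent, where the 50-word window and 0.5 overlap are assumed stand-ins for `CHUNK_SIZE_WORDS` and `CHUNK_OVERLAP_RATIO`:

```python
def word_windows(text: str, chunk_size: int = 50, overlap_ratio: float = 0.5) -> list:
    """Overlapping word windows; the step is the window size minus the overlap."""
    words = text.split()
    step  = max(1, chunk_size - int(chunk_size * overlap_ratio))
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
```

The overlap matters here: without it, a boundary between a human-written and a machine-written passage could fall between two windows and dilute the variance signal the hybrid detector relies on.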
+     def _detect_synthetic_entropy_patterns(self, text: str) -> float:
          """
+         Detect synthetic-specific entropy patterns: synthetic text often shows specific entropy signatures
          """
          patterns_detected = 0
          total_patterns    = 4

          # Overly consistent character distribution
          char_entropy = self._calculate_character_entropy(text)

+         # Synthetic text tends to be more consistent
+         if (char_entropy < self.params.CHAR_ENTROPY_LOW_THRESHOLD):
              patterns_detected += 1

          # Low token diversity
          token_diversity = self._calculate_token_diversity(text)

+         # Synthetic text reuses tokens more
+         if (token_diversity < self.params.TOKEN_DIVERSITY_MEDIUM_THRESHOLD):
              patterns_detected += 1

          # Predictable sequences
          sequence_unpredictability = self._calculate_sequence_unpredictability(text)

+         # Synthetic sequences are more predictable
+         if (sequence_unpredictability < self.params.SEQUENCE_UNPREDICTABILITY_MEDIUM_THRESHOLD):
              patterns_detected += 1

          # Low entropy variance across chunks
+         chunk_entropies  = self._calculate_chunk_entropy(text)
          entropy_variance = np.var(chunk_entropies) if chunk_entropies else 0.0

+         # Synthetic text maintains consistent entropy
+         if (entropy_variance < self.params.ENTROPY_VARIANCE_LOW_THRESHOLD):
              patterns_detected += 1

          return patterns_detected / total_patterns
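
The detector above is an equal-weight vote: each heuristic that fires adds one point, and the score is the fraction that fired, so it always lies in [0, 1]. A hypothetical outcome for a short, very uniform passage where two of the four entropy heuristics trip:

```python
checks        = [True, False, True, False]   # hypothetical heuristic results
pattern_score = sum(checks) / len(checks)    # -> 0.5
```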
          """
          # Check feature validity
          valid_features = [score for score in [features.get('char_entropy', 0),
+                                               features.get('token_entropy', 0),
                                                features.get('token_diversity', 0),
                                                features.get('sequence_unpredictability', 0),
+                                               features.get('synthetic_pattern_score', 0)
+                                              ] if score > self.params.ZERO_TOLERANCE
                           ]

+         if (len(valid_features) < self.params.MIN_REQUIRED_FEATURES):
              # Low confidence if insufficient features
+             return self.params.NEUTRAL_PROBABILITY, self.params.LOW_FEATURE_CONFIDENCE

+         synthetic_indicators = list()

+         # Synthetic text often has lower character entropy (more predictable)
+         if (features['char_entropy'] < self.params.CHAR_ENTROPY_VERY_LOW_THRESHOLD):
+             # Strong synthetic indicator
+             synthetic_indicators.append(self.params.VERY_STRONG_SYNTHETIC_WEIGHT)

+         elif (features['char_entropy'] < self.params.CHAR_ENTROPY_LOW_THRESHOLD):
+             # Moderate synthetic indicator
+             synthetic_indicators.append(self.params.MODERATE_SYNTHETIC_WEIGHT)

          else:
+             # Weak synthetic indicator
+             synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)

+         # Low token entropy suggests synthetic (limited vocabulary reuse)
+         if (features['token_entropy'] < self.params.TOKEN_ENTROPY_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.MODERATE_SYNTHETIC_WEIGHT)

+         else:
+             synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)

+         # Low entropy variance suggests synthetic (consistent patterns)
+         if (features['entropy_variance'] < self.params.ENTROPY_VARIANCE_VERY_LOW_THRESHOLD):
+             # Very strong synthetic indicator
+             synthetic_indicators.append(self.params.STRONG_SYNTHETIC_WEIGHT)

+         elif (features['entropy_variance'] < self.params.ENTROPY_VARIANCE_MEDIUM_THRESHOLD):
              # Neutral
+             synthetic_indicators.append(self.params.WEAK_SYNTHETIC_WEIGHT)

          else:
+             # Strong authentic indicator
+             synthetic_indicators.append(self.params.VERY_LOW_SYNTHETIC_WEIGHT)

+         # Low token diversity suggests synthetic
+         if (features['token_diversity'] < self.params.TOKEN_DIVERSITY_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)

+         elif (features['token_diversity'] < self.params.TOKEN_DIVERSITY_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.VERY_WEAK_SYNTHETIC_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)

+         # Low sequence unpredictability suggests synthetic
+         if (features['sequence_unpredictability'] < self.params.SEQUENCE_UNPREDICTABILITY_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.VERY_STRONG_SYNTHETIC_WEIGHT)

+         elif (features['sequence_unpredictability'] < self.params.SEQUENCE_UNPREDICTABILITY_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.WEAK_SYNTHETIC_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)

+         # High synthetic pattern score suggests synthetic
+         if (features['synthetic_pattern_score'] > self.params.SYNTHETIC_PATTERN_SCORE_HIGH_THRESHOLD):
+             synthetic_indicators.append(self.params.STRONG_SYNTHETIC_WEIGHT)

+         elif (features['synthetic_pattern_score'] > self.params.SYNTHETIC_PATTERN_SCORE_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.LOW_SYNTHETIC_WEIGHT)

          # Calculate raw score and confidence
+         raw_score  = np.mean(synthetic_indicators) if synthetic_indicators else self.params.NEUTRAL_PROBABILITY
+         confidence = 1.0 - (np.std(synthetic_indicators) / self.params.CONFIDENCE_STD_NORMALIZER) if synthetic_indicators else self.params.NEUTRAL_CONFIDENCE
+         confidence = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, confidence))

          return raw_score, confidence
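
The aggregation above scores with the mean of the weighted indicators and derives confidence from their agreement: a low standard deviation means the indicators point the same way. A worked sketch, with an assumed normalizer of 0.5 and assumed clamp bounds:

```python
import numpy as np

indicators = [0.8, 0.7, 0.75, 0.2]                   # hypothetical indicator weights
raw_score  = float(np.mean(indicators))              # 0.6125
confidence = 1.0 - float(np.std(indicators)) / 0.5   # std ~0.24 -> confidence ~0.52
confidence = max(0.1, min(0.95, confidence))         # assumed clamp bounds
```

The dissenting 0.2 indicator pulls the confidence down even though the mean still leans synthetic, which is exactly the behavior the normalizer is there to produce.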
+     def _calculate_hybrid_probability(self, features: Dict[str, Any]) -> float:
          """
+         Calculate probability of hybrid synthetic/authentic content with better indicators
          """
+         hybrid_indicators = list()

          # High entropy variance suggests mixed content
          entropy_variance = features.get('entropy_variance', 0)

+         if (entropy_variance > self.params.ENTROPY_VARIANCE_HIGH_THRESHOLD):
              # Strong mixed indicator
+             hybrid_indicators.append(self.params.STRONG_HYBRID_WEIGHT)

+         elif (entropy_variance > self.params.ENTROPY_VARIANCE_MIXED_THRESHOLD):
+             hybrid_indicators.append(self.params.MODERATE_HYBRID_WEIGHT)

          else:
+             hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)

          # Inconsistent patterns across different entropy measures
          char_entropy = features.get('char_entropy', 0)
          word_entropy = features.get('word_entropy', 0)

+         if ((char_entropy > self.params.ZERO_TOLERANCE) and (word_entropy > self.params.ZERO_TOLERANCE)):
              entropy_discrepancy = abs(char_entropy - word_entropy)

              # Large discrepancy suggests mixing
+             if (entropy_discrepancy > self.params.ENTROPY_DISCREPANCY_THRESHOLD):
+                 hybrid_indicators.append(self.params.MODERATE_HYBRID_WEIGHT)

+         # Moderate synthetic pattern score might indicate mixing
+         synthetic_pattern_score = features.get('synthetic_pattern_score', 0)
+         if (self.params.SYNTHETIC_PATTERN_MIXED_MIN <= synthetic_pattern_score <= self.params.SYNTHETIC_PATTERN_MIXED_MAX):
+             hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)

+         hybrid_probability = min(self.params.MAX_HYBRID_PROBABILITY, np.mean(hybrid_indicators)) if hybrid_indicators else 0.0

+         return hybrid_probability

      def cleanup(self):

  # Export
+ __all__ = ["EntropyMetric"]
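
A hypothetical end-to-end call against the refactored metric. The `compute(text=..., domain=...)` signature is inferred from the `kwargs.get('domain', ...)` lookup in the diff above, not shown explicitly, and the snippet assumes the model manager can resolve `perplexity_reference_lm`:

```python
from config.enums import Domain
from metrics.entropy import EntropyMetric

metric = EntropyMetric()
if metric.initialize():                      # loads the reference tokenizer
    result = metric.compute(text   = "Some document text to score...",
                            domain = Domain.GENERAL)
    print(result.synthetic_probability, result.authentic_probability,
          result.hybrid_probability, result.confidence)
```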
metrics/linguistic.py
CHANGED
@@ -7,10 +7,11 @@ from typing import List
  from typing import Tuple
  from loguru import logger
  from collections import Counter
- from config.
  from metrics.base_metric import BaseMetric
- from metrics.base_metric import MetricResult
  from models.model_manager import get_model_manager
  from config.threshold_config import get_threshold_for_domain

@@ -29,6 +30,7 @@ class LinguisticMetric(BaseMetric):
                description = "POS tag diversity, syntactic complexity, and grammatical pattern analysis",
               )
        self.nlp = None

    def initialize(self) -> bool:

@@ -57,104 +59,95 @@ class LinguisticMetric(BaseMetric):
        Compute linguistic analysis with FULL DOMAIN THRESHOLD INTEGRATION
        """
        try:
-           if ((not text) or (len(text.strip()) <
-               return
-                   human_probability = 0.5,
-                   mixed_probability = 0.0,
-                   confidence = 0.1,
-                   error = "Text too short for linguistic analysis",
-               )

            # Get domain-specific thresholds
-           domain
-           domain_thresholds
-           linguistic_thresholds

            # Calculate comprehensive linguistic features
-           features

-           # Calculate raw linguistic score (0-1 scale)
-           raw_linguistic_score, confidence

            # Apply domain-specific thresholds to convert raw score to probabilities

            # Apply confidence multiplier from domain thresholds
-           confidence
-           confidence

-           return MetricResult(metric_name
-               confidence
-               details
            )

        except Exception as e:
            logger.error(f"Error in linguistic computation: {repr(e)}")
-           return
-               ai_probability    = 0.5,
-               human_probability = 0.5,
-               mixed_probability = 0.0,
-               confidence        = 0.0,
-               error             = str(e),
-           )

    def _apply_domain_thresholds(self, raw_score: float, thresholds: Any, features: Dict[str, Any]) -> tuple:
        """
        Apply domain-specific thresholds to convert raw score to probabilities
        """
        # Calculate probabilities based on threshold distances
-       if (raw_score >=
-           distance_from_threshold = raw_score -
-       elif (raw_score <=
-           distance_from_threshold =

        else:
            # Between thresholds - uncertain zone
-           range_width =
-           if (range_width >
-               position_in_range = (raw_score -

        # Ensure probabilities are valid

        # Normalize to sum to 1.0
-       total
-       if (total >
-       return

    def _calculate_linguistic_features(self, text: str) -> Dict[str, Any]:

@@ -170,7 +163,6 @@ class LinguisticMetric(BaseMetric):
        # Extract POS tags and dependencies
        pos_tags = [token.pos_ for token in doc]
-       dependencies = [token.dep_ for token in doc]

        # Calculate POS diversity and patterns
        pos_diversity = self._calculate_pos_diversity(pos_tags = pos_tags)

@@ -185,12 +177,14 @@ class LinguisticMetric(BaseMetric):
        writing_style_score = self._analyze_writing_style(doc = doc)

        # Chunk-based analysis for whole-text understanding

-       # Calculate specific

        return {"pos_diversity" : round(pos_diversity, 4),
                "pos_entropy" : round(pos_entropy, 4),

@@ -200,11 +194,11 @@ class LinguisticMetric(BaseMetric):
                "transition_word_usage" : round(grammatical_patterns['transition_usage'], 4),
                "passive_voice_ratio" : round(grammatical_patterns['passive_ratio'], 4),
                "writing_style_score" : round(writing_style_score, 4),
-               "avg_chunk_complexity" : round(
-               "complexity_variance" : round(
                "num_sentences" : len(list(doc.sents)),
-               "num_chunks_analyzed" :
               }

        except Exception as e:

@@ -230,7 +224,7 @@ class LinguisticMetric(BaseMetric):
        """
        Calculate entropy of POS tag distribution
        """
-       if not pos_tags:
            return 0.0
        pos_counts = Counter(pos_tags)

@@ -239,7 +233,8 @@ class LinguisticMetric(BaseMetric):
        entropy = 0.0
        for count in pos_counts.values():
            probability = count / total_tags
        return entropy

@@ -260,7 +255,8 @@ class LinguisticMetric(BaseMetric):
        if depths:
            avg_depth = np.mean(depths)
            max_depth = np.max(depths)
-           complexity = (avg_depth
            complexities.append(complexity)
        return np.mean(complexities) if complexities else 0.0

@@ -287,11 +283,10 @@ class LinguisticMetric(BaseMetric):
        for sent in doc.sents:
            # Simple complexity measure based on sentence length and structure
            words = [token for token in sent if not token.is_punct]
-           num_clauses = len([token for token in sent if token.dep_ in
            if (len(words) > 0):
-               complexity = (len(words) /
                complexities.append(complexity)
        return np.mean(complexities) if complexities else 0.0

@@ -307,21 +302,19 @@ class LinguisticMetric(BaseMetric):
        transition_words = 0
        total_sentences = 0
-       transition_words_set = {'however', 'therefore', 'moreover', 'furthermore', 'consequently', 'additionally', 'nevertheless', 'nonetheless', 'thus', 'hence'}
        for sent in doc.sents:
            total_sentences += 1
            sent_text = sent.text.lower()
            # Check for passive voice patterns
-           if (any(token.dep_ ==
                passive_voice += 1
            else:
-               active_voice += 1
-           # Count transition words
-           for word in
                if word in sent_text:
                    transition_words += 1
                    break

@@ -331,7 +324,8 @@ class LinguisticMetric(BaseMetric):
        transition_usage = transition_words / total_sentences if total_sentences > 0 else 0.0
        # Calculate consistency (lower variance in patterns)
-       consistency = 1.0 - min(1.0, abs(passive_ratio -
        return {'consistency' : max(0.0, consistency),
                'passive_ratio' : passive_ratio,

@@ -350,24 +344,22 @@ class LinguisticMetric(BaseMetric):
        if sent_lengths:
            length_variation = np.std(sent_lengths) / np.mean(sent_lengths) if np.mean(sent_lengths) > 0 else 0.0
-           # Moderate variation is more
-           style_score = 1.0 - min(1.0, abs(length_variation -
            style_indicators.append(style_score)
        # Punctuation usage
        punct_ratio = len([token for token in doc if token.is_punct]) / len(doc) if len(doc) > 0 else 0.0
-       # Balanced punctuation is more
-       punct_score = 1.0 - min(1.0, abs(punct_ratio -
        style_indicators.append(punct_score)
        return np.mean(style_indicators) if style_indicators else 0.5

-   def
        """
-       Detect
        """
        patterns_detected = 0
        total_patterns = 5

@@ -407,13 +399,12 @@ class LinguisticMetric(BaseMetric):
    def _check_transition_overuse(self, doc) -> bool:
        """
-       Check for overuse of transition words (common
        """
-       transition_count = sum(1 for token in doc if token.lemma_.lower() in transition_words)
-       # More than
-       return transition_count / len(doc) >

    def _check_unnatural_pos_sequences(self, doc) -> bool:

@@ -433,8 +424,8 @@ class LinguisticMetric(BaseMetric):
        sequence_counts = Counter(pos_sequences)
        most_common_freq = max(sequence_counts.values()) / len(pos_sequences) if pos_sequences else 0
-       # High frequency of specific sequences suggests
-       return (most_common_freq >

    def _check_structure_consistency(self, doc) -> bool:

@@ -448,15 +439,15 @@ class LinguisticMetric(BaseMetric):
        structure = tuple(token.dep_ for token in sent if token.dep_ not in ['punct', 'det'])
        sent_structures.append(structure)
-       if (len(sent_structures) <
            return False
        # Calculate structure similarity
        unique_structures = len(set(sent_structures))
        similarity_ratio = unique_structures / len(sent_structures)
-       # Low diversity suggests
-       return (similarity_ratio <

    def _check_unusual_grammar(self, doc) -> bool:

@@ -467,11 +458,11 @@ class LinguisticMetric(BaseMetric):
        for token in doc:
            # Check for unusual dependency relations i.e. less common relations
-           if token.dep_ in
                unusual_constructions += 1
-           # More than
-           return (unusual_constructions / len(doc) >

    def _check_repetitive_phrasing(self, doc) -> bool:

@@ -491,26 +482,29 @@ class LinguisticMetric(BaseMetric):
        phrase_counts = Counter(phrases)
        repeated_phrases = sum(1 for count in phrase_counts.values() if count > 1)
-       # High repetition suggests
-       return (repeated_phrases / len(phrases) >

-   def _calculate_chunk_linguistics(self, text: str
        """
        Calculate linguistic features across text chunks
        """
        complexities = list()
        words = text.split()
            chunk = ' '.join(words[i:i + chunk_size])
-           if (len(chunk) >
                try:
                    chunk_doc = self.nlp(chunk)
                    # Check if processing was successful
-                   if (chunk_doc and (len(list(chunk_doc.sents)) >
                        complexity = self._calculate_syntactic_complexity(chunk_doc)
                        complexities.append(complexity)

@@ -518,141 +512,147 @@ class LinguisticMetric(BaseMetric):
        logger.debug(f"Chunk linguistic analysis failed: {e}")
        continue
-       return

    def _analyze_linguistic_patterns(self, features: Dict[str, Any]) -> tuple:
        """
-       Analyze linguistic patterns to determine RAW linguistic score (0-1 scale) : Higher score = more
        """
        # Check feature validity first
-       required_features = ['pos_diversity', 'syntactic_complexity', 'grammatical_consistency', 'transition_word_usage', '
-       valid_features = [features.get(feat, 0) for feat in required_features if features.get(feat, 0) >
-       if (len(valid_features) <
            # Low confidence if insufficient features
-           return

-       # Low POS diversity suggests
-       if (features['pos_diversity'] <
-       elif (features['pos_diversity'] <
        else:

-       # Low syntactic complexity suggests
-       if (features['syntactic_complexity'] <
-       elif (features['syntactic_complexity'] <
        else:

-       # High grammatical consistency suggests
-       if (features['grammatical_consistency'] >
-       elif (features['grammatical_consistency'] >
        else:

-       # High transition word usage suggests
-       if (features['transition_word_usage'] >
-       elif (features['transition_word_usage'] >
        else:

-       # Low complexity variance suggests
-       if (features['complexity_variance'] <
-       elif (features['complexity_variance'] <
        else:

        # Calculate raw score and confidence
-       raw_score = np.mean(
-       confidence = 1.0 - (np.std(
-       confidence = max(
        return raw_score, confidence

-   def
        """
-       Calculate probability of
        """
        # Moderate POS diversity might indicate mixing
-       if (
        else:

        # High complexity variance suggests mixed content
-       if (features['complexity_variance'] >
-       elif (features['complexity_variance'] >
        else:

-       # Inconsistent
-       if (
        else:

    def _get_default_features(self) -> Dict[str, Any]:
        """
        Return default features when analysis is not possible
        """
-       return {"pos_diversity" :
-               "pos_entropy" :
-               "syntactic_complexity" :
-               "avg_sentence_complexity" :
-               "grammatical_consistency" :
-               "transition_word_usage" :
-               "passive_voice_ratio" :
-               "writing_style_score" :
-               "avg_chunk_complexity" :
-               "complexity_variance" :
                "num_sentences" : 0,
                "num_chunks_analyzed" : 0,
               }
  from typing import Tuple
  from loguru import logger
  from collections import Counter
+ from config.enums import Domain
+ from config.schemas import MetricResult
  from metrics.base_metric import BaseMetric
  from models.model_manager import get_model_manager
+ from config.constants import linguistic_metric_params
  from config.threshold_config import get_threshold_for_domain

                   description = "POS tag diversity, syntactic complexity, and grammatical pattern analysis",
                  )
          self.nlp    = None
+         self.params = linguistic_metric_params

      def initialize(self) -> bool:

          Compute linguistic analysis with FULL DOMAIN THRESHOLD INTEGRATION
          """
          try:
+             if ((not text) or (len(text.strip()) < self.params.MIN_TEXT_LENGTH_FOR_ANALYSIS)):
+                 return self._default_result(error = "Text too short for linguistic analysis")

              # Get domain-specific thresholds
+             domain                = kwargs.get('domain', Domain.GENERAL)
+             domain_thresholds     = get_threshold_for_domain(domain)
+             linguistic_thresholds = domain_thresholds.linguistic

              # Calculate comprehensive linguistic features
+             features = self._calculate_linguistic_features(text = text)

+             # Calculate raw linguistic score (0-1 scale) - higher = more synthetic-like
+             raw_linguistic_score, confidence = self._analyze_linguistic_patterns(features = features)

              # Apply domain-specific thresholds to convert raw score to probabilities
+             synthetic_prob, authentic_prob, hybrid_prob = self._apply_domain_thresholds(raw_score  = raw_linguistic_score,
+                                                                                         thresholds = linguistic_thresholds,
+                                                                                         features   = features,
+                                                                                        )

              # Apply confidence multiplier from domain thresholds
+             confidence *= linguistic_thresholds.confidence_multiplier
+             confidence  = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, confidence))

+             return MetricResult(metric_name           = self.name,
+                                 synthetic_probability = synthetic_prob,
+                                 authentic_probability = authentic_prob,
+                                 hybrid_probability    = hybrid_prob,
+                                 confidence            = confidence,
+                                 details               = {**features,
+                                                          'domain_used'        : domain.value,
+                                                          'synthetic_threshold': linguistic_thresholds.synthetic_threshold,
+                                                          'authentic_threshold': linguistic_thresholds.authentic_threshold,
+                                                          'raw_score'          : raw_linguistic_score,
+                                                         },
+                                )

          except Exception as e:
              logger.error(f"Error in linguistic computation: {repr(e)}")
+             return self._default_result(error = str(e))

      def _apply_domain_thresholds(self, raw_score: float, thresholds: Any, features: Dict[str, Any]) -> tuple:
          """
          Apply domain-specific thresholds to convert raw score to probabilities
          """
+         synthetic_threshold = thresholds.synthetic_threshold
+         authentic_threshold = thresholds.authentic_threshold

          # Calculate probabilities based on threshold distances
+         if (raw_score >= synthetic_threshold):
+             # Above synthetic threshold - strongly synthetic
+             distance_from_threshold = raw_score - synthetic_threshold
+             synthetic_prob = self.params.STRONG_SYNTHETIC_BASE_PROB + (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
+             authentic_prob = self.params.UNCERTAIN_AUTHENTIC_RANGE_START - (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)

+         elif (raw_score <= authentic_threshold):
+             # Below authentic threshold - strongly authentic
+             distance_from_threshold = authentic_threshold - raw_score
+             synthetic_prob = self.params.UNCERTAIN_SYNTHETIC_RANGE_START - (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
+             authentic_prob = self.params.STRONG_AUTHENTIC_BASE_PROB + (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)

          else:
              # Between thresholds - uncertain zone
+             range_width = synthetic_threshold - authentic_threshold
+             if (range_width > self.params.ZERO_TOLERANCE):
+                 position_in_range = (raw_score - authentic_threshold) / range_width
+                 synthetic_prob = self.params.UNCERTAIN_SYNTHETIC_RANGE_START + (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)
+                 authentic_prob = self.params.UNCERTAIN_AUTHENTIC_RANGE_START - (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)

              else:
+                 synthetic_prob = self.params.NEUTRAL_PROBABILITY
+                 authentic_prob = self.params.NEUTRAL_PROBABILITY

          # Ensure probabilities are valid
+         synthetic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, synthetic_prob))
+         authentic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, authentic_prob))

+         # Calculate hybrid probability based on linguistic variance
+         hybrid_prob = self._calculate_hybrid_probability(features)

          # Normalize to sum to 1.0
+         total = synthetic_prob + authentic_prob + hybrid_prob
+         if (total > self.params.ZERO_TOLERANCE):
+             synthetic_prob /= total
+             authentic_prob /= total
+             hybrid_prob    /= total

+         return synthetic_prob, authentic_prob, hybrid_prob
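
As in the entropy metric, the final step rescales the three probabilities to sum to 1.0. A worked example with made-up inputs:

```python
synthetic_prob, authentic_prob, hybrid_prob = 0.72, 0.31, 0.15
total = synthetic_prob + authentic_prob + hybrid_prob    # 1.18
synthetic_prob /= total                                  # ~0.610
authentic_prob /= total                                  # ~0.263
hybrid_prob    /= total                                  # ~0.127
```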
      def _calculate_linguistic_features(self, text: str) -> Dict[str, Any]:

          # Extract POS tags and dependencies
          pos_tags = [token.pos_ for token in doc]

          # Calculate POS diversity and patterns
          pos_diversity = self._calculate_pos_diversity(pos_tags = pos_tags)

          writing_style_score = self._analyze_writing_style(doc = doc)

          # Chunk-based analysis for whole-text understanding
+         chunk_complexities = self._calculate_chunk_linguistics(text = text)

+         avg_chunk_complexity = np.mean(chunk_complexities) if chunk_complexities else 0.0
+         complexity_variance  = np.var(chunk_complexities) if chunk_complexities else 0.0
+         num_chunks           = len(chunk_complexities)

+         # Calculate specific synthetic linguistic patterns
+         synthetic_pattern_score = self._detect_synthetic_linguistic_patterns(doc = doc)

          return {"pos_diversity" : round(pos_diversity, 4),
                  "pos_entropy" : round(pos_entropy, 4),

                  "transition_word_usage" : round(grammatical_patterns['transition_usage'], 4),
                  "passive_voice_ratio" : round(grammatical_patterns['passive_ratio'], 4),
                  "writing_style_score" : round(writing_style_score, 4),
+                 "synthetic_pattern_score" : round(synthetic_pattern_score, 4),
+                 "avg_chunk_complexity" : round(avg_chunk_complexity, 4),
+                 "complexity_variance" : round(complexity_variance, 4),
                  "num_sentences" : len(list(doc.sents)),
+                 "num_chunks_analyzed" : num_chunks,
                 }

          except Exception as e:

          """
          Calculate entropy of POS tag distribution
          """
+         if (not pos_tags) or (len(pos_tags) < self.params.MIN_TAGS_FOR_ENTROPY):
              return 0.0

          pos_counts = Counter(pos_tags)

          entropy = 0.0
          for count in pos_counts.values():
              probability = count / total_tags
+             if probability > self.params.ZERO_TOLERANCE:
+                 entropy -= probability * np.log2(probability)

          return entropy
          if depths:
              avg_depth = np.mean(depths)
              max_depth = np.max(depths)
+             complexity = (avg_depth * self.params.COMPLEXITY_WEIGHT_AVG +
+                           max_depth * self.params.COMPLEXITY_WEIGHT_MAX)
              complexities.append(complexity)

          return np.mean(complexities) if complexities else 0.0

          for sent in doc.sents:
              # Simple complexity measure based on sentence length and structure
              words = [token for token in sent if not token.is_punct]
+             num_clauses = len([token for token in sent if token.dep_ in self.params.CLAUSE_MARKERS])

              if (len(words) > 0):
+                 complexity = (len(words) / self.params.WORDS_PER_COMPLEXITY_UNIT) + (num_clauses * self.params.CLAUSE_COMPLEXITY_FACTOR)
                  complexities.append(complexity)

          return np.mean(complexities) if complexities else 0.0

          transition_words = 0
          total_sentences  = 0

          for sent in doc.sents:
              total_sentences += 1
              sent_text = sent.text.lower()

              # Check for passive voice patterns
+             if (any(token.dep_ == self.params.PASSIVE_DEPENDENCY for token in sent)):
                  passive_voice += 1

              else:
+                 active_voice += 1

+             # Count transition words
+             for word in self.params.TRANSITION_WORDS_SET:
                  if word in sent_text:
                      transition_words += 1
                      break

          transition_usage = transition_words / total_sentences if total_sentences > 0 else 0.0

          # Calculate consistency (lower variance in patterns)
+         consistency = 1.0 - min(1.0, abs(passive_ratio - self.params.IDEAL_PASSIVE_RATIO) +
+                                      abs(transition_usage - self.params.IDEAL_TRANSITION_RATIO))

          return {'consistency' : max(0.0, consistency),
                  'passive_ratio' : passive_ratio,

          if sent_lengths:
              length_variation = np.std(sent_lengths) / np.mean(sent_lengths) if np.mean(sent_lengths) > 0 else 0.0
+             # Moderate variation is more authentic-like
+             style_score = 1.0 - min(1.0, abs(length_variation - self.params.IDEAL_LENGTH_VARIATION))
              style_indicators.append(style_score)

          # Punctuation usage
          punct_ratio = len([token for token in doc if token.is_punct]) / len(doc) if len(doc) > 0 else 0.0
+         # Balanced punctuation is more authentic-like
+         punct_score = 1.0 - min(1.0, abs(punct_ratio - self.params.IDEAL_PUNCTUATION_RATIO))
          style_indicators.append(punct_score)

          return np.mean(style_indicators) if style_indicators else 0.5

+     def _detect_synthetic_linguistic_patterns(self, doc) -> float:
          """
+         Detect synthetic-specific linguistic patterns
          """
          patterns_detected = 0
          total_patterns    = 5

      def _check_transition_overuse(self, doc) -> bool:
          """
+         Check for overuse of transition words (common synthetic pattern)
          """
+         transition_count = sum(1 for token in doc if token.lemma_.lower() in self.params.TRANSITION_WORDS_SET)

+         # More than the threshold share of words being transitions is suspicious
+         return transition_count / len(doc) > self.params.TRANSITION_OVERUSE_THRESHOLD if len(doc) > 0 else False
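
A stand-alone version of the transition-overuse check. The word set mirrors the hardcoded set removed above (it now lives in `TRANSITION_WORDS_SET`), while the 0.05 density threshold is an assumed stand-in for `TRANSITION_OVERUSE_THRESHOLD`:

```python
TRANSITIONS = {'however', 'therefore', 'moreover', 'furthermore', 'consequently',
               'additionally', 'nevertheless', 'nonetheless', 'thus', 'hence'}

def transition_overuse(words, threshold=0.05) -> bool:
    """True when transition words exceed the given share of all words."""
    if not words:
        return False
    density = sum(w.lower() in TRANSITIONS for w in words) / len(words)
    return density > threshold
```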
      def _check_unnatural_pos_sequences(self, doc) -> bool:

          sequence_counts  = Counter(pos_sequences)
          most_common_freq = max(sequence_counts.values()) / len(pos_sequences) if pos_sequences else 0

+         # High frequency of specific sequences suggests synthetic text
+         return (most_common_freq > self.params.POS_SEQUENCE_FREQ_THRESHOLD)

      def _check_structure_consistency(self, doc) -> bool:

              structure = tuple(token.dep_ for token in sent if token.dep_ not in ['punct', 'det'])
              sent_structures.append(structure)

+         if (len(sent_structures) < self.params.MIN_SENTENCES_FOR_STRUCTURE):
              return False

          # Calculate structure similarity
          unique_structures = len(set(sent_structures))
          similarity_ratio  = unique_structures / len(sent_structures)

+         # Low diversity suggests synthetic text
+         return (similarity_ratio < self.params.STRUCTURE_DIVERSITY_THRESHOLD)

      def _check_unusual_grammar(self, doc) -> bool:

          for token in doc:
              # Check for unusual dependency relations i.e. less common relations
+             if token.dep_ in self.params.UNUSUAL_DEPENDENCIES:
                  unusual_constructions += 1

+         # More than the threshold share of unusual constructions is suspicious
+         return (unusual_constructions / len(doc) > self.params.UNUSUAL_CONSTRUCTION_THRESHOLD) if (len(doc) > 0) else False

      def _check_repetitive_phrasing(self, doc) -> bool:

          phrase_counts    = Counter(phrases)
          repeated_phrases = sum(1 for count in phrase_counts.values() if count > 1)

+         # High repetition suggests synthetic text
+         return (repeated_phrases / len(phrases) > self.params.REPETITIVE_PHRASING_THRESHOLD)

+     def _calculate_chunk_linguistics(self, text: str) -> List[float]:
          """
          Calculate linguistic features across text chunks
          """
          complexities = list()
          words        = text.split()
+         chunk_size   = self.params.CHUNK_SIZE_WORDS
+         overlap      = int(chunk_size * self.params.CHUNK_OVERLAP_RATIO)
+         step         = max(1, chunk_size - overlap)

+         for i in range(0, len(words), step):
              chunk = ' '.join(words[i:i + chunk_size])

+             if (len(chunk) > self.params.MIN_CHUNK_LENGTH):
                  try:
                      chunk_doc = self.nlp(chunk)

                      # Check if processing was successful
+                     if (chunk_doc and (len(list(chunk_doc.sents)) > self.params.MIN_SENTENCES_FOR_ANALYSIS)):
                          complexity = self._calculate_syntactic_complexity(chunk_doc)
                          complexities.append(complexity)

                  logger.debug(f"Chunk linguistic analysis failed: {e}")
                  continue

+         return complexities
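
The syntactic complexity these chunks feed on is based on dependency-tree depth. A sketch of the depth walk over spaCy tokens; the 0.7/0.3 blend of average and maximum depth is an assumed weighting standing in for `COMPLEXITY_WEIGHT_AVG` / `COMPLEXITY_WEIGHT_MAX`:

```python
def token_depth(token) -> int:
    """Hops from a spaCy token up to its sentence root (roots head themselves)."""
    depth = 0
    while token.head is not token:
        token = token.head
        depth += 1
    return depth

def sentence_complexity(sent) -> float:
    depths = [token_depth(t) for t in sent]
    return (0.7 * (sum(depths) / len(depths)) + 0.3 * max(depths)) if depths else 0.0
```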
|
| 516 |
|
| 517 |
|
| 518 |
      def _analyze_linguistic_patterns(self, features: Dict[str, Any]) -> tuple:
          """
+         Analyze linguistic patterns to determine RAW linguistic score (0-1 scale) : Higher score = more synthetic-like
          """
          # Check feature validity first
+         required_features = ['pos_diversity', 'pos_entropy', 'syntactic_complexity', 'grammatical_consistency', 'transition_word_usage', 'synthetic_pattern_score', 'complexity_variance']

+         valid_features = [features.get(feat, 0) for feat in required_features if features.get(feat, 0) > self.params.ZERO_TOLERANCE]

+         if (len(valid_features) < self.params.MIN_REQUIRED_FEATURES):
              # Low confidence if insufficient features
+             return self.params.NEUTRAL_PROBABILITY, self.params.LOW_FEATURE_CONFIDENCE

+         # Initialize synthetic_indicators list
+         synthetic_indicators = list()

+         # Low POS diversity suggests synthetic
+         if (features['pos_diversity'] < self.params.POS_DIVERSITY_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.STRONG_SYNTHETIC_WEIGHT)

+         elif (features['pos_diversity'] < self.params.POS_DIVERSITY_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.MODERATE_SYNTHETIC_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)

+         # Low POS entropy suggests templated / synthetic language
+         if (features['pos_entropy'] < self.params.POS_ENTROPY_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.MODERATE_SYNTHETIC_WEIGHT)

+         # Low syntactic complexity suggests synthetic
+         if (features['syntactic_complexity'] < self.params.SYNTACTIC_COMPLEXITY_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)

+         elif (features['syntactic_complexity'] < self.params.SYNTACTIC_COMPLEXITY_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.WEAK_SYNTHETIC_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.VERY_LOW_SYNTHETIC_WEIGHT)

+         # High grammatical consistency suggests synthetic (unnaturally consistent)
+         if (features['grammatical_consistency'] > self.params.GRAMMATICAL_CONSISTENCY_HIGH_THRESHOLD):
+             synthetic_indicators.append(self.params.STRONG_SYNTHETIC_WEIGHT)

+         elif (features['grammatical_consistency'] > self.params.GRAMMATICAL_CONSISTENCY_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.MODERATE_SYNTHETIC_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.LOW_SYNTHETIC_WEIGHT)

+         # High transition word usage suggests synthetic
+         if (features['transition_word_usage'] > self.params.TRANSITION_USAGE_HIGH_THRESHOLD):
+             synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)

+         elif (features['transition_word_usage'] > self.params.TRANSITION_USAGE_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.WEAK_SYNTHETIC_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.VERY_LOW_SYNTHETIC_WEIGHT)

+         # High synthetic pattern score suggests synthetic
+         if (features['synthetic_pattern_score'] > self.params.SYNTHETIC_PATTERN_HIGH_THRESHOLD):
+             synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)

+         elif (features['synthetic_pattern_score'] > self.params.SYNTHETIC_PATTERN_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.MODERATE_SYNTHETIC_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)

+         # Low complexity variance suggests synthetic
+         if (features['complexity_variance'] < self.params.COMPLEXITY_VARIANCE_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)

+         elif (features['complexity_variance'] < self.params.COMPLEXITY_VARIANCE_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.WEAK_SYNTHETIC_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.VERY_LOW_SYNTHETIC_WEIGHT)

          # Calculate raw score and confidence
+         raw_score  = np.mean(synthetic_indicators) if synthetic_indicators else self.params.NEUTRAL_PROBABILITY
+         confidence = 1.0 - (np.std(synthetic_indicators) / self.params.CONFIDENCE_STD_NORMALIZER) if synthetic_indicators else self.params.NEUTRAL_CONFIDENCE
+         confidence = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, confidence))

          return raw_score, confidence

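The pattern analysis above reduces each feature to a graded "synthetic weight", averages the weights into a raw score, and derives confidence from how much the indicators agree. A self-contained sketch of that aggregation, with made-up weights standing in for the `*_SYNTHETIC_WEIGHT` constants:

```python
import numpy as np

# Hypothetical graded weights, one per feature check; the real values
# come from self.params (e.g. STRONG_SYNTHETIC_WEIGHT).
synthetic_indicators = [0.8, 0.6, 0.6, 0.7]

raw_score  = float(np.mean(synthetic_indicators))             # 0-1, higher = more synthetic-like
confidence = 1.0 - float(np.std(synthetic_indicators)) / 0.5  # 0.5 stands in for CONFIDENCE_STD_NORMALIZER
confidence = max(0.1, min(0.95, confidence))                  # clamp, like MIN_/MAX_CONFIDENCE

print(round(raw_score, 3), round(confidence, 3))  # 0.675 0.834 - agreeing indicators give high confidence
```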
+     def _calculate_hybrid_probability(self, features: Dict[str, Any]) -> float:
          """
+         Calculate probability of hybrid synthetic/authentic content
          """
+         hybrid_indicators = list()

          # Moderate POS diversity might indicate mixing
+         if (self.params.POS_DIVERSITY_MIXED_MIN <= features['pos_diversity'] <= self.params.POS_DIVERSITY_MIXED_MAX):
+             hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)

          else:
+             hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)

          # High complexity variance suggests mixed content
+         if (features['complexity_variance'] > self.params.COMPLEXITY_VARIANCE_HIGH_THRESHOLD):
+             hybrid_indicators.append(self.params.MODERATE_HYBRID_WEIGHT)

+         elif (features['complexity_variance'] > self.params.COMPLEXITY_VARIANCE_MEDIUM_THRESHOLD):
+             hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)

          else:
+             hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)

+         # Inconsistent synthetic pattern detection
+         if (self.params.SYNTHETIC_PATTERN_MIXED_MIN <= features['synthetic_pattern_score'] <= self.params.SYNTHETIC_PATTERN_MIXED_MAX):
+             hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)

          else:
+             hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)

+         hybrid_prob = np.mean(hybrid_indicators) if hybrid_indicators else 0.0
+         return min(self.params.MAX_HYBRID_PROBABILITY, hybrid_prob)


      def _get_default_features(self) -> Dict[str, Any]:
          """
          Return default features when analysis is not possible
          """
+         return {"pos_diversity"           : self.params.DEFAULT_POS_DIVERSITY,
+                 "pos_entropy"             : self.params.DEFAULT_POS_ENTROPY,
+                 "syntactic_complexity"    : self.params.DEFAULT_SYNTACTIC_COMPLEXITY,
+                 "avg_sentence_complexity" : self.params.DEFAULT_SENTENCE_COMPLEXITY,
+                 "grammatical_consistency" : self.params.DEFAULT_GRAMMATICAL_CONSISTENCY,
+                 "transition_word_usage"   : self.params.DEFAULT_TRANSITION_USAGE,
+                 "passive_voice_ratio"     : self.params.DEFAULT_PASSIVE_RATIO,
+                 "writing_style_score"     : self.params.DEFAULT_WRITING_STYLE_SCORE,
+                 "synthetic_pattern_score" : self.params.DEFAULT_SYNTHETIC_PATTERN_SCORE,
+                 "avg_chunk_complexity"    : self.params.DEFAULT_CHUNK_COMPLEXITY,
+                 "complexity_variance"     : self.params.DEFAULT_COMPLEXITY_VARIANCE,
                  "num_sentences"           : 0,
                  "num_chunks_analyzed"     : 0,
                  }
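`_calculate_hybrid_probability` above looks for signs of mixed authorship: mid-band feature values and high variance each earn a small hybrid weight. A toy version of the band test; the bounds are assumptions, not the configured `POS_DIVERSITY_MIXED_MIN/MAX`:

```python
import numpy as np

def hybrid_indicator(value, lo, hi, weak=0.3, minimal=0.05):
    # Mid-band values are weak evidence of mixed authorship;
    # values outside the band contribute almost nothing.
    return weak if lo <= value <= hi else minimal

indicators = [
    hybrid_indicator(0.55, 0.4, 0.7),   # pos_diversity inside the mixed band
    hybrid_indicator(0.12, 0.4, 0.7),   # a second feature outside the band
]
hybrid_prob = min(0.4, float(np.mean(indicators)))  # 0.4 stands in for MAX_HYBRID_PROBABILITY
print(hybrid_prob)  # 0.175
```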
metrics/multi_perturbation_stability.py
CHANGED
@@ -1,25 +1,23 @@
  # DEPENDENCIES
- import re
  import torch
  import numpy as np
  from typing import Any
  from typing import Dict
  from typing import List
  from loguru import logger
- from
- from config.
+ from config.enums import Domain
+ from config.schemas import MetricResult
  from metrics.base_metric import BaseMetric
- from metrics.base_metric import MetricResult
  from models.model_manager import get_model_manager
  from config.threshold_config import get_threshold_for_domain
+ from config.constants import multi_perturbation_stability_metric_params


  class MultiPerturbationStabilityMetric(BaseMetric):
      """
      Multi-Perturbation Stability Metric (MPSM)

-     A hybrid approach for combining multiple perturbation techniques for robust
+     A hybrid approach combining multiple perturbation techniques for robust detection of synthetic text

      Measures:
      - Text stability under random perturbations

@@ -27,7 +25,7 @@ class MultiPerturbationStabilityMetric(BaseMetric):
      - Masked token prediction analysis

      Perturbation Methods:
-     - Word
+     - Word deletion & swapping
      - RoBERTa mask filling
      - Synonym replacement
      - Chunk-based stability analysis

@@ -42,6 +40,7 @@ class MultiPerturbationStabilityMetric(BaseMetric):
          self.mask_model     = None
          self.mask_tokenizer = None
          self.device         = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
+         self.params         = multi_perturbation_stability_metric_params


      def initialize(self) -> bool:

@@ -145,14 +144,8 @@ class MultiPerturbationStabilityMetric(BaseMetric):
          Compute MultiPerturbationStability analysis with FULL DOMAIN THRESHOLD INTEGRATION
          """
          try:
-             if ((not text) or (len(text.strip()) <
-                 return
-                     ai_probability    = 0.5,
-                     human_probability = 0.5,
-                     mixed_probability = 0.0,
-                     confidence        = 0.1,
-                     error             = "Text too short for MultiPerturbationStability analysis",
-                     )
+             if ((not text) or (len(text.strip()) < self.params.MIN_TEXT_LENGTH_FOR_ANALYSIS)):
+                 return self._default_result(error = "Text too short for MultiPerturbationStability analysis")

              # Get domain-specific thresholds
              domain = kwargs.get('domain', Domain.GENERAL)

@@ -162,105 +155,91 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              # Check if we should run this computationally expensive metric
              if (kwargs.get('skip_expensive', False)):
                  logger.info("Skipping MultiPerturbationStability due to computational constraints")
-                 return MetricResult(metric_name       = self.name,
-                                     ai_probability    = 0.5,
-                                     human_probability = 0.5,
-                                     mixed_probability = 0.0,
-                                     confidence        = 0.3,
-                                     error             = "Skipped for performance",
-                                     )
+                 return self._default_result(error = "Skipped for performance")

              # Calculate MultiPerturbationStability features
-             features
+             features = self._calculate_stability_features(text = text)

              # Calculate raw MultiPerturbationStability score (0-1 scale)
-             raw_stability_score, confidence
+             raw_stability_score, confidence = self._analyze_stability_patterns(features = features)

              # Apply domain-specific thresholds to convert raw score to probabilities
+             synthetic_prob, authentic_prob, hybrid_prob = self._apply_domain_thresholds(raw_score  = raw_stability_score,
+                                                                                         thresholds = multi_perturbation_stability_thresholds,
+                                                                                         features   = features,
+                                                                                         )

              # Apply confidence multiplier from domain thresholds
-             confidence
-             confidence
+             confidence *= multi_perturbation_stability_thresholds.confidence_multiplier
+             confidence  = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, confidence))

-             return MetricResult(metric_name
-                                 confidence
-                                 details
-                                 )
+             return MetricResult(metric_name           = self.name,
+                                 synthetic_probability = synthetic_prob,
+                                 authentic_probability = authentic_prob,
+                                 hybrid_probability    = hybrid_prob,
+                                 confidence            = confidence,
+                                 details               = {**features,
+                                                          'domain_used'         : domain.value,
+                                                          'synthetic_threshold' : multi_perturbation_stability_thresholds.synthetic_threshold,
+                                                          'authentic_threshold' : multi_perturbation_stability_thresholds.authentic_threshold,
+                                                          'raw_score'           : raw_stability_score,
+                                                          },
+                                 )

          except Exception as e:
              logger.error(f"Error in MultiPerturbationStability computation: {repr(e)}")
-             return MetricResult(metric_name       = self.name,
-                                 ai_probability    = 0.5,
-                                 human_probability = 0.5,
-                                 mixed_probability = 0.0,
-                                 confidence        = 0.0,
-                                 error             = str(e),
-                                 )
+             return self._default_result(error = str(e))


      def _apply_domain_thresholds(self, raw_score: float, thresholds: Any, features: Dict[str, Any]) -> tuple:
          """
          Apply domain-specific thresholds to convert raw score to probabilities
          """
+         synthetic_threshold = thresholds.synthetic_threshold
+         authentic_threshold = thresholds.authentic_threshold

          # Calculate probabilities based on threshold distances
-         if (raw_score >=
-             # Above
-             distance_from_threshold = raw_score -
+         if (raw_score >= synthetic_threshold):
+             # Above synthetic threshold - strongly synthetic
+             distance_from_threshold = raw_score - synthetic_threshold
+             synthetic_prob          = self.params.STRONG_SYNTHETIC_BASE_PROB + (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
+             authentic_prob          = self.params.UNCERTAIN_AUTHENTIC_RANGE_START - (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)

-         elif (raw_score <=
-             # Below
-             distance_from_threshold =
+         elif (raw_score <= authentic_threshold):
+             # Below authentic threshold - strongly authentic
+             distance_from_threshold = authentic_threshold - raw_score
+             synthetic_prob          = self.params.UNCERTAIN_SYNTHETIC_RANGE_START - (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
+             authentic_prob          = self.params.STRONG_AUTHENTIC_BASE_PROB + (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)

          else:
              # Between thresholds - uncertain zone
-             range_width
+             range_width = synthetic_threshold - authentic_threshold

-             if (range_width >
-                 position_in_range = (raw_score -
+             if (range_width > self.params.ZERO_TOLERANCE):
+                 position_in_range = (raw_score - authentic_threshold) / range_width
+                 synthetic_prob    = self.params.UNCERTAIN_SYNTHETIC_RANGE_START + (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)
+                 authentic_prob    = self.params.UNCERTAIN_AUTHENTIC_RANGE_START - (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)

              else:
+                 synthetic_prob = self.params.NEUTRAL_PROBABILITY
+                 authentic_prob = self.params.NEUTRAL_PROBABILITY

          # Ensure probabilities are valid
+         synthetic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, synthetic_prob))
+         authentic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, authentic_prob))

-         # Calculate
+         # Calculate hybrid probability based on stability variance
+         hybrid_prob = self._calculate_hybrid_probability(features)

          # Normalize to sum to 1.0
-         total
+         total = synthetic_prob + authentic_prob + hybrid_prob

-         if (total >
+         if (total > self.params.ZERO_TOLERANCE):
+             synthetic_prob /= total
+             authentic_prob /= total
+             hybrid_prob    /= total

-         return
+         return synthetic_prob, authentic_prob, hybrid_prob

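The `_apply_domain_thresholds` logic in the hunk above maps a raw 0-1 score onto (synthetic, authentic, hybrid) probabilities, interpolating linearly inside the uncertain zone between the two domain thresholds. A standalone sketch with illustrative thresholds and offsets in place of the params constants:

```python
def apply_thresholds(raw_score, synthetic_threshold=0.65, authentic_threshold=0.35):
    # Illustrative constants; the real values come from
    # get_threshold_for_domain() and self.params.
    if raw_score >= synthetic_threshold:
        d = raw_score - synthetic_threshold
        synthetic, authentic = 0.7 + 0.5 * d, 0.3 - 0.5 * d
    elif raw_score <= authentic_threshold:
        d = authentic_threshold - raw_score
        synthetic, authentic = 0.3 - 0.5 * d, 0.7 + 0.5 * d
    else:
        # Linear interpolation across the uncertain zone
        pos = (raw_score - authentic_threshold) / (synthetic_threshold - authentic_threshold)
        synthetic, authentic = 0.3 + 0.4 * pos, 0.7 - 0.4 * pos
    hybrid = 0.1  # stand-in for _calculate_hybrid_probability(features)
    total = synthetic + authentic + hybrid
    return synthetic / total, authentic / total, hybrid / total

print(apply_thresholds(0.5))   # roughly equal synthetic/authentic mass
print(apply_thresholds(0.9))   # dominated by the synthetic probability
```

Normalizing by the total keeps the three probabilities summing to 1.0 even after the hybrid mass is added.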
      def _calculate_stability_features(self, text: str) -> Dict[str, Any]:

@@ -279,9 +258,7 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              logger.debug(f"Original likelihood: {original_likelihood:.4f}")

              # Generate perturbations and calculate perturbed likelihoods
-             perturbations = self._generate_perturbations(text
-                                                          num_perturbations = 10,
-                                                          )
+             perturbations = self._generate_perturbations(text = processed_text)
              logger.debug(f"Generated {len(perturbations)} perturbations")

              perturbed_likelihoods = list()

@@ -290,14 +267,14 @@ class MultiPerturbationStabilityMetric(BaseMetric):
                  if (perturbed_text and (perturbed_text != processed_text)):
                      likelihood = self._calculate_likelihood(text = perturbed_text)

-                     if (likelihood >
+                     if (likelihood > self.params.ZERO_TOLERANCE):
                          perturbed_likelihoods.append(likelihood)
                          logger.debug(f"Perturbation {idx}: likelihood={likelihood:.4f}")

              logger.info(f"Valid perturbations: {len(perturbed_likelihoods)}/{len(perturbations)}")

              # Calculate stability metrics
-             if perturbed_likelihoods:
+             if perturbed_likelihoods and (len(perturbed_likelihoods) >= self.params.MIN_VALID_PERTURBATIONS):
                  stability_score = self._calculate_stability_score(original_likelihood   = original_likelihood,
                                                                    perturbed_likelihoods = perturbed_likelihoods,
                                                                    )

@@ -313,27 +290,24 @@ class MultiPerturbationStabilityMetric(BaseMetric):

              else:
                  # Use meaningful defaults when perturbations fail
-                 stability_score =
-                 curvature_score =
-                 variance_score  =
+                 stability_score = self.params.DEFAULT_STABILITY_SCORE       # Assume more authentic-like when no perturbations work
+                 curvature_score = self.params.DEFAULT_CURVATURE_SCORE
+                 variance_score  = self.params.DEFAULT_PERTURBATION_VARIANCE
                  avg_perturbed_likelihood = original_likelihood * 0.9        # Assume some drop
                  logger.warning("No valid perturbations, using fallback values")

              # Calculate likelihood ratio
-             likelihood_ratio = original_likelihood / avg_perturbed_likelihood if avg_perturbed_likelihood >
+             likelihood_ratio = original_likelihood / avg_perturbed_likelihood if avg_perturbed_likelihood > self.params.ZERO_TOLERANCE else 1.0

              # Chunk-based analysis for whole-text understanding
-             chunk_stabilities   = self._calculate_chunk_stability(text
-             stability_variance  = np.var(chunk_stabilities) if chunk_stabilities else 0.1
+             chunk_stabilities   = self._calculate_chunk_stability(text = processed_text)
+             stability_variance  = np.var(chunk_stabilities) if chunk_stabilities else self.params.DEFAULT_STABILITY_VARIANCE
              avg_chunk_stability = np.mean(chunk_stabilities) if chunk_stabilities else stability_score

              # Better normalization to prevent extreme values
              normalized_stability = min(1.0, max(0.0, stability_score))
              normalized_curvature = min(1.0, max(0.0, curvature_score))
-             normalized_likelihood_ratio = min(
+             normalized_likelihood_ratio = min(self.params.MAX_LIKELIHOOD_RATIO, max(self.params.MIN_LIKELIHOOD_RATIO, likelihood_ratio)) / self.params.MAX_LIKELIHOOD_RATIO

              return {"original_likelihood"      : round(original_likelihood, 4),
                      "avg_perturbed_likelihood" : round(avg_perturbed_likelihood, 4),

@@ -361,12 +335,13 @@ class MultiPerturbationStabilityMetric(BaseMetric):
          """
          try:
              # Check text length before tokenization
-             if (len(text.strip()) <
+             if (len(text.strip()) < self.params.MIN_TEXT_LENGTH_FOR_PERTURBATION):
+                 # Return reasonable baseline
+                 return self.params.DEFAULT_LIKELIHOOD

              if not self.gpt_model or not self.gpt_tokenizer:
                  logger.warning("GPT model not available for likelihood calculation")
-                 return
+                 return self.params.DEFAULT_LIKELIHOOD

              # Ensure tokenizer has pad token
              if self.gpt_tokenizer.pad_token is None:

@@ -376,7 +351,7 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              encodings = self.gpt_tokenizer(text,
                                             return_tensors        = 'pt',
                                             truncation            = True,
-                                            max_length            =
+                                            max_length            = self.params.MAX_TOKEN_LENGTH,
                                             padding               = True,
                                             return_attention_mask = True,
                                             )

@@ -385,8 +360,8 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              attention_mask = encodings.attention_mask.to(self.device)

              # Minimum tokens for meaningful analysis
-             if ((input_ids.numel() == 0) or (input_ids.size(1) <
-                 return
+             if ((input_ids.numel() == 0) or (input_ids.size(1) < self.params.MIN_TOKENS_FOR_LIKELIHOOD)):
+                 return self.params.DEFAULT_LIKELIHOOD

              # Calculate proper log-likelihood using token probabilities
              with torch.no_grad():

@@ -419,18 +394,17 @@ class MultiPerturbationStabilityMetric(BaseMetric):
                  avg_log_likelihood = 0.0

              # Convert to positive scale and normalize
-             # Higher normalized value = more likely text
-             normalized_likelihood = max(0.5, min(10.0, -avg_log_likelihood))
+             normalized_likelihood = max(self.params.MIN_LIKELIHOOD, min(self.params.MAX_LIKELIHOOD, -avg_log_likelihood))

              return normalized_likelihood

          except Exception as e:
              logger.warning(f"Likelihood calculation failed: {repr(e)}")
+             # Return reasonable baseline on error
+             return self.params.DEFAULT_LIKELIHOOD


-     def _generate_perturbations(self, text: str
+     def _generate_perturbations(self, text: str) -> List[str]:
          """
          Generate perturbed versions of the text using multiple techniques:
          1. Word deletion (simple but effective)

@@ -439,21 +413,22 @@ class MultiPerturbationStabilityMetric(BaseMetric):
          4. Synonym replacement (fallback)
          """
          perturbations     = list()
+         num_perturbations = self.params.NUM_PERTURBATIONS

          try:
              # Pre-process text for perturbation
              processed_text = self._preprocess_text_for_perturbation(text)
              words          = processed_text.split()

-             if (len(words) <
+             if (len(words) < self.params.MIN_WORDS_FOR_PERTURBATION):
                  return [processed_text]

              # Method 1: Simple word deletion (most reliable)
-             if (len(words) >
+             if (len(words) > self.params.MIN_WORDS_FOR_DELETION):
                  for _ in range(min(3, num_perturbations)):
                      try:
-                         # Delete random words
-                         delete_count = max(1, len(words)
+                         # Delete random words
+                         delete_count    = max(1, int(len(words) * self.params.PERTURBATION_DELETION_RATIO))
                          indices_to_keep = np.random.choice(len(words), len(words) - delete_count, replace = False)

                          perturbed_words = [words[i] for i in sorted(indices_to_keep)]

@@ -490,9 +465,8 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              if (self.mask_model and self.mask_tokenizer and (len(words) > 4) and len(perturbations) < num_perturbations):

                  try:
-                     roberta_perturbations = self._generate_roberta_masked_perturbations(text
-                                                                                         words
-                                                                                         max_perturbations = num_perturbations - len(perturbations),
+                     roberta_perturbations = self._generate_roberta_masked_perturbations(text  = processed_text,
+                                                                                         words = words,
                                                                                          )
                      perturbations.extend(roberta_perturbations)

@@ -502,10 +476,7 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              # Method 4: Synonym replacement as fallback
              if (len(perturbations) < num_perturbations):
                  try:
-                     synonym_perturbations = self._generate_synonym_perturbations(text
-                                                                                  words = words,
-                                                                                  max_perturbations = num_perturbations - len(perturbations),
-                                                                                  )
+                     synonym_perturbations = self._generate_synonym_perturbations(text = processed_text, words = words)
                      perturbations.extend(synonym_perturbations)

                  except Exception as e:

@@ -533,12 +504,13 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              return [text]    # Return at least the original text as fallback


-     def _generate_roberta_masked_perturbations(self, text: str, words: List[str]
+     def _generate_roberta_masked_perturbations(self, text: str, words: List[str]) -> List[str]:
          """
          Generate perturbations using DistilRoBERTa mask filling
-         This is inspired by DetectGPT but uses a lighter model (DistilRoBERTa instead of T5)
+         - This is inspired by DetectGPT but uses a lighter model (DistilRoBERTa instead of T5)
          """
-         perturbations
+         perturbations     = list()
+         max_perturbations = min(self.params.MAX_PERTURBATION_ATTEMPTS, self.params.NUM_PERTURBATIONS - len(perturbations))

          try:
              # Use the proper DistilRoBERTa mask token from tokenizer

@@ -546,13 +518,14 @@ class MultiPerturbationStabilityMetric(BaseMetric):
                  roberta_mask_token = self.mask_tokenizer.mask_token

              else:
+                 # Fallback
+                 roberta_mask_token = "<mask>"

              # Select words to mask (avoid very short words and punctuation)
-             candidate_positions = [i for i, word in enumerate(words) if (len(word) > 3) and word.isalpha() and word.lower() not in
+             candidate_positions = [i for i, word in enumerate(words) if (len(word) > 3) and word.isalpha() and word.lower() not in self.params.COMMON_WORDS_TO_AVOID]

              if not candidate_positions:
-                 candidate_positions = [i for i, word in enumerate(words) if len(word) > 2]
+                 candidate_positions = [i for i, word in enumerate(words) if (len(word) > 2)]

              if not candidate_positions:
                  return perturbations

@@ -577,15 +550,15 @@ class MultiPerturbationStabilityMetric(BaseMetric):
                      masked_text += '.'

                  # Tokenize with DistilRoBERTa-specific settings
-                 inputs
+                 inputs = self.mask_tokenizer(masked_text,
+                                              return_tensors = "pt",
+                                              truncation     = True,
+                                              max_length     = min(self.params.MAX_ROBERTA_TOKEN_LENGTH, self.mask_tokenizer.model_max_length),
+                                              padding        = True,
+                                              )

                  # Move to appropriate device
-                 inputs
+                 inputs = {k: v.to(self.device) for k, v in inputs.items()}

                  # Get model predictions
                  with torch.no_grad():

@@ -602,7 +575,7 @@ class MultiPerturbationStabilityMetric(BaseMetric):

                  # Get top prediction
                  probs = torch.nn.functional.softmax(predictions[0, mask_token_index], dim = -1)
-                 top_tokens = torch.topk(probs,
+                 top_tokens = torch.topk(probs, self.params.ROBBERTA_TOP_K_PREDICTIONS, dim = -1)

                  for token_id in top_tokens.indices:
                      predicted_token = self.mask_tokenizer.decode(token_id).strip()

@@ -631,11 +604,12 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              return perturbations


-     def _generate_synonym_perturbations(self, text: str, words: List[str]
+     def _generate_synonym_perturbations(self, text: str, words: List[str]) -> List[str]:
          """
          Simple synonym replacement as fallback
          """
-         perturbations
+         perturbations     = list()
+         max_perturbations = self.params.NUM_PERTURBATIONS - len(perturbations)

          try:
              # Simple manual synonym dictionary for common words

@@ -653,7 +627,10 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              if not replaceable_positions:
                  return perturbations

-             positions_to_try = np.random.choice(replaceable_positions,
+             positions_to_try = np.random.choice(replaceable_positions,
+                                                 min(max_perturbations, len(replaceable_positions)),
+                                                 replace = False,
+                                                 )

              for pos in positions_to_try:
                  original_word = words[pos].lower()

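All four generation methods above feed the same downstream test: re-score each perturbed variant and measure the relative likelihood drop against the original. A minimal sketch of the word-deletion perturbation plus that comparison, with a dummy scorer standing in for the GPT-2 likelihood:

```python
import numpy as np

rng = np.random.default_rng(0)

def score(text: str) -> float:
    # Dummy stand-in for _calculate_likelihood(); the real metric scores
    # text with a GPT-2 average log-likelihood.
    return 1.0 + 0.1 * len(set(text.split()))

def delete_words(text: str, ratio: float = 0.15) -> str:
    # Drop a small random fraction of words, as in Method 1 above.
    words = text.split()
    k = max(1, int(len(words) * ratio))
    keep = sorted(rng.choice(len(words), len(words) - k, replace=False))
    return ' '.join(words[i] for i in keep)

original = "the quick brown fox jumps over the lazy dog near the river bank"
orig_ll  = score(original)

drops = []
for _ in range(5):
    pert_ll = score(delete_words(original))
    drops.append(max(0.0, (orig_ll - pert_ll) / orig_ll))  # relative likelihood drop

# Larger average drops -> the text sits on a sharper likelihood peak -> more synthetic-like.
print(round(float(np.mean(drops)), 3))
```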
@@ -702,17 +679,17 @@ class MultiPerturbationStabilityMetric(BaseMetric):

      def _calculate_stability_score(self, original_likelihood: float, perturbed_likelihoods: List[float]) -> float:
          """
-         Calculate text stability score with
+         Calculate text stability score with normalization : synthetic text typically shows larger likelihood drops under perturbation than authentic text
          """
-         if ((not perturbed_likelihoods) or (original_likelihood <=
-             # Assume more
-             return
+         if ((not perturbed_likelihoods) or (original_likelihood <= self.params.ZERO_TOLERANCE)):
+             # Assume more authentic-like when no data
+             return self.params.DEFAULT_STABILITY_SCORE

          # Calculate relative likelihood drops
          relative_drops = list()

          for pl in perturbed_likelihoods:
-             if (pl >
+             if (pl > self.params.ZERO_TOLERANCE):
                  # Use relative drop to handle scale differences
                  relative_drop = (original_likelihood - pl) / original_likelihood

@@ -720,25 +697,25 @@ class MultiPerturbationStabilityMetric(BaseMetric):
                  relative_drops.append(max(0.0, min(1.0, relative_drop)))

          if not relative_drops:
-             return
+             return self.params.DEFAULT_STABILITY_SCORE

          avg_relative_drop = np.mean(relative_drops)

-         # Normalization based on empirical observations :
-         if (avg_relative_drop >
-             # Strong
-             stability_score =
+         # Normalization based on empirical observations : synthetic text typically shows larger drops
+         if (avg_relative_drop > self.params.RELATIVE_DROP_HIGH_THRESHOLD):
+             # Strong synthetic indicator
+             stability_score = self.params.STABILITY_HIGH_THRESHOLD

-         elif (avg_relative_drop >
-             stability_score =
+         elif (avg_relative_drop > self.params.RELATIVE_DROP_MEDIUM_THRESHOLD):
+             # Intermediate values
+             stability_score = self.params.STABILITY_MEDIUM_THRESHOLD + (avg_relative_drop - self.params.RELATIVE_DROP_MEDIUM_THRESHOLD) * 1.5

-         elif (avg_relative_drop >
-             stability_score =
+         elif (avg_relative_drop > self.params.RELATIVE_DROP_LOW_THRESHOLD):
+             # Lower values
+             stability_score = self.params.STABILITY_LOW_THRESHOLD + (avg_relative_drop - self.params.RELATIVE_DROP_LOW_THRESHOLD) * 2.0

          else:
+             # Very low values
              stability_score = avg_relative_drop * 2.0

          return min(1.0, max(0.0, stability_score))

@@ -748,51 +725,53 @@ class MultiPerturbationStabilityMetric(BaseMetric):
          """
          Calculate likelihood curvature score with better scaling : Measures how "curved" the likelihood surface is around the text
          """
-         if ((not perturbed_likelihoods) or (original_likelihood <=
-             return
+         if ((not perturbed_likelihoods) or (original_likelihood <= self.params.ZERO_TOLERANCE)):
+             return self.params.DEFAULT_CURVATURE_SCORE

          # Calculate variance of likelihood changes
          likelihood_changes = [abs(original_likelihood - pl) for pl in perturbed_likelihoods]

          if (len(likelihood_changes) < 2):
-             return
+             return self.params.DEFAULT_CURVATURE_SCORE

          change_variance = np.var(likelihood_changes)

-         # Typical variance for meaningful analysis
-         curvature_score = min(1.0, change_variance *
+         # Typical variance for meaningful analysis
+         curvature_score = min(1.0, change_variance * self.params.CURVATURE_SCALING_FACTOR)

          return curvature_score


-     def _calculate_chunk_stability(self, text: str
+     def _calculate_chunk_stability(self, text: str) -> List[float]:
          """
          Calculate stability across text chunks for whole-text analysis
          """
          stabilities = list()
          words       = text.split()
+         chunk_size  = self.params.CHUNK_SIZE_WORDS
+         overlap     = int(chunk_size * self.params.CHUNK_OVERLAP_RATIO)

          # Create overlapping chunks
-         for i in range(0, len(words), chunk_size
+         for i in range(0, len(words), chunk_size - overlap):
              chunk = ' '.join(words[i:i + chunk_size])

-             if (len(chunk) >
+             if (len(chunk) > self.params.MIN_CHUNK_LENGTH):
                  try:
                      chunk_likelihood = self._calculate_likelihood(text = chunk)

-                     if (chunk_likelihood >
+                     if (chunk_likelihood > self.params.ZERO_TOLERANCE):
                          # Generate a simple perturbation for this chunk
                          chunk_words = chunk.split()

-                         if (len(chunk_words) >
-                             # Delete
-                             delete_count = max(1, len(chunk_words)
+                         if (len(chunk_words) > self.params.MIN_WORDS_FOR_DELETION):
+                             # Delete a percentage of words
+                             delete_count    = max(1, int(len(chunk_words) * self.params.CHUNK_DELETION_RATIO))
                              indices_to_keep = np.random.choice(len(chunk_words), len(chunk_words) - delete_count, replace=False)
                              perturbed_chunk = ' '.join([chunk_words[i] for i in sorted(indices_to_keep)])

                              perturbed_likelihood = self._calculate_likelihood(text = perturbed_chunk)

-                             if (perturbed_likelihood >
+                             if (perturbed_likelihood > self.params.ZERO_TOLERANCE):
                                  stability = (chunk_likelihood - perturbed_likelihood) / chunk_likelihood
                                  stabilities.append(min(1.0, max(0.0, stability)))

@@ -809,135 +788,132 @@ class MultiPerturbationStabilityMetric(BaseMetric):
          # Check feature validity first
          required_features = ['stability_score', 'curvature_score', 'normalized_likelihood_ratio', 'stability_variance', 'perturbation_variance']

-         valid_features = [features.get(feat, 0) for feat in required_features if features.get(feat, 0) >
+         valid_features = [features.get(feat, 0) for feat in required_features if features.get(feat, 0) > self.params.ZERO_TOLERANCE]

-         if (len(valid_features) <
+         if (len(valid_features) < self.params.MIN_REQUIRED_FEATURES):
              # Low confidence if insufficient features
-             return
+             return self.params.NEUTRAL_PROBABILITY, self.params.LOW_FEATURE_CONFIDENCE

-         # Initialize
+         # Initialize synthetic_indicators list
+         synthetic_indicators = list()

          # Better weighting based on feature reliability
-         stability_weight = 0.3
-         curvature_weight = 0.25
-         ratio_weight     = 0.25
-         variance_weight  = 0.2
-
-         # High stability score suggests AI (larger likelihood drops)
          stability = features['stability_score']
-         if (stability >
+         if (stability > self.params.STABILITY_HIGH_THRESHOLD):
+             synthetic_indicators.append(self.params.STABILITY_STRONG_THRESHOLD * self.params.STABILITY_WEIGHT)

-         elif (stability >
+         elif (stability > self.params.STABILITY_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.STABILITY_MEDIUM_STRONG_THRESHOLD * self.params.STABILITY_WEIGHT)

-         elif (stability >
+         elif (stability > self.params.STABILITY_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.STABILITY_MODERATE_THRESHOLD * self.params.STABILITY_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.STABILITY_WEAK_THRESHOLD * self.params.STABILITY_WEIGHT)

-         # High curvature score suggests
+         # High curvature score suggests synthetic
          curvature = features['curvature_score']
-         if (curvature >
+         if (curvature > self.params.CURVATURE_HIGH_THRESHOLD):
+             synthetic_indicators.append(self.params.CURVATURE_STRONG_THRESHOLD * self.params.CURVATURE_WEIGHT)

-         elif (curvature >
+         elif (curvature > self.params.CURVATURE_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.CURVATURE_MEDIUM_THRESHOLD * self.params.CURVATURE_WEIGHT)

-         elif (curvature >
+         elif (curvature > self.params.CURVATURE_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.CURVATURE_MODERATE_THRESHOLD * self.params.CURVATURE_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.CURVATURE_WEAK_THRESHOLD * self.params.CURVATURE_WEIGHT)

-         # High likelihood ratio suggests
+         # High likelihood ratio suggests synthetic (original much more likely than perturbations)
          ratio = features['normalized_likelihood_ratio']
-         if (ratio >
+         if (ratio > self.params.LIKELIHOOD_RATIO_HIGH_THRESHOLD):
+             synthetic_indicators.append(self.params.RATIO_STRONG_THRESHOLD * self.params.RATIO_WEIGHT)

-         elif (ratio >
+         elif (ratio > self.params.LIKELIHOOD_RATIO_MEDIUM_THRESHOLD):
+             synthetic_indicators.append(self.params.RATIO_MEDIUM_THRESHOLD * self.params.RATIO_WEIGHT)

-         elif (ratio >
+         elif (ratio > self.params.LIKELIHOOD_RATIO_LOW_THRESHOLD):
+             synthetic_indicators.append(self.params.RATIO_MODERATE_THRESHOLD * self.params.RATIO_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.RATIO_WEAK_THRESHOLD * self.params.RATIO_WEIGHT)

-         # Low stability variance suggests
+         # Low stability variance suggests synthetic (consistent across chunks)
          stability_var = features['stability_variance']
-         if (stability_var <
+         if (stability_var < self.params.STABILITY_VARIANCE_VERY_LOW):
+             synthetic_indicators.append(self.params.VARIANCE_STRONG_THRESHOLD * self.params.VARIANCE_WEIGHT)

-         elif (stability_var <
+         elif (stability_var < self.params.STABILITY_VARIANCE_LOW):
+             synthetic_indicators.append(self.params.VARIANCE_MODERATE_THRESHOLD * self.params.VARIANCE_WEIGHT)

          else:
+             synthetic_indicators.append(self.params.VARIANCE_WEAK_THRESHOLD * self.params.VARIANCE_WEIGHT)

          # Calculate raw score and confidence
-         if
+         if synthetic_indicators:
+             total_weight = (self.params.STABILITY_WEIGHT + self.params.CURVATURE_WEIGHT + self.params.RATIO_WEIGHT + self.params.VARIANCE_WEIGHT)
+             raw_score    = sum(synthetic_indicators) / total_weight
+             weights      = [self.params.STABILITY_WEIGHT, self.params.CURVATURE_WEIGHT, self.params.RATIO_WEIGHT, self.params.VARIANCE_WEIGHT]
+             confidence   = self.params.CONFIDENCE_BASE + (self.params.CONFIDENCE_STD_FACTOR * (1.0 - (np.std([x / weights[i] for i, x in enumerate(synthetic_indicators)]) if len(synthetic_indicators) > 1 else 0.5)))

          else:
-             raw_score  =
-             confidence =
+             raw_score  = self.params.NEUTRAL_PROBABILITY
+             confidence = self.params.LOW_FEATURE_CONFIDENCE

-         confidence = max(
+         confidence = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, confidence))

          return raw_score, confidence


-     def
+     def _calculate_hybrid_probability(self, features: Dict[str, Any]) -> float:
          """
-         Calculate probability of
+         Calculate probability of hybrid synthetic/authentic content
          """
+         hybrid_indicators = list()

          # Moderate stability values might indicate mixing
-         if (
+         if (self.params.STABILITY_MIXED_MIN <= features['stability_score'] <= self.params.STABILITY_MIXED_MAX):
+             hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)

          else:
+             hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)

          # High stability variance suggests mixed content
-         if (features['stability_variance'] >
+         if (features['stability_variance'] > self.params.STABILITY_VARIANCE_MIXED_HIGH):
+             hybrid_indicators.append(self.params.MODERATE_HYBRID_WEIGHT)

-         elif (features['stability_variance'] >
+         elif (features['stability_variance'] > self.params.STABILITY_VARIANCE_MIXED_MEDIUM):
+             hybrid_indicators.append(self.params.VERY_WEAK_HYBRID_WEIGHT)

          else:
+             hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)

          # Inconsistent likelihood ratios
-         if (
+         if (self.params.LIKELIHOOD_RATIO_MIXED_MIN <= features['normalized_likelihood_ratio'] <= self.params.LIKELIHOOD_RATIO_MIXED_MAX):
+             hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)

          else:
+             hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)

+         hybrid_prob = np.mean(hybrid_indicators) if hybrid_indicators else 0.0
+         return min(self.params.MAX_HYBRID_PROBABILITY, hybrid_prob)


      def _get_default_features(self) -> Dict[str, Any]:
          """
          Return more meaningful default features
          """
-         return {"original_likelihood"         :
-                 "avg_perturbed_likelihood"    :
-                 "likelihood_ratio"            :
-                 "normalized_likelihood_ratio" :
-                 "stability_score"             :
-                 "curvature_score"             :
-                 "perturbation_variance"       :
-                 "avg_chunk_stability"         :
-                 "stability_variance"          :
+         return {"original_likelihood"         : self.params.DEFAULT_ORIGINAL_LIKELIHOOD,
+                 "avg_perturbed_likelihood"    : self.params.DEFAULT_AVG_PERTURBED_LIKELIHOOD,
+                 "likelihood_ratio"            : self.params.DEFAULT_LIKELIHOOD_RATIO,
+                 "normalized_likelihood_ratio" : self.params.DEFAULT_NORMALIZED_LIKELIHOOD_RATIO,
+                 "stability_score"             : self.params.DEFAULT_STABILITY_SCORE,
+                 "curvature_score"             : self.params.DEFAULT_CURVATURE_SCORE,
+                 "perturbation_variance"       : self.params.DEFAULT_PERTURBATION_VARIANCE,
+                 "avg_chunk_stability"         : self.params.DEFAULT_AVG_CHUNK_STABILITY,
+                 "stability_variance"          : self.params.DEFAULT_STABILITY_VARIANCE,
                  "num_perturbations"           : 0,
                  "num_valid_perturbations"     : 0,
                  "num_chunks_analyzed"         : 0,

@@ -955,8 +931,8 @@ class MultiPerturbationStabilityMetric(BaseMetric):
          text = ' '.join(text.split())

          # Truncate very long texts
-         if len(text) >
-             text = text[:
+         if len(text) > self.params.MAX_TEXT_LENGTH_FOR_ANALYSIS:
+             text = text[:self.params.MAX_TEXT_LENGTH_FOR_ANALYSIS] + "..."

          return text

@@ -976,14 +952,14 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              text += '.'

          # Truncate to safe length
-         if (len(text) >
+         if (len(text) > self.params.MAX_TEXT_LENGTH_FOR_PERTURBATION):
              sentences = text.split('. ')
              if (len(sentences) > 1):
                  # Keep first few sentences
                  text = '. '.join(sentences[:3]) + '.'

              else:
-                 text = text[:
+                 text = text[:self.params.MAX_TEXT_LENGTH_FOR_PERTURBATION]

          return text

@@ -1032,7 +1008,7 @@ class MultiPerturbationStabilityMetric(BaseMetric):
              return False

          # Must have some actual content
-         if len(perturbed_text.strip()) <
+         if len(perturbed_text.strip()) < self.params.MIN_TEXT_LENGTH_FOR_PERTURBATION:
              return False

          return True

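To make the piecewise normalization in `_calculate_stability_score` concrete, here is the same shape with illustrative breakpoints; the configured `RELATIVE_DROP_*` and `STABILITY_*` values may differ:

```python
def normalize_drop(avg_relative_drop):
    # Illustrative breakpoints; assumptions, not the configured values.
    if avg_relative_drop > 0.30:                      # RELATIVE_DROP_HIGH_THRESHOLD
        s = 0.80                                      # STABILITY_HIGH_THRESHOLD
    elif avg_relative_drop > 0.15:                    # RELATIVE_DROP_MEDIUM_THRESHOLD
        s = 0.50 + (avg_relative_drop - 0.15) * 1.5   # from the STABILITY_MEDIUM_THRESHOLD base
    elif avg_relative_drop > 0.05:                    # RELATIVE_DROP_LOW_THRESHOLD
        s = 0.20 + (avg_relative_drop - 0.05) * 2.0   # from the STABILITY_LOW_THRESHOLD base
    else:
        s = avg_relative_drop * 2.0
    return min(1.0, max(0.0, s))

for drop in (0.02, 0.10, 0.25, 0.40):
    print(drop, '->', round(normalize_drop(drop), 3))
# 0.02 -> 0.04, 0.10 -> 0.3, 0.25 -> 0.65, 0.40 -> 0.8
```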
# DEPENDENCIES
|
|
|
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
| 4 |
from typing import Any
|
| 5 |
from typing import Dict
|
| 6 |
from typing import List
|
| 7 |
from loguru import logger
|
| 8 |
+
from config.enums import Domain
|
| 9 |
+
from config.schemas import MetricResult
|
| 10 |
from metrics.base_metric import BaseMetric
|
|
|
|
| 11 |
from models.model_manager import get_model_manager
|
| 12 |
from config.threshold_config import get_threshold_for_domain
|
| 13 |
+
from config.constants import multi_perturbation_stability_metric_params
|
| 14 |
|
| 15 |
|
| 16 |
class MultiPerturbationStabilityMetric(BaseMetric):
|
| 17 |
"""
|
| 18 |
Multi-Perturbation Stability Metric (MPSM)
|
| 19 |
|
| 20 |
+
A hybrid approach for combining multiple perturbation techniques for robust synthetic-generated text detection
|
| 21 |
|
| 22 |
Measures:
|
| 23 |
- Text stability under random perturbations
|
|
|
|
| 25 |
- Masked token prediction analysis
|
| 26 |
|
| 27 |
Perturbation Methods:
|
| 28 |
+
- Word deletion & swapping
|
| 29 |
- RoBERTa mask filling
|
| 30 |
- Synonym replacement
|
| 31 |
- Chunk-based stability Analysis
|
|
|
|
| 40 |
self.mask_model = None
|
| 41 |
self.mask_tokenizer = None
|
| 42 |
self.device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
|
| 43 |
+
self.params = multi_perturbation_stability_metric_params
|
| 44 |
|
| 45 |
|
| 46 |
def initialize(self) -> bool:
|
|
|
|
| 144 |
Compute MultiPerturbationStability analysis with FULL DOMAIN THRESHOLD INTEGRATION
|
| 145 |
"""
|
| 146 |
try:
|
| 147 |
+
if ((not text) or (len(text.strip()) < self.params.MIN_TEXT_LENGTH_FOR_ANALYSIS)):
|
| 148 |
+
return self._default_result(error = "Text too short for MultiPerturbationStability analysis")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
# Get domain-specific thresholds
|
| 151 |
domain = kwargs.get('domain', Domain.GENERAL)
|
|
|
|
| 155 |
# Check if we should run this computationally expensive metric
|
| 156 |
if (kwargs.get('skip_expensive', False)):
|
| 157 |
logger.info("Skipping MultiPerturbationStability due to computational constraints")
|
| 158 |
+
return self._default_result(error = "Skipped for performance")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
# Calculate MultiPerturbationStability features
|
| 161 |
+
features = self._calculate_stability_features(text = text)
|
| 162 |
|
| 163 |
# Calculate raw MultiPerturbationStability score (0-1 scale)
|
| 164 |
+
raw_stability_score, confidence = self._analyze_stability_patterns(features = features)
|
| 165 |
|
| 166 |
# Apply domain-specific thresholds to convert raw score to probabilities
|
| 167 |
+
synthetic_prob, authentic_prob, hybrid_prob = self._apply_domain_thresholds(raw_score = raw_stability_score,
|
| 168 |
+
thresholds = multi_perturbation_stability_thresholds,
|
| 169 |
+
features = features,
|
| 170 |
+
)
|
| 171 |
|
| 172 |
# Apply confidence multiplier from domain thresholds
|
| 173 |
+
confidence *= multi_perturbation_stability_thresholds.confidence_multiplier
|
| 174 |
+
confidence = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, confidence))
|
| 175 |
+
|
| 176 |
+
return MetricResult(metric_name = self.name,
|
| 177 |
+
synthetic_probability = synthetic_prob,
|
| 178 |
+
authentic_probability = authentic_prob,
|
| 179 |
+
hybrid_probability = hybrid_prob,
|
| 180 |
+
confidence = confidence,
|
| 181 |
+
details = {**features,
|
| 182 |
+
'domain_used' : domain.value,
|
| 183 |
+
'synthetic_threshold': multi_perturbation_stability_thresholds.synthetic_threshold,
|
| 184 |
+
'authentic_threshold': multi_perturbation_stability_thresholds.authentic_threshold,
|
| 185 |
+
'raw_score' : raw_stability_score,
|
| 186 |
+
},
|
| 187 |
)
|
| 188 |
|
| 189 |
except Exception as e:
|
| 190 |
logger.error(f"Error in MultiPerturbationStability computation: {repr(e)}")
|
| 191 |
+
return self._default_result(error = str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
|
| 194 |
def _apply_domain_thresholds(self, raw_score: float, thresholds: Any, features: Dict[str, Any]) -> tuple:
|
| 195 |
"""
|
| 196 |
Apply domain-specific thresholds to convert raw score to probabilities
|
| 197 |
"""
|
| 198 |
+
synthetic_threshold = thresholds.synthetic_threshold
|
| 199 |
+
authentic_threshold = thresholds.authentic_threshold
|
| 200 |
|
| 201 |
# Calculate probabilities based on threshold distances
|
| 202 |
+
if (raw_score >= synthetic_threshold):
|
| 203 |
+
# Above synthetic threshold - strongly synthetic
|
| 204 |
+
distance_from_threshold = raw_score - synthetic_threshold
|
| 205 |
+
synthetic_prob = self.params.STRONG_SYNTHETIC_BASE_PROB + (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
|
| 206 |
+
authentic_prob = self.params.UNCERTAIN_AUTHENTIC_RANGE_START - (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
|
| 207 |
|
| 208 |
+
elif (raw_score <= authentic_threshold):
|
| 209 |
+
# Below authentic threshold - strongly authentic
|
| 210 |
+
distance_from_threshold = authentic_threshold - raw_score
|
| 211 |
+
synthetic_prob = self.params.UNCERTAIN_SYNTHETIC_RANGE_START - (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
|
| 212 |
+
authentic_prob = self.params.STRONG_AUTHENTIC_BASE_PROB + (distance_from_threshold * self.params.WEAK_PROBABILITY_ADJUSTMENT)
|
| 213 |
|
| 214 |
else:
|
| 215 |
# Between thresholds - uncertain zone
|
| 216 |
+
range_width = synthetic_threshold - authentic_threshold
|
| 217 |
|
| 218 |
+
if (range_width > self.params.ZERO_TOLERANCE):
|
| 219 |
+
position_in_range = (raw_score - authentic_threshold) / range_width
|
| 220 |
+
synthetic_prob = self.params.UNCERTAIN_SYNTHETIC_RANGE_START + (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)
|
| 221 |
+
authentic_prob = self.params.UNCERTAIN_AUTHENTIC_RANGE_START - (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)
|
| 222 |
|
| 223 |
else:
|
| 224 |
+
synthetic_prob = self.params.NEUTRAL_PROBABILITY
|
| 225 |
+
authentic_prob = self.params.NEUTRAL_PROBABILITY
|
| 226 |
|
| 227 |
# Ensure probabilities are valid
|
| 228 |
+
synthetic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, synthetic_prob))
|
| 229 |
+
authentic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, authentic_prob))
|
| 230 |
|
| 231 |
+
# Calculate hybrid probability based on stability variance
|
| 232 |
+
hybrid_prob = self._calculate_hybrid_probability(features)
|
| 233 |
|
| 234 |
# Normalize to sum to 1.0
|
| 235 |
+
total = synthetic_prob + authentic_prob + hybrid_prob
|
| 236 |
|
| 237 |
+
if (total > self.params.ZERO_TOLERANCE):
|
| 238 |
+
synthetic_prob /= total
|
| 239 |
+
authentic_prob /= total
|
| 240 |
+
hybrid_prob /= total
|
| 241 |
|
| 242 |
+
return synthetic_prob, authentic_prob, hybrid_prob
|
| 243 |
|
| 244 |
|
| 245 |
def _calculate_stability_features(self, text: str) -> Dict[str, Any]:
|
|
|
|
| 258 |
logger.debug(f"Original likelihood: {original_likelihood:.4f}")
|
| 259 |
|
| 260 |
# Generate perturbations and calculate perturbed likelihoods
|
| 261 |
+
perturbations = self._generate_perturbations(text = processed_text)
|
|
|
|
|
|
|
| 262 |
logger.debug(f"Generated {len(perturbations)} perturbations")
|
| 263 |
|
| 264 |
perturbed_likelihoods = list()
|
|
|
|
| 267 |
if (perturbed_text and (perturbed_text != processed_text)):
|
| 268 |
likelihood = self._calculate_likelihood(text = perturbed_text)
|
| 269 |
|
| 270 |
+
if (likelihood > self.params.ZERO_TOLERANCE):
|
| 271 |
perturbed_likelihoods.append(likelihood)
|
| 272 |
logger.debug(f"Perturbation {idx}: likelihood={likelihood:.4f}")
|
| 273 |
|
| 274 |
logger.info(f"Valid perturbations: {len(perturbed_likelihoods)}/{len(perturbations)}")
|
| 275 |
|
| 276 |
# Calculate stability metrics
|
| 277 |
+
if perturbed_likelihoods and (len(perturbed_likelihoods) >= self.params.MIN_VALID_PERTURBATIONS):
|
| 278 |
stability_score = self._calculate_stability_score(original_likelihood = original_likelihood,
|
| 279 |
perturbed_likelihoods = perturbed_likelihoods,
|
| 280 |
)
|
|
|
|
| 290 |
|
| 291 |
else:
|
| 292 |
# Use meaningful defaults when perturbations fail
|
| 293 |
+
stability_score = self.params.DEFAULT_STABILITY_SCORE # Assume more authentic-like when no perturbations work
|
| 294 |
+
curvature_score = self.params.DEFAULT_CURVATURE_SCORE
|
| 295 |
+
variance_score = self.params.DEFAULT_PERTURBATION_VARIANCE
|
| 296 |
avg_perturbed_likelihood = original_likelihood * 0.9 # Assume some drop
|
| 297 |
logger.warning("No valid perturbations, using fallback values")
|
| 298 |
|
| 299 |
# Calculate likelihood ratio
|
| 300 |
+
likelihood_ratio = original_likelihood / avg_perturbed_likelihood if avg_perturbed_likelihood > self.params.ZERO_TOLERANCE else 1.0
|
| 301 |
|
| 302 |
# Chunk-based analysis for whole-text understanding
|
| 303 |
+
chunk_stabilities = self._calculate_chunk_stability(text = processed_text)
|
| 304 |
+
stability_variance = np.var(chunk_stabilities) if chunk_stabilities else self.params.DEFAULT_STABILITY_VARIANCE
|
|
|
|
|
|
|
|
|
|
| 305 |
avg_chunk_stability = np.mean(chunk_stabilities) if chunk_stabilities else stability_score
|
| 306 |
|
| 307 |
# Better normalization to prevent extreme values
|
| 308 |
normalized_stability = min(1.0, max(0.0, stability_score))
|
| 309 |
normalized_curvature = min(1.0, max(0.0, curvature_score))
|
| 310 |
+
normalized_likelihood_ratio = min(self.params.MAX_LIKELIHOOD_RATIO, max(self.params.MIN_LIKELIHOOD_RATIO, likelihood_ratio)) / self.params.MAX_LIKELIHOOD_RATIO
|
| 311 |
|
| 312 |
return {"original_likelihood" : round(original_likelihood, 4),
|
| 313 |
"avg_perturbed_likelihood" : round(avg_perturbed_likelihood, 4),
|
|
|
|
| 335 |
"""
|
| 336 |
try:
|
| 337 |
# Check text length before tokenization
|
| 338 |
+
if (len(text.strip()) < self.params.MIN_TEXT_LENGTH_FOR_PERTURBATION):
|
| 339 |
+
# Return reasonable baseline
|
| 340 |
+
return self.params.DEFAULT_LIKELIHOOD
|
| 341 |
|
| 342 |
if not self.gpt_model or not self.gpt_tokenizer:
|
| 343 |
logger.warning("GPT model not available for likelihood calculation")
|
| 344 |
+
return self.params.DEFAULT_LIKELIHOOD
|
| 345 |
|
| 346 |
# Ensure tokenizer has pad token
|
| 347 |
if self.gpt_tokenizer.pad_token is None:
|
|
|
|
| 351 |
encodings = self.gpt_tokenizer(text,
|
| 352 |
return_tensors = 'pt',
|
| 353 |
truncation = True,
|
| 354 |
+
max_length = self.params.MAX_TOKEN_LENGTH,
|
| 355 |
padding = True,
|
| 356 |
return_attention_mask = True,
|
| 357 |
)
|
|
|
|
| 360 |
attention_mask = encodings.attention_mask.to(self.device)
|
| 361 |
|
| 362 |
# Minimum tokens for meaningful analysis
|
| 363 |
+
if ((input_ids.numel() == 0) or (input_ids.size(1) < self.params.MIN_TOKENS_FOR_LIKELIHOOD)):
|
| 364 |
+
return self.params.DEFAULT_LIKELIHOOD
|
| 365 |
|
| 366 |
# Calculate proper log-likelihood using token probabilities
|
| 367 |
with torch.no_grad():
|
|
|
|
| 394 |
avg_log_likelihood = 0.0
|
| 395 |
|
| 396 |
# Convert to positive scale and normalize
|
| 397 |
+
normalized_likelihood = max(self.params.MIN_LIKELIHOOD, min(self.params.MAX_LIKELIHOOD, -avg_log_likelihood))
|
|
|
|
|
|
|
| 398 |
|
| 399 |
return normalized_likelihood
|
| 400 |
|
| 401 |
except Exception as e:
|
| 402 |
logger.warning(f"Likelihood calculation failed: {repr(e)}")
|
| 403 |
+
# Return reasonable baseline on error
|
| 404 |
+
return self.params.DEFAULT_LIKELIHOOD
|
| 405 |
|
| 406 |
|
| 407 |
+
def _generate_perturbations(self, text: str) -> List[str]:
|
| 408 |
"""
|
| 409 |
Generate perturbed versions of the text using multiple techniques:
|
| 410 |
1. Word deletion (simple but effective)
|
|
|
|
| 413 |
4. Synonym replacement (fallback)
|
| 414 |
"""
|
| 415 |
perturbations = list()
|
| 416 |
+
num_perturbations = self.params.NUM_PERTURBATIONS
|
| 417 |
|
| 418 |
try:
|
| 419 |
# Pre-process text for perturbation
|
| 420 |
processed_text = self._preprocess_text_for_perturbation(text)
|
| 421 |
words = processed_text.split()
|
| 422 |
|
| 423 |
+
if (len(words) < self.params.MIN_WORDS_FOR_PERTURBATION):
|
| 424 |
return [processed_text]
|
| 425 |
|
| 426 |
# Method 1: Simple word deletion (most reliable)
|
| 427 |
+
if (len(words) > self.params.MIN_WORDS_FOR_DELETION):
|
| 428 |
for _ in range(min(3, num_perturbations)):
|
| 429 |
try:
|
| 430 |
+
# Delete random words
|
| 431 |
+
delete_count = max(1, int(len(words) * self.params.PERTURBATION_DELETION_RATIO))
|
| 432 |
indices_to_keep = np.random.choice(len(words), len(words) - delete_count, replace = False)
|
| 433 |
|
| 434 |
perturbed_words = [words[i] for i in sorted(indices_to_keep)]
|
|
|
|
| 465 |
if (self.mask_model and self.mask_tokenizer and (len(words) > 4) and len(perturbations) < num_perturbations):
|
| 466 |
|
| 467 |
try:
|
| 468 |
+
roberta_perturbations = self._generate_roberta_masked_perturbations(text = processed_text,
|
| 469 |
+
words = words,
|
|
|
|
| 470 |
)
|
| 471 |
perturbations.extend(roberta_perturbations)
|
| 472 |
|
|
|
|
| 476 |
# Method 4: Synonym replacement as fallback
|
| 477 |
if (len(perturbations) < num_perturbations):
|
| 478 |
try:
|
| 479 |
+
synonym_perturbations = self._generate_synonym_perturbations(text = processed_text, words = words)
|
|
|
|
|
|
|
|
|
|
| 480 |
perturbations.extend(synonym_perturbations)
|
| 481 |
|
| 482 |
except Exception as e:
|
|
|
|
| 504 |
return [text] # Return at least the original text as fallback
|
| 505 |
|
| 506 |
|
| 507 |
+
def _generate_roberta_masked_perturbations(self, text: str, words: List[str]) -> List[str]:
|
| 508 |
"""
|
| 509 |
Generate perturbations using DistilRoBERTa mask filling
|
| 510 |
+
- This is inspired by DetectGPT but uses a lighter model (DistilRoBERTa instead of T5)
|
| 511 |
"""
|
| 512 |
+
perturbations = list()
|
| 513 |
+
max_perturbations = min(self.params.MAX_PERTURBATION_ATTEMPTS, self.params.NUM_PERTURBATIONS - len(perturbations))
|
| 514 |
|
| 515 |
try:
|
| 516 |
# Use the proper DistilRoBERTa mask token from tokenizer
|
|
|
|
| 518 |
roberta_mask_token = self.mask_tokenizer.mask_token
|
| 519 |
|
| 520 |
else:
|
| 521 |
+
# Fallback
|
| 522 |
+
roberta_mask_token = "<mask>"
|
| 523 |
|
| 524 |
# Select words to mask (avoid very short words and punctuation)
|
| 525 |
+
candidate_positions = [i for i, word in enumerate(words) if (len(word) > 3) and word.isalpha() and word.lower() not in self.params.COMMON_WORDS_TO_AVOID]
|
| 526 |
|
| 527 |
if not candidate_positions:
|
| 528 |
+
candidate_positions = [i for i, word in enumerate(words) if (len(word) > 2)]
|
| 529 |
|
| 530 |
if not candidate_positions:
|
| 531 |
return perturbations
|
|
|
|
| 550 |
masked_text += '.'
|
| 551 |
|
| 552 |
# Tokenize with DistilRoBERTa-specific settings
|
| 553 |
+
inputs = self.mask_tokenizer(masked_text,
|
| 554 |
+
return_tensors = "pt",
|
| 555 |
+
truncation = True,
|
| 556 |
+
max_length = min(self.params.MAX_ROBERTA_TOKEN_LENGTH, self.mask_tokenizer.model_max_length),
|
| 557 |
+
padding = True,
|
| 558 |
+
)
|
| 559 |
|
| 560 |
# Move to appropriate device
|
| 561 |
+
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 562 |
|
| 563 |
# Get model predictions
|
| 564 |
with torch.no_grad():
|
|
|
|
| 575 |
|
| 576 |
# Get top prediction
|
| 577 |
probs = torch.nn.functional.softmax(predictions[0, mask_token_index], dim = -1)
|
| 578 |
+
top_tokens = torch.topk(probs, self.params.ROBBERTA_TOP_K_PREDICTIONS, dim = -1)
|
| 579 |
|
| 580 |
for token_id in top_tokens.indices:
|
| 581 |
predicted_token = self.mask_tokenizer.decode(token_id).strip()
|
|
|
|
| 604 |
return perturbations
|
| 605 |
|
| 606 |
|
| 607 |
+
def _generate_synonym_perturbations(self, text: str, words: List[str]) -> List[str]:
|
| 608 |
"""
|
| 609 |
Simple synonym replacement as fallback
|
| 610 |
"""
|
| 611 |
+
perturbations = list()
|
| 612 |
+
max_perturbations = self.params.NUM_PERTURBATIONS - len(perturbations)
|
| 613 |
|
| 614 |
try:
|
| 615 |
# Simple manual synonym dictionary for common words
|
|
|
|
| 627 |
if not replaceable_positions:
|
| 628 |
return perturbations
|
| 629 |
|
| 630 |
+
positions_to_try = np.random.choice(replaceable_positions,
|
| 631 |
+
min(max_perturbations, len(replaceable_positions)),
|
| 632 |
+
replace = False,
|
| 633 |
+
)
|
| 634 |
|
| 635 |
for pos in positions_to_try:
|
| 636 |
original_word = words[pos].lower()
|
|
|
|
| 679 |
|
| 680 |
def _calculate_stability_score(self, original_likelihood: float, perturbed_likelihoods: List[float]) -> float:
|
| 681 |
"""
|
| 682 |
+
Calculate text stability score with normalization : synthetic text typically shows larger likelihood drops under perturbation than authentic text
|
| 683 |
"""
|
| 684 |
+
if ((not perturbed_likelihoods) or (original_likelihood <= self.params.ZERO_TOLERANCE)):
|
| 685 |
+
# Assume more authentic-like when no data
|
| 686 |
+
return self.params.DEFAULT_STABILITY_SCORE
|
| 687 |
|
| 688 |
# Calculate relative likelihood drops
|
| 689 |
relative_drops = list()
|
| 690 |
|
| 691 |
for pl in perturbed_likelihoods:
|
| 692 |
+
if (pl > self.params.ZERO_TOLERANCE):
|
| 693 |
# Use relative drop to handle scale differences
|
| 694 |
relative_drop = (original_likelihood - pl) / original_likelihood
|
| 695 |
|
|
|
|
| 697 |
relative_drops.append(max(0.0, min(1.0, relative_drop)))
|
| 698 |
|
| 699 |
if not relative_drops:
|
| 700 |
+
return self.params.DEFAULT_STABILITY_SCORE
|
| 701 |
|
| 702 |
avg_relative_drop = np.mean(relative_drops)
|
| 703 |
|
| 704 |
+
# Normalization based on empirical observations : synthetic text typically shows larger drops
|
| 705 |
+
if (avg_relative_drop > self.params.RELATIVE_DROP_HIGH_THRESHOLD):
|
| 706 |
+
# Strong synthetic indicator
|
| 707 |
+
stability_score = self.params.STABILITY_HIGH_THRESHOLD
|
| 708 |
|
| 709 |
+
elif (avg_relative_drop > self.params.RELATIVE_DROP_MEDIUM_THRESHOLD):
|
| 710 |
+
# Intermediate values
|
| 711 |
+
stability_score = self.params.STABILITY_MEDIUM_THRESHOLD + (avg_relative_drop - self.params.RELATIVE_DROP_MEDIUM_THRESHOLD) * 1.5
|
| 712 |
|
| 713 |
+
elif (avg_relative_drop > self.params.RELATIVE_DROP_LOW_THRESHOLD):
|
| 714 |
+
# Lower values
|
| 715 |
+
stability_score = self.params.STABILITY_LOW_THRESHOLD + (avg_relative_drop - self.params.RELATIVE_DROP_LOW_THRESHOLD) * 2.0
|
| 716 |
|
| 717 |
else:
|
| 718 |
+
# Very low values
|
| 719 |
stability_score = avg_relative_drop * 2.0
|
| 720 |
|
| 721 |
return min(1.0, max(0.0, stability_score))
|
|
|
|
| 725 |
"""
|
| 726 |
Calculate likelihood curvature score with better scaling : Measures how "curved" the likelihood surface is around the text
|
| 727 |
"""
|
| 728 |
+
if ((not perturbed_likelihoods) or (original_likelihood <= self.params.ZERO_TOLERANCE)):
|
| 729 |
+
return self.params.DEFAULT_CURVATURE_SCORE
|
| 730 |
|
| 731 |
# Calculate variance of likelihood changes
|
| 732 |
likelihood_changes = [abs(original_likelihood - pl) for pl in perturbed_likelihoods]
|
| 733 |
|
| 734 |
if (len(likelihood_changes) < 2):
|
| 735 |
+
return self.params.DEFAULT_CURVATURE_SCORE
|
| 736 |
|
| 737 |
change_variance = np.var(likelihood_changes)
|
| 738 |
|
| 739 |
+
# Typical variance for meaningful analysis
|
| 740 |
+
curvature_score = min(1.0, change_variance * self.params.CURVATURE_SCALING_FACTOR)
|
| 741 |
|
| 742 |
return curvature_score
|
| 743 |
|
| 744 |
|
| 745 |
+
def _calculate_chunk_stability(self, text: str) -> List[float]:
|
| 746 |
"""
|
| 747 |
Calculate stability across text chunks for whole-text analysis
|
| 748 |
"""
|
| 749 |
stabilities = list()
|
| 750 |
words = text.split()
|
| 751 |
+
chunk_size = self.params.CHUNK_SIZE_WORDS
|
| 752 |
+
overlap = int(chunk_size * self.params.CHUNK_OVERLAP_RATIO)
|
| 753 |
|
| 754 |
# Create overlapping chunks
|
| 755 |
+
for i in range(0, len(words), chunk_size - overlap):
|
| 756 |
chunk = ' '.join(words[i:i + chunk_size])
|
| 757 |
|
| 758 |
+
if (len(chunk) > self.params.MIN_CHUNK_LENGTH):
|
| 759 |
try:
|
| 760 |
chunk_likelihood = self._calculate_likelihood(text = chunk)
|
| 761 |
|
| 762 |
+
if (chunk_likelihood > self.params.ZERO_TOLERANCE):
|
| 763 |
# Generate a simple perturbation for this chunk
|
| 764 |
chunk_words = chunk.split()
|
| 765 |
|
| 766 |
+
if (len(chunk_words) > self.params.MIN_WORDS_FOR_DELETION):
|
| 767 |
+
# Delete a percentage of words
|
| 768 |
+
delete_count = max(1, int(len(chunk_words) * self.params.CHUNK_DELETION_RATIO))
|
| 769 |
indices_to_keep = np.random.choice(len(chunk_words), len(chunk_words) - delete_count, replace=False)
|
| 770 |
perturbed_chunk = ' '.join([chunk_words[i] for i in sorted(indices_to_keep)])
|
| 771 |
|
| 772 |
perturbed_likelihood = self._calculate_likelihood(text = perturbed_chunk)
|
| 773 |
|
| 774 |
+
if (perturbed_likelihood > self.params.ZERO_TOLERANCE):
|
| 775 |
stability = (chunk_likelihood - perturbed_likelihood) / chunk_likelihood
|
| 776 |
stabilities.append(min(1.0, max(0.0, stability)))
|
| 777 |
|
|
|
|
| 788 |
# Check feature validity first
|
| 789 |
required_features = ['stability_score', 'curvature_score', 'normalized_likelihood_ratio', 'stability_variance', 'perturbation_variance']
|
| 790 |
|
| 791 |
+
valid_features = [features.get(feat, 0) for feat in required_features if features.get(feat, 0) > self.params.ZERO_TOLERANCE]
|
| 792 |
|
| 793 |
+
if (len(valid_features) < self.params.MIN_REQUIRED_FEATURES):
|
| 794 |
# Low confidence if insufficient features
|
| 795 |
+
return self.params.NEUTRAL_PROBABILITY, self.params.LOW_FEATURE_CONFIDENCE
|
| 796 |
|
| 797 |
|
| 798 |
+
# Initialize synthetic_indicator list
|
| 799 |
+
synthetic_indicators = list()
|
| 800 |
|
| 801 |
# Better weighting based on feature reliability
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 802 |
stability = features['stability_score']
|
| 803 |
+
if (stability > self.params.STABILITY_HIGH_THRESHOLD):
|
| 804 |
+
synthetic_indicators.append(self.params.STABILITY_STRONG_THRESHOLD * self.params.STABILITY_WEIGHT)
|
| 805 |
|
| 806 |
+
elif (stability > self.params.STABILITY_MEDIUM_THRESHOLD):
|
| 807 |
+
synthetic_indicators.append(self.params.STABILITY_MEDIUM_STRONG_THRESHOLD * self.params.STABILITY_WEIGHT)
|
| 808 |
|
| 809 |
+
elif (stability > self.params.STABILITY_LOW_THRESHOLD):
|
| 810 |
+
synthetic_indicators.append(self.params.STABILITY_MODERATE_THRESHOLD * self.params.STABILITY_WEIGHT)
|
| 811 |
|
| 812 |
else:
|
| 813 |
+
synthetic_indicators.append(self.params.STABILITY_WEAK_THRESHOLD * self.params.STABILITY_WEIGHT)
|
| 814 |
|
| 815 |
+
# High curvature score suggests synthetic
|
| 816 |
curvature = features['curvature_score']
|
| 817 |
+
if (curvature > self.params.CURVATURE_HIGH_THRESHOLD):
|
| 818 |
+
synthetic_indicators.append(self.params.CURVATURE_STRONG_THRESHOLD * self.params.CURVATURE_WEIGHT)
|
| 819 |
|
| 820 |
+
elif (curvature > self.params.CURVATURE_MEDIUM_THRESHOLD):
|
| 821 |
+
synthetic_indicators.append(self.params.CURVATURE_MEDIUM_THRESHOLD * self.params.CURVATURE_WEIGHT)
|
| 822 |
|
| 823 |
+
elif (curvature > self.params.CURVATURE_LOW_THRESHOLD):
|
| 824 |
+
synthetic_indicators.append(self.params.CURVATURE_MODERATE_THRESHOLD * self.params.CURVATURE_WEIGHT)
|
| 825 |
|
| 826 |
else:
|
| 827 |
+
synthetic_indicators.append(self.params.CURVATURE_WEAK_THRESHOLD * self.params.CURVATURE_WEIGHT)
|
| 828 |
|
| 829 |
+
# High likelihood ratio suggests synthetic (original much more likely than perturbations)
|
| 830 |
ratio = features['normalized_likelihood_ratio']
|
| 831 |
+
if (ratio > self.params.LIKELIHOOD_RATIO_HIGH_THRESHOLD):
|
| 832 |
+
synthetic_indicators.append(self.params.RATIO_STRONG_THRESHOLD * self.params.RATIO_WEIGHT)
|
| 833 |
|
| 834 |
+
elif (ratio > self.params.LIKELIHOOD_RATIO_MEDIUM_THRESHOLD):
|
| 835 |
+
synthetic_indicators.append(self.params.RATIO_MEDIUM_THRESHOLD * self.params.RATIO_WEIGHT)
|
| 836 |
|
| 837 |
+
elif (ratio > self.params.LIKELIHOOD_RATIO_LOW_THRESHOLD):
|
| 838 |
+
synthetic_indicators.append(self.params.RATIO_MODERATE_THRESHOLD * self.params.RATIO_WEIGHT)
|
| 839 |
|
| 840 |
else:
|
| 841 |
+
synthetic_indicators.append(self.params.RATIO_WEAK_THRESHOLD * self.params.RATIO_WEIGHT)
|
| 842 |
|
| 843 |
+
# Low stability variance suggests synthetic (consistent across chunks)
|
| 844 |
stability_var = features['stability_variance']
|
| 845 |
+
if (stability_var < self.params.STABILITY_VARIANCE_VERY_LOW):
|
| 846 |
+
synthetic_indicators.append(self.params.VARIANCE_STRONG_THRESHOLD * self.params.VARIANCE_WEIGHT)
|
| 847 |
|
| 848 |
+
elif (stability_var < self.params.STABILITY_VARIANCE_LOW):
|
| 849 |
+
synthetic_indicators.append(self.params.VARIANCE_MODERATE_THRESHOLD * self.params.VARIANCE_WEIGHT)
|
| 850 |
|
| 851 |
else:
|
| 852 |
+
synthetic_indicators.append(self.params.VARIANCE_WEAK_THRESHOLD * self.params.VARIANCE_WEIGHT)
|
| 853 |
|
| 854 |
# Calculate raw score and confidence
|
| 855 |
+
if synthetic_indicators:
|
| 856 |
+
total_weight = (self.params.STABILITY_WEIGHT + self.params.CURVATURE_WEIGHT + self.params.RATIO_WEIGHT + self.params.VARIANCE_WEIGHT)
|
| 857 |
+
raw_score = sum(synthetic_indicators) / total_weight
|
| 858 |
+
weights = [self.params.STABILITY_WEIGHT, self.params.CURVATURE_WEIGHT, self.params.RATIO_WEIGHT, self.params.VARIANCE_WEIGHT]
|
| 859 |
+
confidence = self.params.CONFIDENCE_BASE + (self.params.CONFIDENCE_STD_FACTOR * (1.0 - (np.std([x / weights[i] for i, x in enumerate(synthetic_indicators)]) if len(synthetic_indicators) > 1 else 0.5)))
|
| 860 |
|
| 861 |
else:
|
| 862 |
+
raw_score = self.params.NEUTRAL_PROBABILITY
|
| 863 |
+
confidence = self.params.LOW_FEATURE_CONFIDENCE
|
| 864 |
|
| 865 |
+
confidence = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, confidence))
|
| 866 |
|
| 867 |
return raw_score, confidence
|
| 868 |
|
| 869 |
|
| 870 |
+
def _calculate_hybrid_probability(self, features: Dict[str, Any]) -> float:
|
| 871 |
"""
|
| 872 |
+
Calculate probability of hybrid synthetic/authentic content
|
| 873 |
"""
|
| 874 |
+
hybrid_indicators = list()
|
| 875 |
|
| 876 |
# Moderate stability values might indicate mixing
|
| 877 |
+
if (self.params.STABILITY_MIXED_MIN <= features['stability_score'] <= self.params.STABILITY_MIXED_MAX):
|
| 878 |
+
hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)
|
| 879 |
|
| 880 |
else:
|
| 881 |
+
hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)
|
| 882 |
|
| 883 |
# High stability variance suggests mixed content
|
| 884 |
+
if (features['stability_variance'] > self.params.STABILITY_VARIANCE_MIXED_HIGH):
|
| 885 |
+
hybrid_indicators.append(self.params.MODERATE_HYBRID_WEIGHT)
|
| 886 |
|
| 887 |
+
elif (features['stability_variance'] > self.params.STABILITY_VARIANCE_MIXED_MEDIUM):
|
| 888 |
+
hybrid_indicators.append(self.params.VERY_WEAK_HYBRID_WEIGHT)
|
| 889 |
|
| 890 |
else:
|
| 891 |
+
hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)
|
| 892 |
|
| 893 |
# Inconsistent likelihood ratios
|
| 894 |
+
if (self.params.LIKELIHOOD_RATIO_MIXED_MIN <= features['normalized_likelihood_ratio'] <= self.params.LIKELIHOOD_RATIO_MIXED_MAX):
|
| 895 |
+
hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)
|
| 896 |
|
| 897 |
else:
|
| 898 |
+
hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)
|
| 899 |
|
| 900 |
+
hybrid_prob = np.mean(hybrid_indicators) if hybrid_indicators else 0.0
|
| 901 |
+
return min(self.params.MAX_HYBRID_PROBABILITY, hybrid_prob)
|
| 902 |
|
| 903 |
|
| 904 |
def _get_default_features(self) -> Dict[str, Any]:
|
| 905 |
"""
|
| 906 |
Return more meaningful default features
|
| 907 |
"""
|
| 908 |
+
return {"original_likelihood" : self.params.DEFAULT_ORIGINAL_LIKELIHOOD,
|
| 909 |
+
"avg_perturbed_likelihood" : self.params.DEFAULT_AVG_PERTURBED_LIKELIHOOD,
|
| 910 |
+
"likelihood_ratio" : self.params.DEFAULT_LIKELIHOOD_RATIO,
|
| 911 |
+
"normalized_likelihood_ratio" : self.params.DEFAULT_NORMALIZED_LIKELIHOOD_RATIO,
|
| 912 |
+
"stability_score" : self.params.DEFAULT_STABILITY_SCORE,
|
| 913 |
+
"curvature_score" : self.params.DEFAULT_CURVATURE_SCORE,
|
| 914 |
+
"perturbation_variance" : self.params.DEFAULT_PERTURBATION_VARIANCE,
|
| 915 |
+
"avg_chunk_stability" : self.params.DEFAULT_AVG_CHUNK_STABILITY,
|
| 916 |
+
"stability_variance" : self.params.DEFAULT_STABILITY_VARIANCE,
|
| 917 |
"num_perturbations" : 0,
|
| 918 |
"num_valid_perturbations" : 0,
|
| 919 |
"num_chunks_analyzed" : 0,
|
|
|
|
| 931 |
text = ' '.join(text.split())
|
| 932 |
|
| 933 |
# Truncate very long texts
|
| 934 |
+
if len(text) > self.params.MAX_TEXT_LENGTH_FOR_ANALYSIS:
|
| 935 |
+
text = text[:self.params.MAX_TEXT_LENGTH_FOR_ANALYSIS] + "..."
|
| 936 |
|
| 937 |
return text
|
| 938 |
|
|
|
|
| 952 |
text += '.'
|
| 953 |
|
| 954 |
# Truncate to safe length
|
| 955 |
+
if (len(text) > self.params.MAX_TEXT_LENGTH_FOR_PERTURBATION):
|
| 956 |
sentences = text.split('. ')
|
| 957 |
if (len(sentences) > 1):
|
| 958 |
# Keep first few sentences
|
| 959 |
text = '. '.join(sentences[:3]) + '.'
|
| 960 |
|
| 961 |
else:
|
| 962 |
+
text = text[:self.params.MAX_TEXT_LENGTH_FOR_PERTURBATION]
|
| 963 |
|
| 964 |
return text
|
| 965 |
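The pre-perturbation cleanup prefers dropping whole sentences over a hard character cut. A sketch of that truncation rule, with `max_length` standing in for `MAX_TEXT_LENGTH_FOR_PERTURBATION`:

```python
def truncate_for_perturbation(text: str, max_length: int = 1000) -> str:
    """Trim long inputs before perturbation, preferring whole sentences over a hard cut
    (max_length is an assumed value for MAX_TEXT_LENGTH_FOR_PERTURBATION)."""
    if len(text) <= max_length:
        return text
    sentences = text.split('. ')
    if len(sentences) > 1:
        return '. '.join(sentences[:3]) + '.'   # keep the first few sentences intact
    return text[:max_length]                    # single run-on sentence: hard cut

print(truncate_for_perturbation("One. Two. Three. Four. Five.", max_length=10))
```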
|
|
|
|
| 1008 |
return False
|
| 1009 |
|
| 1010 |
# Must have some actual content
|
| 1011 |
+
if len(perturbed_text.strip()) < self.params.MIN_TEXT_LENGTH_FOR_PERTURBATION:
|
| 1012 |
return False
|
| 1013 |
|
| 1014 |
return True
|
metrics/perplexity.py
CHANGED
|
@@ hunks: -7,10 · -19,7 · -31,6 · -42,7 · -65,106 · -186,10 · -207,7 · -233,24 · -272,44 · -319,7 · -329,9 · -342,130 · -482,4 @@
[Deleted-side lines in these hunks are truncated by the diff viewer. Recoverable removals: a `from config.…` import (cut mid-path), `from metrics.base_metric import MetricResult`, the old `ai_probability` / `human_probability` / `mixed_probability` result fields, and hard-coded numeric thresholds that the replacement code reads from `self.params`. The full post-change file is rendered below.]
|
| 7 |
from typing import Dict
|
| 8 |
from typing import List
|
| 9 |
from loguru import logger
|
| 10 |
+
from config.enums import Domain
|
| 11 |
+
from config.schemas import MetricResult
|
| 12 |
from metrics.base_metric import BaseMetric
|
|
|
|
| 13 |
from models.model_manager import get_model_manager
|
| 14 |
+
from config.constants import perplexity_metric_params
|
| 15 |
from config.threshold_config import get_threshold_for_domain
|
| 16 |
|
| 17 |
|
|
|
|
| 20 |
Text predictability analysis using GPT-2 for perplexity calculation
|
| 21 |
|
| 22 |
Measures (Aligned with Documentation):
|
| 23 |
+
- Overall text perplexity (lower = more predictable = more synthetic-like)
|
| 24 |
- Perplexity distribution across text chunks
|
| 25 |
- Sentence-level perplexity patterns
|
| 26 |
- Cross-entropy analysis
|
|
|
|
| 32 |
|
| 33 |
self.model = None
|
| 34 |
self.tokenizer = None
|
| 35 |
+
self.params = perplexity_metric_params
|
| 36 |
|
| 37 |
|
| 38 |
def initialize(self) -> bool:
|
|
|
|
| 44 |
|
| 45 |
# Load GPT-2 model and tokenizer
|
| 46 |
model_manager = get_model_manager()
|
| 47 |
+
model_result = model_manager.load_model(model_name = "perplexity_reference_lm")
|
| 48 |
|
| 49 |
if isinstance(model_result, tuple):
|
| 50 |
self.model, self.tokenizer = model_result
|
|
|
|
| 67 |
Compute perplexity measures with FULL DOMAIN THRESHOLD INTEGRATION
|
| 68 |
"""
|
| 69 |
try:
|
| 70 |
+
if (not text or len(text.strip()) < self.params.MIN_TEXT_LENGTH_FOR_ANALYSIS):
|
| 71 |
+
return MetricResult(metric_name = self.name,
|
| 72 |
+
synthetic_probability = self.params.NEUTRAL_PROBABILITY,
|
| 73 |
+
authentic_probability = self.params.NEUTRAL_PROBABILITY,
|
| 74 |
+
hybrid_probability = self.params.MIN_PROBABILITY,
|
| 75 |
+
confidence = self.params.MIN_CONFIDENCE,
|
| 76 |
+
error = "Text too short for perplexity analysis",
|
| 77 |
)
|
| 78 |
|
| 79 |
# Get domain-specific thresholds
|
| 80 |
+
domain = kwargs.get('domain', Domain.GENERAL)
|
| 81 |
+
domain_thresholds = get_threshold_for_domain(domain)
|
| 82 |
+
perplexity_thresholds = domain_thresholds.perplexity
|
| 83 |
|
| 84 |
# Calculate comprehensive perplexity features
|
| 85 |
+
features = self._calculate_perplexity_features(text = text)
|
| 86 |
|
| 87 |
# Calculate raw perplexity score (0-1 scale)
|
| 88 |
+
raw_perplexity_score, confidence = self._analyze_perplexity_patterns(features = features)
|
| 89 |
|
| 90 |
# Apply domain-specific thresholds to convert raw score to probabilities
|
| 91 |
+
synthetic_prob, authentic_prob, hybrid_prob = self._apply_domain_thresholds(raw_score = raw_perplexity_score,
|
| 92 |
+
thresholds = perplexity_thresholds,
|
| 93 |
+
features = features,
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
# Apply confidence multiplier from domain thresholds
|
| 97 |
+
confidence *= perplexity_thresholds.confidence_multiplier
|
| 98 |
+
confidence = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, confidence))
|
| 99 |
|
| 100 |
+
return MetricResult(metric_name = self.name,
|
| 101 |
+
synthetic_probability = synthetic_prob,
|
| 102 |
+
authentic_probability = authentic_prob,
|
| 103 |
+
hybrid_probability = hybrid_prob,
|
| 104 |
+
confidence = confidence,
|
| 105 |
+
details = {**features,
|
| 106 |
+
'domain_used' : domain.value,
|
| 107 |
+
'synthetic_threshold': perplexity_thresholds.synthetic_threshold,
|
| 108 |
+
'authentic_threshold': perplexity_thresholds.authentic_threshold,
|
| 109 |
+
'raw_score' : raw_perplexity_score,
|
| 110 |
+
},
|
| 111 |
)
|
| 112 |
|
| 113 |
except Exception as e:
|
| 114 |
logger.error(f"Error in perplexity computation: {repr(e)}")
|
| 115 |
+
return self._default_result(error = str(e))
|
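Every metric's `compute` returns the shared `MetricResult` schema from `config.schemas`. The real definition lives in that module; the following is only a dataclass approximation, inferred from the fields used above:

```python
from dataclasses import dataclass, field
from typing import Any, Dict, Optional

@dataclass
class MetricResult:
    """Approximation of config.schemas.MetricResult, inferred from usage above."""
    metric_name: str
    synthetic_probability: float = 0.5   # P(machine-generated)
    authentic_probability: float = 0.5   # P(human-written)
    hybrid_probability: float = 0.0      # P(mixed authorship)
    confidence: float = 0.0              # 0-1 reliability of this metric's verdict
    details: Dict[str, Any] = field(default_factory=dict)
    error: Optional[str] = None

result = MetricResult(metric_name="perplexity", synthetic_probability=0.62,
                      authentic_probability=0.30, hybrid_probability=0.08,
                      confidence=0.71, details={"raw_score": 0.64})
```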
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
|
| 118 |
def _apply_domain_thresholds(self, raw_score: float, thresholds: Any, features: Dict[str, Any]) -> tuple:
|
| 119 |
"""
|
| 120 |
Apply domain-specific thresholds to convert raw score to probabilities
|
| 121 |
"""
|
| 122 |
+
synthetic_threshold = thresholds.synthetic_threshold
|
| 123 |
+
authentic_threshold = thresholds.authentic_threshold
|
| 124 |
|
| 125 |
# Calculate probabilities based on threshold distances
|
| 126 |
+
if (raw_score >= synthetic_threshold):
|
| 127 |
+
distance = raw_score - synthetic_threshold
|
| 128 |
+
synthetic_prob = self.params.STRONG_SYNTHETIC_BASE_PROB + distance * self.params.WEAK_PROBABILITY_ADJUSTMENT
|
| 129 |
+
authentic_prob = (self.params.MAX_PROBABILITY - self.params.STRONG_SYNTHETIC_BASE_PROB) - distance * self.params.WEAK_PROBABILITY_ADJUSTMENT
|
| 130 |
+
|
| 131 |
+
elif (raw_score <= authentic_threshold):
|
| 132 |
+
distance = authentic_threshold - raw_score
|
| 133 |
+
synthetic_prob = (self.params.MAX_PROBABILITY - self.params.STRONG_AUTHENTIC_BASE_PROB) - distance * self.params.WEAK_PROBABILITY_ADJUSTMENT
|
| 134 |
+
authentic_prob = self.params.STRONG_AUTHENTIC_BASE_PROB + distance * self.params.WEAK_PROBABILITY_ADJUSTMENT
|
| 135 |
+
|
|
|
|
|
|
|
| 136 |
else:
|
| 137 |
# Between thresholds - uncertain zone
|
| 138 |
+
range_width = synthetic_threshold - authentic_threshold
|
| 139 |
|
| 140 |
+
if (range_width > self.params.ZERO_TOLERANCE):
|
| 141 |
+
position_in_range = (raw_score - authentic_threshold) / range_width
|
| 142 |
+
synthetic_prob = self.params.UNCERTAIN_SYNTHETIC_RANGE_START + (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)
|
| 143 |
+
authentic_prob = self.params.UNCERTAIN_AUTHENTIC_RANGE_START - (position_in_range * self.params.UNCERTAIN_RANGE_WIDTH)
|
| 144 |
|
| 145 |
else:
|
| 146 |
+
synthetic_prob = self.params.NEUTRAL_PROBABILITY
|
| 147 |
+
authentic_prob = self.params.NEUTRAL_PROBABILITY
|
| 148 |
|
| 149 |
# Ensure probabilities are valid
|
| 150 |
+
synthetic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, synthetic_prob))
|
| 151 |
+
authentic_prob = max(self.params.MIN_PROBABILITY, min(self.params.MAX_PROBABILITY, authentic_prob))
|
| 152 |
|
| 153 |
+
# Calculate hybrid probability based on perplexity variance
|
| 154 |
+
hybrid_prob = self._calculate_hybrid_probability(features)
|
| 155 |
|
| 156 |
# Normalize to sum to 1.0
|
| 157 |
+
total = synthetic_prob + authentic_prob + hybrid_prob
|
| 158 |
|
| 159 |
+
if (total > self.params.ZERO_TOLERANCE):
|
| 160 |
+
synthetic_prob /= total
|
| 161 |
+
authentic_prob /= total
|
| 162 |
+
hybrid_prob /= total
|
| 163 |
|
| 164 |
+
return synthetic_prob, authentic_prob, hybrid_prob
|
| 165 |
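The three-zone mapping above is a piecewise-linear transform: past either threshold the winning probability ramps up from a base value, and between the thresholds the score is interpolated across an uncertain band. A runnable sketch; the base, slope, and band constants are stand-ins for the params values, and the hybrid share is added separately, as in the code above:

```python
def apply_thresholds(raw_score: float,
                     synthetic_threshold: float = 0.65,
                     authentic_threshold: float = 0.35) -> tuple:
    """Three-zone mapping of a raw 0-1 score to (synthetic_prob, authentic_prob).
    BASE / SLOPE and the uncertain-band numbers are stand-ins for the params values."""
    BASE, SLOPE = 0.70, 0.30
    if raw_score >= synthetic_threshold:              # confidently synthetic zone
        d = raw_score - synthetic_threshold
        syn, auth = BASE + d * SLOPE, (1.0 - BASE) - d * SLOPE
    elif raw_score <= authentic_threshold:            # confidently authentic zone
        d = authentic_threshold - raw_score
        syn, auth = (1.0 - BASE) - d * SLOPE, BASE + d * SLOPE
    else:                                             # uncertain zone: linear interpolation
        pos = (raw_score - authentic_threshold) / (synthetic_threshold - authentic_threshold)
        syn, auth = 0.35 + pos * 0.30, 0.65 - pos * 0.30
    clamp = lambda p: max(0.01, min(0.99, p))
    return clamp(syn), clamp(auth)

for score in (0.20, 0.50, 0.80):
    print(score, apply_thresholds(score))
```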
|
| 166 |
|
| 167 |
def _calculate_perplexity_features(self, text: str) -> Dict[str, Any]:
|
|
|
|
| 183 |
|
| 184 |
for sentence in sentences:
|
| 185 |
# Minimum sentence length
|
| 186 |
+
if (len(sentence.strip()) > self.params.MIN_SENTENCE_LENGTH):
|
| 187 |
sent_perplexity = self._calculate_perplexity(sentence)
|
| 188 |
|
| 189 |
+
if (sent_perplexity > self.params.ZERO_TOLERANCE):
|
| 190 |
sentence_perplexities.append(sent_perplexity)
|
| 191 |
valid_sentences += 1
|
| 192 |
|
|
|
|
| 204 |
max_sentence_perplexity = overall_perplexity
|
| 205 |
|
| 206 |
# Chunk-based analysis for whole-text understanding
|
| 207 |
+
chunk_perplexities = self._calculate_chunk_perplexity(text)
|
| 208 |
perplexity_variance = np.var(chunk_perplexities) if chunk_perplexities else 0.0
|
| 209 |
avg_chunk_perplexity = np.mean(chunk_perplexities) if chunk_perplexities else overall_perplexity
|
| 210 |
|
|
|
|
| 230 |
|
| 231 |
def _calculate_perplexity(self, text: str) -> float:
|
| 232 |
"""
|
| 233 |
+
Calculate perplexity for given text using GPT-2 : Lower perplexity = more predictable = more synthetic-like
|
| 234 |
"""
|
| 235 |
try:
|
| 236 |
# Check text length before tokenization
|
| 237 |
+
if (len(text.strip()) < self.params.MIN_SENTENCE_LENGTH // 2):
|
| 238 |
return 0.0
|
| 239 |
|
| 240 |
# Tokenize the text
|
| 241 |
encodings = self.tokenizer(text,
|
| 242 |
return_tensors = 'pt',
|
| 243 |
truncation = True,
|
| 244 |
+
max_length = self.params.MAX_TOKEN_LENGTH,
|
| 245 |
)
|
| 246 |
|
| 247 |
input_ids = encodings.input_ids
|
| 248 |
|
| 249 |
# Minimum tokens
|
| 250 |
+
if ((input_ids.numel() == 0) or (input_ids.size(1) < self.params.MIN_TOKENS_FOR_PERPLEXITY)):
|
| 251 |
return 0.0
|
| 252 |
|
| 253 |
# Calculate loss (cross-entropy)
|
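The elided tail of this hunk converts the model's mean token cross-entropy into perplexity. Under the standard `transformers` API that step looks roughly as follows; the `gpt2` checkpoint name is an assumption about what the registry resolves `perplexity_reference_lm` to:

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def perplexity(text: str, max_length: int = 512) -> float:
    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    with torch.no_grad():
        # labels=input_ids makes the model return the mean token cross-entropy as .loss
        out = model(enc.input_ids, labels=enc.input_ids)
    return float(torch.exp(out.loss))   # perplexity = exp(cross-entropy)

print(perplexity("The quick brown fox jumps over the lazy dog."))
```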
|
|
|
| 269 |
"""
|
| 270 |
Split text into sentences
|
| 271 |
"""
|
| 272 |
+
sentences = re.split(self.params.SENTENCE_SPLIT_PATTERN, text)
|
| 273 |
+
return [s.strip() for s in sentences if s.strip() and len(s.strip()) > self.params.MIN_SENTENCE_LENGTH // 2]
|
| 274 |
|
| 275 |
|
| 276 |
+
def _calculate_chunk_perplexity(self, text: str) -> List[float]:
|
| 277 |
"""
|
| 278 |
Calculate perplexity across text chunks for whole-text analysis
|
| 279 |
"""
|
| 280 |
chunks = list()
|
| 281 |
words = text.split()
|
| 282 |
+
chunk_size = self.params.CHUNK_SIZE_WORDS
|
| 283 |
+
overlap = int(chunk_size * self.params.CHUNK_OVERLAP_RATIO)
|
| 284 |
|
| 285 |
# Ensure we have enough words for meaningful chunks
|
| 286 |
if (len(words) < chunk_size // 2):
|
| 287 |
return [self._calculate_perplexity(text)] if text.strip() else []
|
| 288 |
|
| 289 |
# Create overlapping chunks for better analysis
|
| 290 |
+
step = max(1, chunk_size - overlap)
|
| 291 |
+
|
| 292 |
+
for i in range(0, len(words), step):
|
| 293 |
chunk = ' '.join(words[i:i + chunk_size])
|
| 294 |
|
| 295 |
# Minimum chunk size
|
| 296 |
+
if (len(chunk) > self.params.MIN_CHUNK_LENGTH):
|
| 297 |
perplexity = self._calculate_perplexity(chunk)
|
| 298 |
|
| 299 |
# Reasonable range check
|
| 300 |
+
if ((perplexity > self.params.ZERO_TOLERANCE) and (perplexity < self.params.LARGE_PERPLEXITY_THRESHOLD)):
|
| 301 |
chunks.append(perplexity)
|
| 302 |
|
| 303 |
+
# Zero perplexity is impossible in practice and would bias the score, so fall back to DEFAULT_OVERALL_PERPLEXITY
|
| 304 |
+
return chunks if chunks else [self.params.DEFAULT_OVERALL_PERPLEXITY]
|
| 305 |
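Chunking with a fractional overlap, as above, amounts to striding through the word list with `step = chunk_size - overlap`. A self-contained sketch; the chunk size and overlap ratio are assumed values for `CHUNK_SIZE_WORDS` / `CHUNK_OVERLAP_RATIO`:

```python
def overlapping_chunks(text: str, chunk_size: int = 100, overlap_ratio: float = 0.5) -> list:
    """Word-level chunks with fractional overlap; chunk_size / overlap_ratio stand in
    for CHUNK_SIZE_WORDS / CHUNK_OVERLAP_RATIO (actual values assumed)."""
    words = text.split()
    overlap = int(chunk_size * overlap_ratio)
    step = max(1, chunk_size - overlap)       # guard against a zero or negative stride
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), step)]

chunks = overlapping_chunks("word " * 250)
print(len(chunks), [len(c.split()) for c in chunks])   # 5 [100, 100, 100, 100, 50]
```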
|
| 306 |
|
| 307 |
def _normalize_perplexity(self, perplexity: float) -> float:
|
| 308 |
"""
|
| 309 |
Normalize perplexity using sigmoid transformation
|
| 310 |
|
| 311 |
+
Lower perplexity = higher normalized score = more synthetic-like
|
| 312 |
"""
|
| 313 |
+
# Logistic (sigmoid) squash: low perplexity maps to a high, synthetic-leaning score
|
| 314 |
+
normalized = 1.0 / (1.0 + np.exp((perplexity - self.params.PERPLEXITY_SIGMOID_CENTER) / self.params.PERPLEXITY_SIGMOID_SCALE))
|
| 315 |
|
| 316 |
return normalized
|
| 317 |
|
|
|
|
| 321 |
Calculate cross-entropy as an alternative measure
|
| 322 |
"""
|
| 323 |
try:
|
| 324 |
+
encodings = self.tokenizer(text,
|
| 325 |
+
return_tensors = 'pt',
|
| 326 |
+
truncation = True,
|
| 327 |
+
max_length = self.params.MAX_TOKEN_LENGTH)
|
| 328 |
input_ids = encodings.input_ids
|
| 329 |
|
| 330 |
if (input_ids.numel() == 0):
|
|
|
|
| 334 |
outputs = self.model(input_ids, labels = input_ids)
|
| 335 |
loss = outputs.loss
|
| 336 |
|
| 337 |
+
# Normalize cross-entropy to 0-1 scale
|
| 338 |
cross_entropy = loss.item()
|
| 339 |
+
normalized_ce = min(1.0, cross_entropy / self.params.MAX_CROSS_ENTROPY)
|
| 340 |
|
| 341 |
return normalized_ce
|
| 342 |
|
|
|
|
| 347 |
|
| 348 |
def _analyze_perplexity_patterns(self, features: Dict[str, Any]) -> tuple:
|
| 349 |
"""
|
| 350 |
+
Analyze perplexity patterns to determine RAW perplexity score (0-1 scale) : Higher score = more synthetic-like
|
| 351 |
"""
|
| 352 |
# Check feature validity first
|
| 353 |
required_features = ['normalized_perplexity', 'perplexity_variance', 'std_sentence_perplexity', 'cross_entropy_score']
|
| 354 |
|
| 355 |
+
valid_features = [features.get(feat, 0) for feat in required_features if features.get(feat, 0) > self.params.ZERO_TOLERANCE]
|
| 356 |
|
| 357 |
+
if (len(valid_features) < self.params.MIN_REQUIRED_FEATURES):
|
| 358 |
# Low confidence if insufficient features
|
| 359 |
+
return self.params.NEUTRAL_PROBABILITY, self.params.LOW_FEATURE_CONFIDENCE
|
| 360 |
|
| 361 |
|
| 362 |
+
# Initialize synthetic_indicator list
|
| 363 |
+
synthetic_indicators = list()
|
| 364 |
|
| 365 |
+
# Low overall perplexity suggests synthetic
|
| 366 |
+
if (features['normalized_perplexity'] > self.params.NORMALIZED_PERPLEXITY_HIGH_THRESHOLD):
|
| 367 |
+
# Very synthetic-like
|
| 368 |
+
synthetic_indicators.append(self.params.STRONG_SYNTHETIC_WEIGHT)
|
| 369 |
|
| 370 |
+
elif (features['normalized_perplexity'] > self.params.NORMALIZED_PERPLEXITY_MEDIUM_THRESHOLD):
|
| 371 |
+
# Moderately synthetic-like
|
| 372 |
+
synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)
|
| 373 |
|
| 374 |
else:
|
| 375 |
+
# Authentic-like
|
| 376 |
+
synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)
|
| 377 |
|
| 378 |
+
# Low perplexity variance suggests synthetic (consistent predictability)
|
| 379 |
+
if (features['perplexity_variance'] < self.params.PERPLEXITY_VARIANCE_LOW_THRESHOLD):
|
| 380 |
+
synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)
|
| 381 |
|
| 382 |
+
elif (features['perplexity_variance'] < self.params.PERPLEXITY_VARIANCE_MEDIUM_THRESHOLD):
|
| 383 |
+
synthetic_indicators.append(self.params.WEAK_SYNTHETIC_WEIGHT)
|
| 384 |
|
| 385 |
else:
|
| 386 |
+
synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)
|
| 387 |
|
| 388 |
+
# Low sentence perplexity std suggests synthetic (consistent across sentences)
|
| 389 |
+
if (features['std_sentence_perplexity'] < self.params.STD_SENTENCE_PERPLEXITY_LOW_THRESHOLD):
|
| 390 |
+
synthetic_indicators.append(self.params.STRONG_SYNTHETIC_WEIGHT)
|
| 391 |
|
| 392 |
+
elif (features['std_sentence_perplexity'] < self.params.STD_SENTENCE_PERPLEXITY_MEDIUM_THRESHOLD):
|
| 393 |
+
synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)
|
| 394 |
|
| 395 |
else:
|
| 396 |
+
synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)
|
| 397 |
|
| 398 |
+
# Low cross-entropy suggests synthetic (more predictable)
|
| 399 |
+
if (features['cross_entropy_score'] < self.params.CROSS_ENTROPY_LOW_THRESHOLD):
|
| 400 |
+
synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)
|
| 401 |
|
| 402 |
+
elif (features['cross_entropy_score'] < self.params.CROSS_ENTROPY_MEDIUM_THRESHOLD):
|
| 403 |
+
synthetic_indicators.append(self.params.WEAK_SYNTHETIC_WEIGHT)
|
| 404 |
|
| 405 |
else:
|
| 406 |
+
synthetic_indicators.append(self.params.MINIMAL_SYNTHETIC_WEIGHT)
|
| 407 |
|
| 408 |
+
# Consistent chunk perplexity suggests synthetic
|
| 409 |
chunk_variance = features['perplexity_variance']
|
| 410 |
|
| 411 |
+
if (chunk_variance < self.params.CHUNK_VARIANCE_VERY_LOW_THRESHOLD):
|
| 412 |
+
synthetic_indicators.append(self.params.STRONG_SYNTHETIC_WEIGHT)
|
| 413 |
|
| 414 |
+
elif (chunk_variance < self.params.CHUNK_VARIANCE_LOW_THRESHOLD):
|
| 415 |
+
synthetic_indicators.append(self.params.MEDIUM_SYNTHETIC_WEIGHT)
|
| 416 |
|
| 417 |
else:
|
| 418 |
+
synthetic_indicators.append(self.params.VERY_WEAK_SYNTHETIC_WEIGHT)
|
| 419 |
|
| 420 |
# Calculate raw score and confidence
|
| 421 |
+
raw_score = np.mean(synthetic_indicators) if synthetic_indicators else self.params.NEUTRAL_PROBABILITY
|
| 422 |
+
confidence = max(self.params.MIN_CONFIDENCE, min(self.params.MAX_CONFIDENCE, 1.0 - (np.std(synthetic_indicators) / self.params.CONFIDENCE_STD_NORMALIZER)))
|
|
|
|
|
|
|
| 423 |
return raw_score, confidence
|
| 424 |
|
| 425 |
|
| 426 |
+
def _calculate_hybrid_probability(self, features: Dict[str, Any]) -> float:
|
| 427 |
"""
|
| 428 |
+
Calculate probability of hybrid synthetic/authentic content
|
| 429 |
"""
|
| 430 |
+
hybrid_indicators = list()
|
| 431 |
|
| 432 |
# Moderate perplexity values might indicate mixing
|
| 433 |
+
if (self.params.NORMALIZED_PERPLEXITY_MIXED_MIN <= features['normalized_perplexity'] <= self.params.NORMALIZED_PERPLEXITY_MIXED_MAX):
|
| 434 |
+
hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)
|
| 435 |
|
| 436 |
else:
|
| 437 |
+
hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)
|
| 438 |
|
| 439 |
# High perplexity variance suggests mixed content
|
| 440 |
+
if (features['perplexity_variance'] > self.params.PERPLEXITY_VARIANCE_HIGH_THRESHOLD):
|
| 441 |
+
hybrid_indicators.append(self.params.MODERATE_HYBRID_WEIGHT)
|
| 442 |
|
| 443 |
+
elif (features['perplexity_variance'] > self.params.PERPLEXITY_VARIANCE_MEDIUM_THRESHOLD):
|
| 444 |
+
hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)
|
| 445 |
|
| 446 |
else:
|
| 447 |
+
hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)
|
| 448 |
|
| 449 |
# Inconsistent sentence perplexities
|
| 450 |
+
if (self.params.STD_SENTENCE_PERPLEXITY_MIXED_MIN <= features['std_sentence_perplexity'] <= self.params.STD_SENTENCE_PERPLEXITY_MIXED_MAX):
|
| 451 |
+
hybrid_indicators.append(self.params.WEAK_HYBRID_WEIGHT)
|
| 452 |
|
| 453 |
else:
|
| 454 |
+
hybrid_indicators.append(self.params.MINIMAL_HYBRID_WEIGHT)
|
| 455 |
|
| 456 |
+
hybrid_prob = np.mean(hybrid_indicators) if hybrid_indicators else 0.0
|
| 457 |
+
return min(self.params.MAX_HYBRID_PROBABILITY, hybrid_prob)
|
| 458 |
|
| 459 |
|
| 460 |
def _get_default_features(self) -> Dict[str, Any]:
|
| 461 |
"""
|
| 462 |
Return default features when analysis is not possible
|
| 463 |
"""
|
| 464 |
+
return {"overall_perplexity" : self.params.DEFAULT_OVERALL_PERPLEXITY,
|
| 465 |
+
"normalized_perplexity" : self.params.DEFAULT_NORMALIZED_PERPLEXITY,
|
| 466 |
+
"avg_sentence_perplexity" : self.params.DEFAULT_AVG_SENTENCE_PERPLEXITY,
|
| 467 |
+
"std_sentence_perplexity" : self.params.DEFAULT_STD_SENTENCE_PERPLEXITY,
|
| 468 |
+
"min_sentence_perplexity" : self.params.DEFAULT_MIN_SENTENCE_PERPLEXITY,
|
| 469 |
+
"max_sentence_perplexity" : self.params.DEFAULT_MAX_SENTENCE_PERPLEXITY,
|
| 470 |
+
"perplexity_variance" : self.params.DEFAULT_PERPLEXITY_VARIANCE,
|
| 471 |
+
"avg_chunk_perplexity" : self.params.DEFAULT_AVG_CHUNK_PERPLEXITY,
|
| 472 |
+
"cross_entropy_score" : self.params.DEFAULT_CROSS_ENTROPY_SCORE,
|
| 473 |
"num_sentences_analyzed" : 0,
|
| 474 |
"num_chunks_analyzed" : 0,
|
| 475 |
}
|
|
|
|
| 486 |
|
| 487 |
|
| 488 |
# Export
|
| 489 |
+
__all__ = ["PerplexityMetric"]
|
metrics/semantic_analysis.py
CHANGED
|
@@ hunks: -6,10 · -59,118 · -179,23 · -204,8 · -217,30 · -249,7 · -258,27 · -287,30 · -322,38 · -372,150 · -532,4 @@
[Deleted-side lines in these hunks are truncated by the diff viewer. Recoverable removals: a `from config.…` import (cut mid-path), `from metrics.base_metric import MetricResult`, the old `ai_probability` / `human_probability` / `mixed_probability` fields, and inline magic numbers replaced by `semantic_analysis_params` constants. The full post-change file is rendered below.]
|
| 6 |
from typing import List
|
| 7 |
from loguru import logger
|
| 8 |
from collections import Counter
|
| 9 |
+
from config.enums import Domain
|
| 10 |
+
from config.schemas import MetricResult
|
| 11 |
from metrics.base_metric import BaseMetric
|
|
|
|
| 12 |
from models.model_manager import get_model_manager
|
| 13 |
+
from config.constants import semantic_analysis_params
|
| 14 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 15 |
from config.threshold_config import get_threshold_for_domain
|
| 16 |
|
|
|
|
| 60 |
Compute semantic analysis measures with FULL DOMAIN THRESHOLD INTEGRATION
|
| 61 |
"""
|
| 62 |
try:
|
| 63 |
+
params = semantic_analysis_params
|
| 64 |
+
|
| 65 |
+
if (not text or (len(text.strip()) < params.MIN_TEXT_LENGTH_FOR_ANALYSIS)):
|
| 66 |
+
return self._default_result(error = "Text too short for semantic analysis")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# Get domain-specific thresholds
|
| 69 |
+
domain = kwargs.get('domain', Domain.GENERAL)
|
| 70 |
+
domain_thresholds = get_threshold_for_domain(domain)
|
| 71 |
+
semantic_thresholds = domain_thresholds.semantic
|
| 72 |
|
| 73 |
# Calculate comprehensive semantic features
|
| 74 |
+
features = self._calculate_semantic_features(text)
|
| 75 |
|
| 76 |
# Calculate raw semantic score (0-1 scale)
|
| 77 |
+
raw_semantic_score, confidence = self._analyze_semantic_patterns(features)
|
| 78 |
|
| 79 |
# Apply domain-specific thresholds to convert raw score to probabilities
|
| 80 |
+
synthetic_prob, authentic_prob, hybrid_prob = self._apply_domain_thresholds(raw_score = raw_semantic_score,
|
| 81 |
+
thresholds = semantic_thresholds,
|
| 82 |
+
features = features,
|
| 83 |
+
)
|
| 84 |
|
| 85 |
# Apply confidence multiplier from domain thresholds
|
| 86 |
+
confidence *= semantic_thresholds.confidence_multiplier
|
| 87 |
+
confidence = max(params.MIN_CONFIDENCE, min(params.MAX_CONFIDENCE, confidence))
|
| 88 |
|
| 89 |
+
return MetricResult(metric_name = self.name,
|
| 90 |
+
synthetic_probability = synthetic_prob,
|
| 91 |
+
authentic_probability = authentic_prob,
|
| 92 |
+
hybrid_probability = hybrid_prob,
|
| 93 |
+
confidence = confidence,
|
| 94 |
+
details = {**features,
|
| 95 |
+
'domain_used' : domain.value,
|
| 96 |
+
'synthetic_threshold' : semantic_thresholds.synthetic_threshold,
|
| 97 |
+
'authentic_threshold' : semantic_thresholds.authentic_threshold,
|
| 98 |
+
'raw_score' : raw_semantic_score,
|
| 99 |
+
},
|
| 100 |
)
|
| 101 |
|
| 102 |
except Exception as e:
|
| 103 |
logger.error(f"Error in semantic analysis computation: {repr(e)}")
|
| 104 |
+
return self._default_result(error = str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
def _apply_domain_thresholds(self, raw_score: float, thresholds: Any, features: Dict[str, Any]) -> tuple:
|
| 108 |
"""
|
| 109 |
Apply domain-specific thresholds to convert raw score to probabilities
|
| 110 |
"""
|
| 111 |
+
params = semantic_analysis_params
|
| 112 |
+
synthetic_threshold = thresholds.synthetic_threshold
|
| 113 |
+
authentic_threshold = thresholds.authentic_threshold
|
| 114 |
|
| 115 |
# Calculate probabilities based on threshold distances
|
| 116 |
+
if (raw_score >= synthetic_threshold):
|
| 117 |
+
# Above synthetic threshold - strongly synthetic
|
| 118 |
+
distance_from_threshold = raw_score - synthetic_threshold
|
| 119 |
+
synthetic_prob = params.STRONG_SYNTHETIC_BASE_PROB + (distance_from_threshold * params.WEAK_PROBABILITY_ADJUSTMENT)
|
| 120 |
+
authentic_prob = (params.MAX_PROBABILITY - params.STRONG_SYNTHETIC_BASE_PROB) - (distance_from_threshold * params.WEAK_PROBABILITY_ADJUSTMENT)
|
| 121 |
+
|
| 122 |
+
elif (raw_score <= authentic_threshold):
|
| 123 |
+
# Below authentic threshold - strongly authentic
|
| 124 |
+
distance_from_threshold = authentic_threshold - raw_score
|
| 125 |
+
synthetic_prob = (params.MAX_PROBABILITY - params.STRONG_AUTHENTIC_BASE_PROB) - (distance_from_threshold * params.WEAK_PROBABILITY_ADJUSTMENT)
|
| 126 |
+
authentic_prob = params.STRONG_AUTHENTIC_BASE_PROB + (distance_from_threshold * params.WEAK_PROBABILITY_ADJUSTMENT)
|
| 127 |
+
|
| 128 |
else:
|
| 129 |
# Between thresholds - uncertain zone
|
| 130 |
+
range_width = synthetic_threshold - authentic_threshold
|
| 131 |
+
|
| 132 |
+
if (range_width > params.ZERO_TOLERANCE):
|
| 133 |
+
position_in_range = (raw_score - authentic_threshold) / range_width
|
| 134 |
+
synthetic_prob = params.UNCERTAIN_SYNTHETIC_RANGE_START + (position_in_range * params.UNCERTAIN_RANGE_WIDTH)
|
| 135 |
+
authentic_prob = params.UNCERTAIN_AUTHENTIC_RANGE_START - (position_in_range * params.UNCERTAIN_RANGE_WIDTH)
|
| 136 |
|
| 137 |
else:
|
| 138 |
+
synthetic_prob = params.NEUTRAL_PROBABILITY
|
| 139 |
+
authentic_prob = params.NEUTRAL_PROBABILITY
|
| 140 |
|
| 141 |
# Ensure probabilities are valid
|
| 142 |
+
synthetic_prob = max(params.MIN_PROBABILITY, min(params.MAX_PROBABILITY, synthetic_prob))
|
| 143 |
+
authentic_prob = max(params.MIN_PROBABILITY, min(params.MAX_PROBABILITY, authentic_prob))
|
| 144 |
|
| 145 |
+
# Calculate hybrid probability based on semantic variance
|
| 146 |
+
hybrid_prob = self._calculate_hybrid_probability(features = features)
|
| 147 |
|
| 148 |
# Normalize to sum to 1.0
|
| 149 |
+
total = synthetic_prob + authentic_prob + hybrid_prob
|
| 150 |
|
| 151 |
+
if (total > params.ZERO_TOLERANCE):
|
| 152 |
+
synthetic_prob /= total
|
| 153 |
+
authentic_prob /= total
|
| 154 |
+
hybrid_prob /= total
|
| 155 |
|
| 156 |
+
return synthetic_prob, authentic_prob, hybrid_prob
|
| 157 |
|
| 158 |
|
| 159 |
def _calculate_semantic_features(self, text: str) -> Dict[str, Any]:
|
| 160 |
"""
|
| 161 |
Calculate comprehensive semantic analysis features
|
| 162 |
"""
|
| 163 |
+
params = semantic_analysis_params
|
| 164 |
+
|
| 165 |
# Split text into sentences
|
| 166 |
sentences = self._split_sentences(text)
|
| 167 |
|
| 168 |
+
if (len(sentences) < params.MIN_SENTENCES_FOR_ANALYSIS):
|
| 169 |
return self._get_default_features()
|
| 170 |
|
| 171 |
# Calculate semantic embeddings for all sentences
|
| 172 |
+
sentence_embeddings, valid_sentences = self._get_sentence_embeddings(sentences = sentences)
|
| 173 |
|
| 174 |
if sentence_embeddings is None:
|
| 175 |
return self._get_default_features()
|
|
|
|
| 178 |
similarity_matrix = cosine_similarity(sentence_embeddings)
|
| 179 |
|
| 180 |
# Calculate various semantic metrics
|
| 181 |
+
coherence_score = self._calculate_coherence(similarity_matrix = similarity_matrix)
|
| 182 |
+
consistency_score = self._calculate_consistency(similarity_matrix = similarity_matrix)
|
| 183 |
+
repetition_score = self._detect_repetition_patterns(sentences = valid_sentences,
|
| 184 |
+
similarity_matrix = similarity_matrix,
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
topic_drift_score = self._calculate_topic_drift(similarity_matrix = similarity_matrix)
|
| 188 |
+
contextual_consistency = self._calculate_contextual_consistency(sentences = sentences)
|
| 189 |
|
| 190 |
# Chunk-based analysis for whole-text understanding
|
| 191 |
+
chunk_coherence = self._calculate_chunk_coherence(text = text,
|
| 192 |
+
chunk_size = params.CHUNK_SIZE_WORDS,
|
| 193 |
+
)
|
| 194 |
|
| 195 |
return {"coherence_score" : round(coherence_score, 4),
|
| 196 |
"consistency_score" : round(consistency_score, 4),
|
| 197 |
"repetition_score" : round(repetition_score, 4),
|
| 198 |
"topic_drift_score" : round(topic_drift_score, 4),
|
| 199 |
"contextual_consistency" : round(contextual_consistency, 4),
|
| 200 |
+
"avg_chunk_coherence" : round(np.mean(chunk_coherence) if chunk_coherence else params.DEFAULT_COHERENCE, 4),
|
| 201 |
+
"coherence_variance" : round(np.var(chunk_coherence) if chunk_coherence else params.DEFAULT_COHERENCE_VARIANCE, 4),
|
| 202 |
+
"num_sentences" : len(valid_sentences),
|
| 203 |
"num_chunks_analyzed" : len(chunk_coherence),
|
| 204 |
}
|
| 205 |
|
|
|
|
| 208 |
"""
|
| 209 |
Split text into sentences
|
| 210 |
"""
|
| 211 |
+
sentences = re.split(semantic_analysis_params.SENTENCE_SPLIT_PATTERN, text)
|
| 212 |
+
return [s.strip() for s in sentences if s.strip() and len(s.strip()) > semantic_analysis_params.MIN_SENTENCE_LENGTH]
|
| 213 |
|
| 214 |
|
| 215 |
def _get_sentence_embeddings(self, sentences: List[str]) -> tuple:
|
|
|
|
| 221 |
return None, None
|
| 222 |
|
| 223 |
# Filter out very short sentences that might cause issues
|
| 224 |
+
valid_sentences = [s for s in sentences if len(s.strip()) > semantic_analysis_params.MIN_VALID_SENTENCE_LENGTH]
|
| 225 |
if not valid_sentences:
|
| 226 |
+
return None, None
|
| 227 |
|
| 228 |
# Encode sentences to get embeddings
|
| 229 |
embeddings = self.sentence_model.encode(valid_sentences)
|
| 230 |
|
| 231 |
# Check if embeddings are valid
|
| 232 |
if ((embeddings is None) or (len(embeddings) == 0)):
|
| 233 |
+
return None, None
|
| 234 |
|
| 235 |
+
return embeddings, valid_sentences
|
| 236 |
|
| 237 |
except Exception as e:
|
| 238 |
logger.warning(f"Sentence embedding failed: {repr(e)}")
|
| 239 |
+
return None, None
|
| 240 |
|
| 241 |
|
| 242 |
def _calculate_coherence(self, similarity_matrix: np.ndarray) -> float:
|
| 243 |
"""
|
| 244 |
Calculate overall text coherence : Higher coherence = more logically connected sentences
|
| 245 |
"""
|
| 246 |
+
params = semantic_analysis_params
|
| 247 |
+
|
| 248 |
+
if (similarity_matrix.size == 0):
|
| 249 |
+
return params.MIN_PROBABILITY
|
| 250 |
|
| 251 |
# Calculate average similarity between adjacent sentences
|
| 252 |
adjacent_similarities = list()
|
|
|
|
| 255 |
adjacent_similarities.append(similarity_matrix[i, i + 1])
|
| 256 |
|
| 257 |
if (not adjacent_similarities):
|
| 258 |
+
return params.MIN_PROBABILITY
|
| 259 |
|
| 260 |
return np.mean(adjacent_similarities)
|
| 261 |
|
|
|
|
| 264 |
"""
|
| 265 |
Calculate topic consistency throughout the text : Lower variance in similarities = more consistent
|
| 266 |
"""
|
| 267 |
+
params = semantic_analysis_params
|
| 268 |
+
|
| 269 |
if (similarity_matrix.size == 0):
|
| 270 |
+
return params.MIN_PROBABILITY
|
| 271 |
|
| 272 |
# Calculate variance of similarities (lower variance = more consistent)
|
| 273 |
all_similarities = similarity_matrix[np.triu_indices_from(similarity_matrix, k=1)]
|
| 274 |
if (len(all_similarities) == 0):
|
| 275 |
+
return params.MIN_PROBABILITY
|
| 276 |
|
| 277 |
variance = np.var(all_similarities)
|
| 278 |
# Convert to consistency score (higher = more consistent)
|
| 279 |
+
consistency = params.MAX_PROBABILITY - min(params.MAX_PROBABILITY, variance * params.SIMILARITY_VARIANCE_FACTOR)
|
| 280 |
|
| 281 |
+
return max(params.MIN_PROBABILITY, consistency)
|
| 282 |
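Coherence and consistency read two different things off that matrix: the mean similarity of adjacent sentences, and one minus the scaled variance of all pairwise similarities. A standalone sketch; the variance scaling factor is an assumed value for `SIMILARITY_VARIANCE_FACTOR`:

```python
import numpy as np

def coherence(sim: np.ndarray) -> float:
    """Mean similarity of adjacent sentences (higher = smoother flow)."""
    if sim.size == 0 or len(sim) < 2:
        return 0.0
    return float(np.mean([sim[i, i + 1] for i in range(len(sim) - 1)]))

def consistency(sim: np.ndarray, variance_factor: float = 4.0) -> float:
    """1 minus the scaled variance of all pairwise similarities
    (variance_factor stands in for SIMILARITY_VARIANCE_FACTOR)."""
    pairs = sim[np.triu_indices_from(sim, k=1)]
    if len(pairs) == 0:
        return 0.0
    return max(0.0, 1.0 - min(1.0, float(np.var(pairs)) * variance_factor))

sim = np.array([[1.0, 0.8, 0.3],
                [0.8, 1.0, 0.4],
                [0.3, 0.4, 1.0]])
print(coherence(sim), consistency(sim))   # 0.6, ~0.81
```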
|
| 283 |
|
| 284 |
def _detect_repetition_patterns(self, sentences: List[str], similarity_matrix: np.ndarray) -> float:
|
| 285 |
"""
|
| 286 |
Detect repetition patterns in semantic content : AI text sometimes shows more semantic repetition
|
| 287 |
"""
|
| 288 |
+
params = semantic_analysis_params
|
| 289 |
+
|
| 290 |
+
if (len(sentences) < params.MIN_SENTENCES_FOR_REPETITION):
|
| 291 |
+
return params.MIN_PROBABILITY
|
| 292 |
|
| 293 |
# Look for high similarity between non-adjacent sentences
|
| 294 |
repetition_count = 0
|
|
|
|
| 297 |
for i in range(len(sentences)):
|
| 298 |
for j in range(i + 2, len(sentences)): # Skip adjacent sentences
|
| 299 |
# High semantic similarity
|
| 300 |
+
if (similarity_matrix[i, j] > params.REPETITION_SIMILARITY_THRESHOLD):
|
| 301 |
repetition_count += 1
|
| 302 |
|
| 303 |
total_comparisons += 1
|
| 304 |
|
| 305 |
if (total_comparisons == 0):
|
| 306 |
+
return params.MIN_PROBABILITY
|
| 307 |
|
| 308 |
repetition_score = repetition_count / total_comparisons
|
| 309 |
|
| 310 |
# Scale to make differences more noticeable
|
| 311 |
+
return min(params.MAX_PROBABILITY, repetition_score * params.REPETITION_SCORE_SCALING)
|
| 312 |
|
| 313 |
|
| 314 |
def _calculate_topic_drift(self, similarity_matrix: np.ndarray) -> float:
|
| 315 |
"""
|
| 316 |
Calculate topic drift throughout the text : Higher drift = less focused content
|
| 317 |
"""
|
| 318 |
+
params = semantic_analysis_params
|
| 319 |
+
|
| 320 |
if (len(similarity_matrix) < 3):
|
| 321 |
+
return params.MIN_PROBABILITY
|
| 322 |
|
| 323 |
# Calculate similarity between beginning and end sections
|
| 324 |
+
start_size = min(params.START_SECTION_SIZE, len(similarity_matrix) // params.SECTION_SIZE_RATIO)
|
| 325 |
+
end_size = min(params.END_SECTION_SIZE, len(similarity_matrix) // params.SECTION_SIZE_RATIO)
|
| 326 |
|
| 327 |
start_indices = list(range(start_size))
|
| 328 |
end_indices = list(range(len(similarity_matrix) - end_size, len(similarity_matrix)))
|
|
|
|
| 334 |
cross_similarities.append(similarity_matrix[i, j])
|
| 335 |
|
| 336 |
if not cross_similarities:
|
| 337 |
+
return params.MIN_PROBABILITY
|
| 338 |
|
| 339 |
avg_cross_similarity = np.mean(cross_similarities)
|
| 340 |
# Lower similarity between start and end = higher topic drift
|
| 341 |
+
topic_drift = params.MAX_PROBABILITY - avg_cross_similarity
|
| 342 |
|
| 343 |
+
return max(params.MIN_PROBABILITY, topic_drift)
|
| 344 |
|
| 345 |
|
| 346 |
def _calculate_contextual_consistency(self, sentences: List[str]) -> float:
|
| 347 |
"""
|
| 348 |
Calculate contextual consistency using keyword and entity analysis
|
| 349 |
"""
|
| 350 |
+
params = semantic_analysis_params
|
| 351 |
+
|
| 352 |
+
if (len(sentences) < params.MIN_SENTENCES_FOR_ANALYSIS):
|
| 353 |
+
return params.MIN_PROBABILITY
|
| 354 |
|
| 355 |
# Simple keyword consistency analysis : Extract meaningful words (nouns, adjectives)
|
| 356 |
all_words = list()
|
| 357 |
|
| 358 |
for sentence in sentences:
|
| 359 |
+
words = re.findall(params.WORD_EXTRACTION_PATTERN, sentence.lower())
|
| 360 |
all_words.extend(words)
|
| 361 |
|
| 362 |
+
if (len(all_words) < params.MIN_WORDS_FOR_KEYWORD_ANALYSIS):
|
| 363 |
+
return params.MIN_PROBABILITY
|
| 364 |
|
| 365 |
# Calculate how consistently keywords are used across sentences
|
| 366 |
word_freq = Counter(all_words)
|
| 367 |
+
top_keywords = [word for word, count in word_freq.most_common(params.TOP_KEYWORDS_COUNT) if count > params.MIN_KEYWORD_FREQUENCY]
|
| 368 |
|
| 369 |
if not top_keywords:
|
| 370 |
+
return params.MIN_PROBABILITY
|
| 371 |
|
| 372 |
# Check if top keywords appear consistently across sentences
|
| 373 |
keyword_presence = list()
|
|
|
|
| 386 |
"""
|
| 387 |
Calculate coherence across text chunks for whole-text analysis
|
| 388 |
"""
|
| 389 |
+
params = semantic_analysis_params
|
| 390 |
chunks = list()
|
| 391 |
words = text.split()
|
| 392 |
|
| 393 |
# Create overlapping chunks
|
| 394 |
+
overlap = int(chunk_size * params.CHUNK_OVERLAP_RATIO)
|
| 395 |
+
|
| 396 |
+
for i in range(0, len(words), overlap):
|
| 397 |
chunk = ' '.join(words[i:i + chunk_size])
|
| 398 |
|
| 399 |
# Minimum chunk size
|
| 400 |
+
if (len(chunk) > params.MIN_CHUNK_LENGTH):
|
| 401 |
chunk_sentences = self._split_sentences(chunk)
|
| 402 |
|
| 403 |
+
if (len(chunk_sentences) >= params.MIN_SENTENCES_PER_CHUNK):
|
| 404 |
+
sentence_embeddings, valid_sentences = self._get_sentence_embeddings(sentences = chunk_sentences)
|
| 405 |
|
| 406 |
+
if ((sentence_embeddings is not None) and (len(sentence_embeddings) >= params.MIN_SENTENCES_PER_CHUNK)):
|
| 407 |
+
similarity_matrix = cosine_similarity(sentence_embeddings)
|
| 408 |
coherence = self._calculate_coherence(similarity_matrix)
|
| 409 |
chunks.append(coherence)
|
| 410 |
|
| 411 |
+
return chunks if chunks else [params.DEFAULT_COHERENCE]
|
| 412 |
|
| 413 |
|
| 414 |
def _analyze_semantic_patterns(self, features: Dict[str, Any]) -> tuple:
|
| 415 |
"""
|
| 416 |
Analyze semantic patterns to determine RAW semantic score (0-1 scale)
|
| 417 |
"""
|
| 418 |
+
params = semantic_analysis_params
|
| 419 |
+
|
| 420 |
# Check feature validity first
|
| 421 |
required_features = ['coherence_score', 'consistency_score', 'repetition_score', 'topic_drift_score', 'coherence_variance']
|
| 422 |
|
| 423 |
+
valid_features = [features.get(feat, params.MIN_PROBABILITY) for feat in required_features if features.get(feat, params.MIN_PROBABILITY) > params.ZERO_TOLERANCE]
|
| 424 |
|
| 425 |
+
if (len(valid_features) < params.MIN_REQUIRED_FEATURES):
|
| 426 |
# Low confidence if insufficient features
|
| 427 |
+
return params.NEUTRAL_PROBABILITY, params.LOW_FEATURE_CONFIDENCE
|
|
|
|
| 428 |
|
| 429 |
+
# Initialize synthetic indicator list
|
| 430 |
+
synthetic_indicators = list()
|
| 431 |
|
| 432 |
# AI text often has very high coherence (too perfect)
|
| 433 |
+
if (features['coherence_score'] > params.COHERENCE_HIGH_THRESHOLD):
|
| 434 |
# Suspiciously high coherence
|
| 435 |
+
synthetic_indicators.append(params.STRONG_SYNTHETIC_WEIGHT)
|
| 436 |
+
|
| 437 |
+
elif (features['coherence_score'] > params.COHERENCE_MEDIUM_THRESHOLD):
|
| 438 |
# Moderate coherence
|
| 439 |
+
synthetic_indicators.append(params.MEDIUM_SYNTHETIC_WEIGHT)
|
| 440 |
+
|
| 441 |
else:
|
| 442 |
# Low coherence - more human-like
|
| 443 |
+
synthetic_indicators.append(params.LOW_SYNTHETIC_WEIGHT)
|
| 444 |
|
| 445 |
# Very high consistency suggests AI (unnaturally consistent)
|
| 446 |
+
if (features['consistency_score'] > params.CONSISTENCY_HIGH_THRESHOLD):
|
| 447 |
+
synthetic_indicators.append(params.STRONG_SYNTHETIC_WEIGHT)
|
| 448 |
+
|
| 449 |
+
elif (features['consistency_score'] > params.CONSISTENCY_MEDIUM_THRESHOLD):
|
| 450 |
+
synthetic_indicators.append(params.MODERATE_SYNTHETIC_WEIGHT)
|
| 451 |
+
|
| 452 |
else:
|
| 453 |
+
synthetic_indicators.append(params.VERY_LOW_SYNTHETIC_WEIGHT)
|
| 454 |
|
| 455 |
# High repetition suggests AI
|
| 456 |
+
if (features['repetition_score'] > params.REPETITION_HIGH_THRESHOLD):
|
| 457 |
+
synthetic_indicators.append(params.MODERATE_SYNTHETIC_WEIGHT)
|
| 458 |
+
|
| 459 |
+
elif (features['repetition_score'] > params.REPETITION_MEDIUM_THRESHOLD):
|
| 460 |
+
synthetic_indicators.append(params.VERY_WEAK_SYNTHETIC_WEIGHT)
|
| 461 |
+
|
| 462 |
else:
|
| 463 |
+
synthetic_indicators.append(params.LOW_SYNTHETIC_WEIGHT)
|
| 464 |
|
| 465 |
# Very low topic drift suggests AI (stays too focused)
|
| 466 |
+
if (features['topic_drift_score'] < params.TOPIC_DRIFT_LOW_THRESHOLD):
|
| 467 |
+
synthetic_indicators.append(params.MODERATE_SYNTHETIC_WEIGHT)
|
| 468 |
+
|
| 469 |
+
elif (features['topic_drift_score'] < params.TOPIC_DRIFT_MEDIUM_THRESHOLD):
|
| 470 |
+
synthetic_indicators.append(params.WEAK_SYNTHETIC_WEIGHT)
|
| 471 |
+
|
| 472 |
else:
|
| 473 |
+
synthetic_indicators.append(params.VERY_LOW_SYNTHETIC_WEIGHT)
|
| 474 |
|
| 475 |
# Low coherence variance across chunks suggests AI
|
| 476 |
+
if (features['coherence_variance'] < params.COHERENCE_VARIANCE_LOW_THRESHOLD):
|
| 477 |
+
synthetic_indicators.append(params.MODERATE_SYNTHETIC_WEIGHT)
|
| 478 |
+
|
| 479 |
+
elif (features['coherence_variance'] < params.COHERENCE_VARIANCE_MEDIUM_THRESHOLD):
|
| 480 |
+
synthetic_indicators.append(params.VERY_WEAK_SYNTHETIC_WEIGHT)
|
| 481 |
+
|
| 482 |
else:
|
| 483 |
+
synthetic_indicators.append(params.LOW_SYNTHETIC_WEIGHT)
|
| 484 |
|
| 485 |
# Calculate raw score and confidence
|
| 486 |
+
if synthetic_indicators:
|
| 487 |
+
raw_score = np.mean(synthetic_indicators)
|
| 488 |
+
confidence = params.MAX_PROBABILITY - (np.std(synthetic_indicators) / params.CONFIDENCE_STD_NORMALIZER)
|
| 489 |
+
confidence = max(params.MIN_CONFIDENCE, min(params.MAX_CONFIDENCE, confidence))
|
| 490 |
+
|
| 491 |
+
else:
|
| 492 |
+
raw_score = params.NEUTRAL_PROBABILITY
|
| 493 |
+
confidence = params.NEUTRAL_CONFIDENCE
|
| 494 |
|
| 495 |
return raw_score, confidence
|
| 496 |
|
| 497 |
|
| 498 |
+
def _calculate_hybrid_probability(self, features: Dict[str, Any]) -> float:
|
| 499 |
"""
|
| 500 |
+
Calculate probability of hybrid synthetic/authentic content
|
| 501 |
"""
|
| 502 |
mixed_indicators = list()
|
| 503 |
+
params = semantic_analysis_params
|
| 504 |
|
| 505 |
# Moderate coherence values might indicate mixing
|
| 506 |
+
if (params.COHERENCE_MIXED_MIN <= features['coherence_score'] <= params.COHERENCE_MIXED_MAX):
|
| 507 |
+
mixed_indicators.append(params.WEAK_HYBRID_WEIGHT)
|
| 508 |
+
|
| 509 |
else:
|
| 510 |
+
mixed_indicators.append(params.MIN_PROBABILITY)
|
| 511 |
|
| 512 |
# High coherence variance suggests mixed content
|
| 513 |
+
if (features['coherence_variance'] > params.COHERENCE_VARIANCE_HIGH_THRESHOLD):
|
| 514 |
+
mixed_indicators.append(params.MODERATE_HYBRID_WEIGHT)
|
| 515 |
+
|
| 516 |
+
elif (features['coherence_variance'] > params.COHERENCE_VARIANCE_MEDIUM_THRESHOLD):
|
| 517 |
+
mixed_indicators.append(params.WEAK_HYBRID_WEIGHT)
|
| 518 |
+
|
| 519 |
else:
|
| 520 |
+
mixed_indicators.append(params.MIN_PROBABILITY)
|
| 521 |
|
| 522 |
# Inconsistent repetition patterns
|
| 523 |
+
if (params.REPETITION_MIXED_MIN <= features['repetition_score'] <= params.REPETITION_MIXED_MAX):
|
| 524 |
+
mixed_indicators.append(params.WEAK_HYBRID_WEIGHT)
|
| 525 |
+
|
| 526 |
else:
|
| 527 |
+
mixed_indicators.append(params.MIN_PROBABILITY)
|
| 528 |
|
| 529 |
+
if mixed_indicators:
|
| 530 |
+
hybrid_prob = np.mean(mixed_indicators)
|
| 531 |
+
return min(params.MAX_HYBRID_PROBABILITY, hybrid_prob)
|
| 532 |
+
|
| 533 |
+
return params.MIN_PROBABILITY
|
| 534 |
|
| 535 |
|
| 536 |
def _get_default_features(self) -> Dict[str, Any]:
|
| 537 |
"""
|
| 538 |
Return default features when analysis is not possible
|
| 539 |
"""
|
| 540 |
+
params = semantic_analysis_params
|
| 541 |
+
|
| 542 |
+
return {"coherence_score" : params.DEFAULT_COHERENCE,
|
| 543 |
+
"consistency_score" : params.DEFAULT_CONSISTENCY,
|
| 544 |
+
"repetition_score" : params.DEFAULT_REPETITION,
|
| 545 |
+
"topic_drift_score" : params.DEFAULT_TOPIC_DRIFT,
|
| 546 |
+
"contextual_consistency" : params.DEFAULT_CONTEXTUAL_CONSISTENCY,
|
| 547 |
+
"avg_chunk_coherence" : params.DEFAULT_CHUNK_COHERENCE,
|
| 548 |
+
"coherence_variance" : params.DEFAULT_COHERENCE_VARIANCE,
|
| 549 |
"num_sentences" : 0,
|
| 550 |
"num_chunks_analyzed" : 0,
|
| 551 |
}
|
|
|
|
| 562 |
|
| 563 |
|
| 564 |
# Export
|
| 565 |
+
__all__ = ["SemanticAnalysisMetric"]
|
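For readers skimming the diff: `_calculate_coherence` reduces the sentence-similarity matrix to the mean cosine similarity between adjacent sentences, and `_calculate_consistency` inverts the clamped variance of all pairwise similarities. A minimal standalone sketch of the adjacent-coherence step (the embedding model name and the zero fallback are illustrative assumptions; the metric itself uses the class's `self.sentence_model`):

```python
# Sketch of adjacent-sentence coherence, assuming a small sentence-transformers model.
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def adjacent_coherence(sentences: list) -> float:
    if len(sentences) < 2:
        return 0.0                                   # assumed fallback for degenerate input
    model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model, not from this commit
    embeddings = model.encode(sentences)
    sim = cosine_similarity(embeddings)
    # Mean similarity between each sentence and its successor
    adjacent = [sim[i, i + 1] for i in range(len(sentences) - 1)]
    return float(np.mean(adjacent))
```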
metrics/structural.py
CHANGED
```diff
@@ -6,9 +6,10 @@ from typing import Dict
 from typing import List
 from loguru import logger
 from collections import Counter
+from config.enums import Domain
+from config.schemas import MetricResult
 from metrics.base_metric import StatisticalMetric
+from config.constants import structural_metric_params
 from config.threshold_config import get_threshold_for_domain
@@ -41,101 +42,98 @@ class StructuralMetric(StatisticalMetric):
     Returns:
     --------
-    { MetricResult } : MetricResult with
+    { MetricResult } : MetricResult with synthetic/authentic probabilities
     """
     try:
         # Get domain-specific thresholds
+        domain                = kwargs.get('domain', Domain.GENERAL)
+        domain_thresholds     = get_threshold_for_domain(domain)
+        structural_thresholds = domain_thresholds.structural

         # Extract all structural features
+        features = self._extract_features(text = text)

+        # Calculate raw synthetic probability based on features
+        raw_synthetic_score, confidence = self._calculate_synthetic_probability(features = features)

         # Apply domain-specific thresholds to convert raw score to probabilities
+        synthetic_prob, authentic_prob, hybrid_prob = self._apply_domain_thresholds(raw_score  = raw_synthetic_score,
+                                                                                    thresholds = structural_thresholds,
+                                                                                    features   = features,
+                                                                                    )

         # Apply confidence multiplier from domain thresholds
+        confidence *= structural_thresholds.confidence_multiplier
+        confidence  = max(structural_metric_params.MIN_CONFIDENCE, min(structural_metric_params.MAX_CONFIDENCE, confidence))
+
+        return MetricResult(metric_name           = self.name,
+                            synthetic_probability = synthetic_prob,
+                            authentic_probability = authentic_prob,
+                            hybrid_probability    = hybrid_prob,
+                            confidence            = confidence,
+                            details               = {**features,
+                                                     'domain_used'        : domain.value,
+                                                     'synthetic_threshold': structural_thresholds.synthetic_threshold,
+                                                     'authentic_threshold': structural_thresholds.authentic_threshold,
+                                                     'raw_score'          : raw_synthetic_score,
+                                                     },
+                            )

     except Exception as e:
         logger.error(f"Error in {self.name} computation: {repr(e)}")
-        return
-               ai_probability    = 0.5,
-               human_probability = 0.5,
-               mixed_probability = 0.0,
-               confidence        = 0.0,
-               error             = str(e),
-               )
+        return self._default_result(error = str(e))


 def _apply_domain_thresholds(self, raw_score: float, thresholds: Any, features: Dict[str, Any]) -> tuple:
     """
     Apply domain-specific thresholds to convert raw score to probabilities
     """
+    params              = structural_metric_params
+    synthetic_threshold = thresholds.synthetic_threshold
+    authentic_threshold = thresholds.authentic_threshold

     # Calculate probabilities based on threshold distances
+    if (raw_score >= synthetic_threshold):
+        # Above synthetic threshold - strongly synthetic
+        distance_from_threshold = raw_score - synthetic_threshold
+        synthetic_prob          = params.STRONG_SYNTHETIC_BASE_PROB + (distance_from_threshold * params.WEAK_PROBABILITY_ADJUSTMENT)
+        authentic_prob          = (params.MAX_PROBABILITY - params.STRONG_SYNTHETIC_BASE_PROB) - (distance_from_threshold * params.WEAK_PROBABILITY_ADJUSTMENT)
+
+    elif (raw_score <= authentic_threshold):
+        # Below authentic threshold - strongly authentic
+        distance_from_threshold = authentic_threshold - raw_score
+        synthetic_prob          = (params.MAX_PROBABILITY - params.STRONG_AUTHENTIC_BASE_PROB) - (distance_from_threshold * params.WEAK_PROBABILITY_ADJUSTMENT)
+        authentic_prob          = params.STRONG_AUTHENTIC_BASE_PROB + (distance_from_threshold * params.WEAK_PROBABILITY_ADJUSTMENT)

     else:
         # Between thresholds - uncertain zone
+        range_width = synthetic_threshold - authentic_threshold

+        if (range_width > params.ZERO_TOLERANCE):
+            position_in_range = (raw_score - authentic_threshold) / range_width
+            synthetic_prob    = params.UNCERTAIN_SYNTHETIC_RANGE_START + (position_in_range * params.UNCERTAIN_RANGE_WIDTH)
+            authentic_prob    = params.UNCERTAIN_AUTHENTIC_RANGE_START - (position_in_range * params.UNCERTAIN_RANGE_WIDTH)

         else:
+            synthetic_prob = params.NEUTRAL_PROBABILITY
+            authentic_prob = params.NEUTRAL_PROBABILITY

     # Ensure probabilities are valid
+    synthetic_prob = max(params.MIN_PROBABILITY, min(params.MAX_PROBABILITY, synthetic_prob))
+    authentic_prob = max(params.MIN_PROBABILITY, min(params.MAX_PROBABILITY, authentic_prob))

+    # Calculate hybrid probability based on statistical patterns
+    hybrid_prob = self._calculate_hybrid_probability(features = features)

     # Normalize to sum to 1.0
+    total = synthetic_prob + authentic_prob + hybrid_prob

+    if (total > params.ZERO_TOLERANCE):
+        synthetic_prob /= total
+        authentic_prob /= total
+        hybrid_prob    /= total

+    return synthetic_prob, authentic_prob, hybrid_prob
@@ -143,43 +141,55 @@ class StructuralMetric(StatisticalMetric):
 def _extract_features(self, text: str) -> Dict[str, Any]:
     Extract all structural features from text
     """
     # Basic tokenization
-    sentences = self._split_sentences(text)
-    words     = self._tokenize_words(text)
+    sentences = self._split_sentences(text = text)
+    words     = self._tokenize_words(text = text)

     # Sentence-level features
     sentence_lengths = [len(s.split()) for s in sentences]
+    avg_sentence_length = np.mean(sentence_lengths) if sentence_lengths else structural_metric_params.ZERO_VALUE
+    std_sentence_length = np.std(sentence_lengths) if len(sentence_lengths) > structural_metric_params.MIN_SENTENCE_LENGTH_FOR_STD else structural_metric_params.ZERO_VALUE

     # Word-level features
     word_lengths = [len(w) for w in words]
+    avg_word_length = np.mean(word_lengths) if word_lengths else structural_metric_params.ZERO_VALUE
+    std_word_length = np.std(word_lengths) if len(word_lengths) > structural_metric_params.MIN_WORD_LENGTH_FOR_STD else structural_metric_params.ZERO_VALUE

     # Vocabulary richness
     vocabulary_size = len(set(words))
+    type_token_ratio = vocabulary_size / len(words) if words else structural_metric_params.ZERO_VALUE

     # Punctuation analysis
+    punctuation_density = self._calculate_punctuation_density(text = text)
+    comma_frequency     = text.count(',') / len(words) if words else structural_metric_params.ZERO_VALUE

     # Burstiness (variation in patterns)
+    burstiness = self._calculate_burstiness(values = sentence_lengths)

     # Uniformity scores
+    if (avg_sentence_length > structural_metric_params.ZERO_TOLERANCE):
+        length_uniformity = structural_metric_params.MAX_PROBABILITY - (std_sentence_length / avg_sentence_length)
+        length_uniformity = max(structural_metric_params.MIN_PROBABILITY, min(structural_metric_params.MAX_PROBABILITY, length_uniformity))
+
+    else:
+        length_uniformity = structural_metric_params.MIN_PROBABILITY

     # Readability approximation (simplified)
+    readability = self._calculate_readability(text      = text,
+                                              sentences = sentences,
+                                              words     = words,
+                                              )

     # Pattern detection
+    repetition_score = self._detect_repetitive_patterns(words = words)

     # N-gram analysis
+    bigram_diversity  = self._calculate_ngram_diversity(words = words,
+                                                        n     = structural_metric_params.BIGRAM_N,
+                                                        )
+
+    trigram_diversity = self._calculate_ngram_diversity(words = words,
+                                                        n     = structural_metric_params.TRIGRAM_N,
+                                                        )

     return {"avg_sentence_length" : round(avg_sentence_length, 2),
             "std_sentence_length" : round(std_sentence_length, 2),
@@ -204,8 +214,7 @@ class StructuralMetric(StatisticalMetric):
     """
     Split text into sentences
     """
-    sentences = re.split(r'[.!?]+', text)
+    sentences = re.split(structural_metric_params.SENTENCE_SPLIT_PATTERN, text)

     return [s.strip() for s in sentences if s.strip()]
@@ -214,8 +223,7 @@ class StructuralMetric(StatisticalMetric):
     """
     Tokenize text into words
     """
-    words = re.findall(r'\b\w+\b', text.lower())
+    words = re.findall(structural_metric_params.WORD_TOKENIZE_PATTERN, text.lower())

     return words
@@ -224,54 +232,53 @@ class StructuralMetric(StatisticalMetric):
     """
     Calculate punctuation density
     """
+    punctuation = re.findall(structural_metric_params.PUNCTUATION_PATTERN, text)
     total_chars = len(text)

+    return len(punctuation) / total_chars if total_chars > structural_metric_params.ZERO_TOLERANCE else structural_metric_params.ZERO_VALUE


 def _calculate_burstiness(self, values: List[float]) -> float:
     """
-    Calculate burstiness score (variation in patterns)
-    Higher burstiness typically indicates human writing
+    Calculate burstiness score (variation in patterns): Higher burstiness typically indicates human writing
     """
+    if (len(values) < structural_metric_params.MIN_VALUES_FOR_BURSTINESS):
+        return structural_metric_params.ZERO_VALUE

     mean_val = np.mean(values)
     std_val  = np.std(values)

+    if (mean_val < structural_metric_params.ZERO_TOLERANCE):
+        return structural_metric_params.ZERO_VALUE

     # Coefficient of variation
     cv = std_val / mean_val

     # Normalize to 0-1 range
+    burstiness = min(structural_metric_params.MAX_PROBABILITY, cv / structural_metric_params.BURSTINESS_NORMALIZATION_FACTOR)

     return burstiness


 def _calculate_readability(self, text: str, sentences: List[str], words: List[str]) -> float:
     """
-    Calculate simplified readability score
-    (Approximation of Flesch Reading Ease)
+    Calculate simplified readability score: Approximation of Flesch Reading Ease
     """
     if not sentences or not words:
+        return structural_metric_params.NEUTRAL_READABILITY_SCORE

     total_sentences = len(sentences)
     total_words     = len(words)
     total_syllables = sum(self._count_syllables(word) for word in words)

     # Flesch Reading Ease approximation
+    if ((total_sentences > structural_metric_params.ZERO_TOLERANCE) and (total_words > structural_metric_params.ZERO_TOLERANCE)):
+
+        score = (structural_metric_params.FLESCH_CONSTANT_1 - structural_metric_params.FLESCH_CONSTANT_2 * (total_words / total_sentences) - structural_metric_params.FLESCH_CONSTANT_3 * (total_syllables / total_words))
+
+        return max(structural_metric_params.MIN_READABILITY_SCORE, min(structural_metric_params.MAX_READABILITY_SCORE, score))

-    return 50.0
+    return structural_metric_params.NEUTRAL_READABILITY_SCORE
@@ -287,7 +294,7 @@ class StructuralMetric(StatisticalMetric):
     is_vowel = char in vowels
     if is_vowel and not previous_was_vowel:
         syllable_count += 1
+
     previous_was_vowel = is_vowel

     # Adjust for silent 'e'
@@ -306,11 +313,10 @@ class StructuralMetric(StatisticalMetric):
     Detect repetitive patterns in text
     AI text sometimes shows more repetition
     """
+    if (len(words) < structural_metric_params.MIN_WORDS_FOR_REPETITION):
+        return structural_metric_params.ZERO_VALUE

-    window_size = 10
+    window_size = structural_metric_params.REPETITION_WINDOW_SIZE
     repetitions = 0

     for i in range(len(words) - window_size):
@@ -321,128 +327,129 @@ class StructuralMetric(StatisticalMetric):
     # Normalize
     max_repetitions = (len(words) - window_size) * window_size
-    repetition_score = repetitions / max_repetitions if max_repetitions > 0 else 0
+
+    if (max_repetitions > structural_metric_params.ZERO_TOLERANCE):
+        repetition_score = repetitions / max_repetitions
+        return min(structural_metric_params.MAX_PROBABILITY, repetition_score)
+
+    return structural_metric_params.ZERO_VALUE


 def _calculate_ngram_diversity(self, words: List[str], n: int = 2) -> float:
     """
-    Calculate n-gram diversity
-    Higher diversity often indicates human writing
+    Calculate n-gram diversity: Higher diversity often indicates human writing
     """
+    if (len(words) < structural_metric_params.MIN_WORDS_FOR_NGRAM):
+        return structural_metric_params.ZERO_VALUE

     # Generate n-grams
     ngrams = [tuple(words[i:i+n]) for i in range(len(words) - n + 1)]
-
-    # Calculate diversity as ratio of unique n-grams to total n-grams
-    unique_ngrams = len(set(ngrams))
     total_ngrams = len(ngrams)

+    if total_ngrams > structural_metric_params.ZERO_TOLERANCE:
+        unique_ngrams = len(set(ngrams))
+        diversity     = unique_ngrams / total_ngrams
+        return min(structural_metric_params.MAX_PROBABILITY, diversity)

+    return structural_metric_params.ZERO_VALUE


+def _calculate_synthetic_probability(self, features: Dict[str, Any]) -> tuple:
     """
-    Calculate
-    Returns raw score and confidence
+    Calculate synthetic probability based on structural features: Returns raw score and confidence
     """
+    synthetic_indicators = list()
+    params               = structural_metric_params

-    # Low burstiness suggests
-    if (features['burstiness_score'] <
-        ai_indicators.append(0.7)
-
-    elif (features['burstiness_score'] < 0.5):
-        # Moderate AI indicator
-        ai_indicators.append(0.5)
+    # Low burstiness suggests synthetic (AI is more consistent)
+    if (features['burstiness_score'] < params.BURSTINESS_LOW_THRESHOLD):
+        synthetic_indicators.append(params.STRONG_SYNTHETIC_WEIGHT)
+
+    elif (features['burstiness_score'] < params.BURSTINESS_MEDIUM_THRESHOLD):
+        synthetic_indicators.append(params.MODERATE_SYNTHETIC_WEIGHT)

     else:
+        synthetic_indicators.append(params.WEAK_SYNTHETIC_WEIGHT)

-    # High length uniformity suggests AI
-    if (features['length_uniformity'] > 0.7):
-        # Strong AI indicator
-        ai_indicators.append(0.7)
-
-    elif (features['length_uniformity'] > 0.5):
-        # Moderate AI indicator
-        ai_indicators.append(0.5)
+    # High length uniformity suggests synthetic
+    if (features['length_uniformity'] > params.LENGTH_UNIFORMITY_HIGH_THRESHOLD):
+        synthetic_indicators.append(params.STRONG_SYNTHETIC_WEIGHT)
+
+    elif (features['length_uniformity'] > params.LENGTH_UNIFORMITY_MEDIUM_THRESH):
+        synthetic_indicators.append(params.MODERATE_SYNTHETIC_WEIGHT)

     else:
+        synthetic_indicators.append(params.WEAK_SYNTHETIC_WEIGHT)

-    # Low n-gram diversity suggests AI
-    if (features['bigram_diversity'] < 0.7):
-        # Moderate AI indicator
-        ai_indicators.append(0.6)
+    # Low n-gram diversity suggests synthetic
+    if (features['bigram_diversity'] < params.BIGRAM_DIVERSITY_LOW_THRESHOLD):
+        synthetic_indicators.append(params.MODERATE_SYNTHETIC_WEIGHT)

     else:
+        synthetic_indicators.append(params.VERY_WEAK_SYNTHETIC_WEIGHT)

-    # Moderate readability suggests AI (AI often produces "perfect" readability)
-    if (60 <= features['readability_score'] <= 75):
-        # Moderate AI indicator
-        ai_indicators.append(0.6)
+    # Moderate readability suggests synthetic (AI often produces "perfect" readability)
+    if (params.READABILITY_SYNTHETIC_MIN <= features['readability_score'] <= params.READABILITY_SYNTHETIC_MAX):
+        synthetic_indicators.append(params.MODERATE_SYNTHETIC_WEIGHT)

     else:
+        synthetic_indicators.append(params.VERY_WEAK_SYNTHETIC_WEIGHT)

-    # Low repetition suggests AI (AI avoids excessive repetition)
-    if (features['repetition_score'] < 0.1):
-        # Moderate AI indicator
-        ai_indicators.append(0.6)
-
-    elif (features['repetition_score'] < 0.2):
-        # Neutral
-        ai_indicators.append(0.5)
+    # Low repetition suggests synthetic (AI avoids excessive repetition)
+    if (features['repetition_score'] < params.REPETITION_LOW_THRESHOLD):
+        synthetic_indicators.append(params.MODERATE_SYNTHETIC_WEIGHT)
+
+    elif (features['repetition_score'] < params.REPETITION_MEDIUM_THRESHOLD):
+        synthetic_indicators.append(params.NEUTRAL_WEIGHT)

     else:
-        ai_indicators.append(0.3)
+        synthetic_indicators.append(params.WEAK_SYNTHETIC_WEIGHT)

     # Calculate raw score and confidence
+    if synthetic_indicators:
+        raw_score  = np.mean(synthetic_indicators)
+        confidence = params.MAX_PROBABILITY - min(params.MAX_PROBABILITY, np.std(synthetic_indicators) / params.CONFIDENCE_STD_NORMALIZER)
+        confidence = max(params.MIN_CONFIDENCE, min(params.MAX_CONFIDENCE, confidence))
+
+    else:
+        raw_score  = params.NEUTRAL_PROBABILITY
+        confidence = params.NEUTRAL_CONFIDENCE

     return raw_score, confidence


+def _calculate_hybrid_probability(self, features: Dict[str, Any]) -> float:
     """
-    Calculate probability of
+    Calculate probability of hybrid synthetic/authentic content based on structural patterns
     """
+    mixed_indicators = list()
+    params           = structural_metric_params

-    # High burstiness suggests
-    if features['burstiness_score'] >
-        mixed_indicators.append(
+    # High burstiness suggests hybrid content (inconsistent patterns)
+    if (features['burstiness_score'] > params.BURSTINESS_HIGH_THRESHOLD):
+        mixed_indicators.append(params.MODERATE_HYBRID_WEIGHT)

     # Inconsistent sentence lengths might indicate mixing
-    if (features['std_sentence_length'] > features['avg_sentence_length'] *
-        mixed_indicators.append(
+    if (features['avg_sentence_length'] > params.ZERO_TOLERANCE and features['std_sentence_length'] > features['avg_sentence_length'] * params.SENTENCE_LENGTH_VARIANCE_RATIO):
+        mixed_indicators.append(params.WEAK_HYBRID_WEIGHT)

     # Extreme values in multiple features might indicate mixing
     extreme_features = 0
-    if (features['type_token_ratio'] <
+    if (features['type_token_ratio'] < params.TYPE_TOKEN_RATIO_EXTREME_LOW) or (features['type_token_ratio'] > params.TYPE_TOKEN_RATIO_EXTREME_HIGH):
         extreme_features += 1
+
+    if (features['readability_score'] < params.READABILITY_EXTREME_LOW) or (features['readability_score'] > params.READABILITY_EXTREME_HIGH):
         extreme_features += 1

     if (extreme_features >= 2):
-        mixed_indicators.append(
+        mixed_indicators.append(params.WEAK_HYBRID_WEIGHT)

+    if mixed_indicators:
+        hybrid_prob = np.mean(mixed_indicators)
+        return min(params.MAX_HYBRID_PROBABILITY, hybrid_prob)
+
+    return params.MIN_PROBABILITY


 # Export
```
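Two of the structural signals above are worth unpacking. Burstiness is the coefficient of variation of sentence lengths (AI text tends to be flatter), and the readability score follows the Flesch Reading Ease form, which the `FLESCH_CONSTANT_*` values presumably encode. A sketch with the canonical Flesch coefficients (the burstiness normalization divisor of 2.0 is an assumption standing in for `BURSTINESS_NORMALIZATION_FACTOR`):

```python
# Sketch of the burstiness and readability computations from the diff above.
from typing import List
import numpy as np

def burstiness(sentence_lengths: List[int]) -> float:
    if len(sentence_lengths) < 2:
        return 0.0
    mean = float(np.mean(sentence_lengths))
    if mean <= 0.0:
        return 0.0
    cv = float(np.std(sentence_lengths)) / mean   # coefficient of variation
    return min(1.0, cv / 2.0)                     # divisor assumed, clamps to [0, 1]

def flesch_reading_ease(words: int, sentences: int, syllables: int) -> float:
    # Canonical Flesch Reading Ease: 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words)
    score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
    return max(0.0, min(100.0, score))            # clamp to the 0-100 scale
```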
models/__init__.py
CHANGED
```diff
@@ -1,13 +0,0 @@
-# DEPENDENCIES
-from .model_manager import *
-from .model_registry import *
-
-
-# Export everything
-__all__ = ["ModelCache",
-           "ModelManager",
-           "ModelRegistry",
-           "ModelUsageStats",
-           "get_model_manager",
-           "get_model_registry",
-           ]
```
models/model_manager.py
CHANGED
```diff
@@ -181,15 +181,7 @@ class ModelManager:
     """
     Check if model is already downloaded
     """
-    if not model_config:
-        return False
-
-    # Check if model exists in cache directory
-    model_path = self.cache_dir / model_config.model_id.replace("/", "_")
-
-    return model_path.exists() and model_name in self.metadata
+    return model_name in self.metadata


 def load_model(self, model_name: str, force_download: bool = False) -> Any:
@@ -211,6 +203,8 @@
     cached = self.cache.get(key = model_name)

     if cached is not None:
+        self.metadata[model_name]["last_used"] = datetime.now().isoformat()
+        self._save_metadata()
         return cached

     # Get model configuration
@@ -226,8 +220,8 @@
     if (model_config.model_type == ModelType.SENTENCE_TRANSFORMER):
         model = self._load_sentence_transformer(config = model_config)

-    elif (model_config.model_type == ModelType.
-        model = self.
+    elif (model_config.model_type == ModelType.LANGUAGE_MODEL):
+        model = self._load_language_model(config = model_config)

     elif (model_config.model_type == ModelType.CLASSIFIER):
         model = self._load_classifier(config = model_config)
@@ -295,7 +289,7 @@
     logger.info(f"Loading tokenizer for: {model_name}")

     try:
-        if (model_config.model_type in [ModelType.
+        if (model_config.model_type in [ModelType.LANGUAGE_MODEL,
                                         ModelType.CLASSIFIER,
                                         ModelType.SEQUENCE_CLASSIFICATION,
                                         ModelType.TRANSFORMER,
@@ -328,7 +322,7 @@
     return model

-def
+def _load_language_model(self, config: ModelConfig) -> tuple:
     """
     Load GPT-style model with tokenizer
     """
@@ -489,12 +483,20 @@
     raise ValueError(f"Unknown model: {model_name}")

     logger.info(f"Loading pipeline: {task} with {model_name}")
+
+    cache_key = f"{model_name}:{task}"
+    cached    = self.cache.get(cache_key)
+
+    if cached:
+        return cached

     pipe = pipeline(task         = task,
                     model        = model_config.model_id,
                     device       = 0 if self.device.type == "cuda" else -1,
                     model_kwargs = {"cache_dir": str(self.cache_dir)},
                     )
+
+    self.cache.put(cache_key, pipe)

     return pipe
@@ -549,7 +551,7 @@
     cache_folder = str(self.cache_dir),
     )

-elif (model_config.model_type == ModelType.
+elif (model_config.model_type == ModelType.LANGUAGE_MODEL):
     GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path = model_config.model_id,
                                     cache_dir                     = str(self.cache_dir),
                                     )
```
models/model_registry.py
CHANGED
```diff
@@ -1,6 +1,4 @@
 # DEPENDENCIES
-import gc
-import torch
 import threading
 from typing import Any
 from typing import Dict
@@ -8,36 +6,11 @@ from typing import List
 from loguru import logger
 from typing import Optional
 from datetime import datetime
-from dataclasses import dataclass
-from config.model_config import ModelConfig
+from config.schemas import ModelUsageStats
 from config.model_config import MODEL_REGISTRY
 from config.model_config import get_model_config


-@dataclass
-class ModelUsageStats:
-    """
-    Lightweight model usage statistics
-    """
-    model_name               : str
-    load_count               : int
-    last_used                : datetime
-    total_usage_time_seconds : float
-    avg_usage_time_seconds   : float
-
-
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Convert to dictionary
-        """
-        return {"model_name"               : self.model_name,
-                "load_count"               : self.load_count,
-                "last_used"                : self.last_used.isoformat(),
-                "total_usage_time_seconds" : round(self.total_usage_time_seconds, 2),
-                "avg_usage_time_seconds"   : round(self.avg_usage_time_seconds, 2),
-                }
-
-
 class ModelRegistry:
     """
     Model registry module for tracking model usage statistics and performance metrics
@@ -64,13 +37,24 @@
     """
     Initialize registry with all known models
     """
+    self.usage_stats.clear()
+    self.dependency_graph.clear()
+
     for model_name in MODEL_REGISTRY.keys():
+        config = get_model_config(model_name)
+
+        # Register usage stats
         self.usage_stats[model_name] = ModelUsageStats(model_name               = model_name,
+                                                       usage_count              = 0,
+                                                       last_used                = None,
+                                                       timed_usage_count        = 0,
                                                        total_usage_time_seconds = 0.0,
                                                        avg_usage_time_seconds   = 0.0,
                                                        )
+
+        # Register dependencies if defined
+        if config and config.additional_params.get("depends_on"):
+            self.dependency_graph[model_name] = config.additional_params["depends_on"]


 def record_model_usage(self, model_name: str, usage_time_seconds: float = 0.0):
@@ -87,21 +71,23 @@
     if model_name not in self.usage_stats:
         # Auto-register unknown models
         self.usage_stats[model_name] = ModelUsageStats(model_name               = model_name,
+                                                       usage_count              = 0,
+                                                       last_used                = datetime.utcnow(),
+                                                       timed_usage_count        = 0,
                                                        total_usage_time_seconds = 0.0,
                                                        avg_usage_time_seconds   = 0.0,
                                                        )

+    stats              = self.usage_stats[model_name]
+    stats.usage_count += 1
+    stats.last_used    = datetime.utcnow()

     if (usage_time_seconds > 0):
         stats.total_usage_time_seconds += usage_time_seconds
+        stats.timed_usage_count        += 1
+        stats.avg_usage_time_seconds    = (stats.total_usage_time_seconds / stats.timed_usage_count)

+    logger.debug(f"Recorded usage for {model_name} (count: {stats.usage_count})")


 def get_usage_stats(self, model_name: str) -> Optional[ModelUsageStats]:
@@ -118,7 +104,7 @@
     """
     with self.lock:
         sorted_models = sorted(self.usage_stats.values(),
-                               key     = lambda x: x.load_count,
+                               key     = lambda x: x.usage_count,
                                reverse = True,
                               )
@@ -193,10 +179,10 @@
     Generate a comprehensive usage report
     """
     with self.lock:
-        total_usage   = sum(stats.load_count for stats in self.usage_stats.values())
-        active_models = [name for name, stats in self.usage_stats.items() if stats.load_count > 0]
+        total_usage   = sum(stats.usage_count for stats in self.usage_stats.values())
+        active_models = [name for name, stats in self.usage_stats.items() if stats.usage_count > 0]

+        return {"timestamp" : datetime.utcnow().isoformat(),
                 "summary"   : {"total_models_tracked" : len(self.usage_stats),
                                "active_models"        : len(active_models),
                                "total_usage_count"    : total_usage,
@@ -219,8 +205,9 @@
     if model_name:
         if model_name in self.usage_stats:
             self.usage_stats[model_name] = ModelUsageStats(model_name               = model_name,
+                                                           usage_count              = 0,
+                                                           last_used                = datetime.utcnow(),
+                                                           timed_usage_count        = 0,
                                                            total_usage_time_seconds = 0.0,
                                                            avg_usage_time_seconds   = 0.0,
                                                            )
```
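The registry change above splits `usage_count` from `timed_usage_count`, so the running average is taken only over calls that actually reported a duration; untimed `record_model_usage()` calls no longer drag the average toward zero. A sketch of the accounting (field names mirror the new `ModelUsageStats` schema; the dataclass here is illustrative):

```python
# Sketch of the timed-vs-untimed usage accounting introduced in this commit.
from dataclasses import dataclass

@dataclass
class UsageStats:
    usage_count: int = 0                    # every recorded call
    timed_usage_count: int = 0              # only calls that reported a duration
    total_usage_time_seconds: float = 0.0
    avg_usage_time_seconds: float = 0.0

def record(stats: UsageStats, usage_time_seconds: float = 0.0) -> None:
    stats.usage_count += 1
    if usage_time_seconds > 0:
        stats.total_usage_time_seconds += usage_time_seconds
        stats.timed_usage_count += 1
        # Average over timed calls only, never over the full call count
        stats.avg_usage_time_seconds = (
            stats.total_usage_time_seconds / stats.timed_usage_count
        )
```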
processors/__init__.py
CHANGED
```diff
@@ -1,26 +0,0 @@
-# DEPENDENCIES
-from .text_processor import *
-from .language_detector import *
-from .domain_classifier import *
-from .document_extractor import *
-
-
-# Export everything
-__all__ = ["Script",
-           "Language",
-           "is_english",
-           "extract_text",
-           "quick_detect",
-           "TextProcessor",
-           "ProcessedText",
-           "quick_process",
-           "extract_words",
-           "LanguageDetector",
-           "DomainClassifier",
-           "DomainPrediction",
-           "extract_sentences",
-           "DocumentExtractor",
-           "ExtractedDocument",
-           "extract_from_upload",
-           "LanguageDetectionResult",
-           ]
```
processors/document_extractor.py
CHANGED

@@ -2,7 +2,6 @@
 import io
 import os
 import re
-import mimetypes
 from typing import Any
 from typing import Dict
 from typing import List
@@ -10,7 +9,8 @@ from pathlib import Path
 from typing import Tuple
 from loguru import logger
 from typing import Optional
-from dataclasses import dataclass
+from config.schemas import ExtractedDocument
+from config.constants import document_extraction_params
 
 
 # Document processing libraries
@@ -67,82 +67,38 @@ except ImportError:
     BS4_AVAILABLE = False
 
 
-@dataclass
-class ExtractedDocument:
-    """
-    Container for extracted document content with metadata
-    """
-    text              : str
-    file_path         : Optional[str]
-    file_type         : str
-    file_size_bytes   : int
-    page_count        : int
-    extraction_method : str
-    metadata          : Dict[str, Any]
-    is_success        : bool
-    error_message     : Optional[str]
-    warnings          : List[str]
-
-
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Convert to dictionary for JSON serialization
-        """
-        return {"text_length"       : len(self.text),
-                "file_type"         : self.file_type,
-                "file_size_bytes"   : self.file_size_bytes,
-                "page_count"        : self.page_count,
-                "extraction_method" : self.extraction_method,
-                "metadata"          : self.metadata,
-                "is_success"        : self.is_success,
-                "error_message"     : self.error_message,
-                "warnings"          : self.warnings,
-                }
-
-
 class DocumentExtractor:
     """
-    Extracts …
+    Extracts and normalizes textual content from heterogeneous document formats
+    for downstream text authentication and provenance analysis
 
     Supported Formats:
-    - Plain text (.txt, .md, .log)
-    - PDF documents (.pdf)
+    - Plain text (.txt, .md, .log, .csv)
+    - PDF documents (.pdf)
     - Microsoft Word (.doc, .docx)
     - Rich Text Format (.rtf)
     - HTML files (.html, .htm)
 
-    - …
-    - …
-    - Metadata …
-    - …
-
+    Design Principles:
+    - Loss-minimized text extraction
+    - Best-effort fallback strategy
+    - Metadata-preserving ingestion
+    - Format-agnostic downstream compatibility
     """
-    # Supported file extensions
-    SUPPORTED_EXTENSIONS = {'.txt', '.text', '.md', '.markdown', '.log', '.csv', '.pdf', '.docx', '.doc', '.rtf', '.html', '.htm'}
-
-    # Text file extensions
-    TEXT_EXTENSIONS = {'.txt', '.text', '.md', '.markdown', '.log', '.csv'}
-
-    # Maximum file size (50 MB default)
-    MAX_FILE_SIZE = 50 * 1024 * 1024
-
-
-    def __init__(self, max_file_size: int = MAX_FILE_SIZE, extract_metadata: bool = True):
+    def __init__(self, extract_metadata: bool = True):
         """
         Initialize document extractor
 
         Arguments:
         ----------
-        max_file_size    { int }  : Maximum file size in bytes
-
         extract_metadata { bool } : Extract document metadata
         """
-        self.max_file_size    = max_file_size
-        self.extract_metadata = extract_metadata
+        self.max_file_size        = document_extraction_params.MAX_FILE_SIZE
+        self.text_extensions      = document_extraction_params.TEXT_EXTENSIONS
+        self.supported_extensions = document_extraction_params.SUPPORTED_EXTENSIONS
+        self.extract_metadata     = extract_metadata
 
-        logger.info(f"DocumentExtractor initialized (max_size={max_file_size/1024/1024:.1f}MB)")
+        logger.info(f"DocumentExtractor initialized (max_size={self.max_file_size/1024/1024:.1f}MB)")
 
 
     def extract(self, file_path: str) -> ExtractedDocument:
@@ -173,7 +129,7 @@ class DocumentExtractor:
         file_ext = file_path.suffix.lower()
 
         # Route to appropriate extractor
-        if (file_ext in self.TEXT_EXTENSIONS):
+        if (file_ext in self.text_extensions):
             result = self._extract_text_file(file_path)
 
         elif (file_ext == '.pdf'):
@@ -227,7 +183,7 @@ class DocumentExtractor:
         # Determine file type
         file_ext = Path(filename).suffix.lower()
 
-        if file_ext not in self.SUPPORTED_EXTENSIONS:
+        if file_ext not in self.supported_extensions:
             return self._create_error_result(file_path = filename,
                                              error     = f"Unsupported file type: {file_ext}",
                                              )
@@ -239,7 +195,7 @@ class DocumentExtractor:
                                              )
 
         # Route to appropriate extractor
-        if (file_ext in self.TEXT_EXTENSIONS):
+        if (file_ext in self.text_extensions):
             result = self._extract_text_bytes(file_bytes, filename)
 
         elif (file_ext == '.pdf'):
@@ -508,7 +464,7 @@ class DocumentExtractor:
         # Primary: Try PyMuPDF first
         if PYPDF_AVAILABLE:
             try:
-                doc        = fitz.open(stream=file_bytes, filetype="pdf")
+                doc        = fitz.open(stream = file_bytes, filetype = "pdf")
                 page_count = doc.page_count
                 metadata   = doc.metadata
@@ -865,7 +821,7 @@ class DocumentExtractor:
             return False, f"File too large: {file_size/1024/1024:.1f}MB (max: {self.max_file_size/1024/1024:.1f}MB)"
 
         # Check file extension
-        if (file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS):
+        if (file_path.suffix.lower() not in self.supported_extensions):
             return False, f"Unsupported file type: {file_path.suffix}"
 
         return True, None
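`document_extraction_params` now centralizes the limits that used to be class constants. Its real definition lives in `config/constants.py` and is not shown in this section; what follows is a hypothetical sketch only, with the values carried over from the removed class attributes:

```python
from dataclasses import dataclass, field
from typing import Set


@dataclass(frozen = True)
class DocumentExtractionParams:
    # Values mirror the removed DocumentExtractor class constants
    MAX_FILE_SIZE        : int      = 50 * 1024 * 1024   # 50 MB
    TEXT_EXTENSIONS      : Set[str] = field(default_factory = lambda: {'.txt', '.text', '.md', '.markdown', '.log', '.csv'})
    SUPPORTED_EXTENSIONS : Set[str] = field(default_factory = lambda: {'.txt', '.text', '.md', '.markdown', '.log', '.csv', '.pdf', '.docx', '.doc', '.rtf', '.html', '.htm'})


document_extraction_params = DocumentExtractionParams()
```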
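End to end, the refactored extractor is used as below. The file path is illustrative, and the `ExtractedDocument` fields are assumed to match the schema that moved to `config/schemas.py`:

```python
from processors.document_extractor import DocumentExtractor

extractor = DocumentExtractor(extract_metadata = True)
doc       = extractor.extract("data/reports/sample.pdf")   # hypothetical path

if doc.is_success:
    print(f"{doc.extraction_method}: {doc.page_count} pages, {len(doc.text)} chars")
else:
    print(f"Extraction failed: {doc.error_message}")
```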
processors/domain_classifier.py
CHANGED

@@ -4,45 +4,36 @@ from typing import List
 from typing import Tuple
 from loguru import logger
 from typing import Optional
-from dataclasses import dataclass
-from config.…
+from config.enums import Domain
+from config.schemas import DomainPrediction
 from models.model_manager import get_model_manager
+from config.constants import domain_classification_params
 from config.threshold_config import interpolate_thresholds
 from config.threshold_config import get_threshold_for_domain
 
-
-@dataclass
-class DomainPrediction:
-    """
-    Result of domain classification
-    """
-    primary_domain   : Domain
-    secondary_domain : Optional[Domain]
-    confidence       : float
-    domain_scores    : Dict[str, float]
 
 class DomainClassifier:
     """
     Classifies text into domains using zero-shot classification
     """
-    # …
-    DOMAIN_LABELS = {Domain.ACADEMIC      : ["academic …
-                     Domain.CREATIVE      : ["creative …
-                     Domain.AI_ML         : ["…
-                     Domain.SOFTWARE_DEV  : ["…
-                     Domain.TECHNICAL_DOC : ["…
-                     Domain.ENGINEERING   : ["engineering …
-                     Domain.SCIENCE       : ["…
-                     Domain.BUSINESS      : ["business …
-                     Domain.JOURNALISM    : ["…
-                     Domain.SOCIAL_MEDIA  : ["…
-                     Domain.BLOG_PERSONAL : ["…
-                     Domain.LEGAL         : ["legal …
-                     Domain.MEDICAL       : ["medical …
-                     Domain.MARKETING     : ["marketing …
-                     Domain.TUTORIAL      : ["tutorial" …
-                     Domain.GENERAL       : ["general …
+    # Use constants from config - map string keys to Domain enum
+    DOMAIN_LABELS = {Domain.ACADEMIC      : domain_classification_params.DOMAIN_LABELS["academic"],
+                     Domain.CREATIVE      : domain_classification_params.DOMAIN_LABELS["creative"],
+                     Domain.AI_ML         : domain_classification_params.DOMAIN_LABELS["ai_ml"],
+                     Domain.SOFTWARE_DEV  : domain_classification_params.DOMAIN_LABELS["software_dev"],
+                     Domain.TECHNICAL_DOC : domain_classification_params.DOMAIN_LABELS["technical_doc"],
+                     Domain.ENGINEERING   : domain_classification_params.DOMAIN_LABELS["engineering"],
+                     Domain.SCIENCE       : domain_classification_params.DOMAIN_LABELS["science"],
+                     Domain.BUSINESS      : domain_classification_params.DOMAIN_LABELS["business"],
+                     Domain.JOURNALISM    : domain_classification_params.DOMAIN_LABELS["journalism"],
+                     Domain.SOCIAL_MEDIA  : domain_classification_params.DOMAIN_LABELS["social_media"],
+                     Domain.BLOG_PERSONAL : domain_classification_params.DOMAIN_LABELS["blog_personal"],
+                     Domain.LEGAL         : domain_classification_params.DOMAIN_LABELS["legal"],
+                     Domain.MEDICAL       : domain_classification_params.DOMAIN_LABELS["medical"],
+                     Domain.MARKETING     : domain_classification_params.DOMAIN_LABELS["marketing"],
+                     Domain.TUTORIAL      : domain_classification_params.DOMAIN_LABELS["tutorial"],
+                     Domain.GENERAL       : domain_classification_params.DOMAIN_LABELS["general"],
                      }
 
@@ -61,7 +52,7 @@ class DomainClassifier:
         logger.info("Initializing domain classifier...")
 
         # Load primary domain classifier (zero-shot)
-        self.primary_classifier = self.model_manager.load_model(model_name = "…
+        self.primary_classifier = self.model_manager.load_model(model_name = "content_domain_classifier")
 
         # Load fallback classifier
         try:
@@ -81,15 +72,15 @@ class DomainClassifier:
             return False
 
 
-    def classify(self, text: str, top_k: int = …
+    def classify(self, text: str, top_k: int = domain_classification_params.TOP_K_DOMAINS, min_confidence: float = domain_classification_params.MIN_CONFIDENCE_THRESHOLD) -> DomainPrediction:
         """
         Classify text into domain using zero-shot classification
 
         Arguments:
         ----------
-        text …
+        text           { str }   : Input text
 
-        top_k …
+        top_k          { int }   : Number of top domains to consider
 
         min_confidence { float } : Minimum confidence threshold
 
@@ -110,7 +101,7 @@ class DomainClassifier:
                                            )
 
         # If primary result meets confidence threshold, return it
-        if (primary_result.confidence >= min_confidence):
+        if (primary_result.evidence_strength >= min_confidence):
            return primary_result
 
         # If primary is low confidence but we have fallback, try fallback
@@ -122,7 +113,7 @@ class DomainClassifier:
                                             )
 
         # Use fallback if it has higher confidence
-        if fallback_result.confidence > primary_result.confidence:
+        if (fallback_result.evidence_strength > primary_result.evidence_strength):
             return fallback_result
 
         # Return primary result even if low confidence
@@ -152,7 +143,7 @@ class DomainClassifier:
        Classify using a zero-shot classification model
        """
        # Preprocess text
-        processed_text = …
+        processed_text = self._preprocess_text(text)
 
        # Get all candidate labels
        all_labels = list()
@@ -160,8 +151,9 @@ class DomainClassifier:
 
        for domain, labels in self.DOMAIN_LABELS.items():
            # Use the first label as the primary one for this domain
-            primary_label = …
+            primary_label = labels[0]
            all_labels.append(primary_label)
+
            label_to_domain[primary_label] = domain
 
        # Perform zero-shot classification
@@ -195,35 +187,48 @@ class DomainClassifier:
 
         secondary_domain = None
         secondary_score  = 0.0
+
+        # Use constant for secondary domain minimum score
+        secondary_min_score = domain_classification_params.SECONDARY_DOMAIN_MIN_SCORE
 
-        if ((len(sorted_domains) > 1) and (sorted_domains[1][1] >= …
+        if ((len(sorted_domains) > 1) and (sorted_domains[1][1] >= secondary_min_score)):
             secondary_domain = Domain(sorted_domains[1][0])
             secondary_score  = sorted_domains[1][1]
 
-        # Calculate …
-        …
+        # Calculate evidence_strength
+        evidence_strength = primary_score
+
+        # Use constants for mixed domain detection
+        high_conf_threshold = domain_classification_params.HIGH_CONFIDENCE_THRESHOLD
+        mixed_secondary_min = domain_classification_params.MIXED_DOMAIN_SECONDARY_MIN
+        mixed_ratio_thresh  = domain_classification_params.MIXED_DOMAIN_RATIO_THRESHOLD
+        mixed_conf_penalty  = domain_classification_params.MIXED_DOMAIN_CONFIDENCE_PENALTY
 
         # If we have mixed domains with close scores, adjust confidence
-        if (secondary_domain and (primary_score < …
+        if (secondary_domain and (primary_score < high_conf_threshold) and (secondary_score > mixed_secondary_min)):
+
             score_ratio = secondary_score / primary_score
 
             # Secondary is at least 60% of primary
-            if (score_ratio > …
+            if (score_ratio > mixed_ratio_thresh):
                 # Lower confidence for mixed domains
-                …
+                evidence_strength = ((primary_score + secondary_score) / 2 * mixed_conf_penalty)
                 logger.info(f"Mixed domain detected: {primary_domain.value} + {secondary_domain.value}, will use interpolated thresholds")
 
+        # Use constant for low confidence threshold
+        low_conf_threshold = domain_classification_params.LOW_CONFIDENCE_THRESHOLD
+
         # If primary score is low and we have a secondary, it's uncertain
-        …
-            # Reduce confidence
-            …
+        if ((primary_score < low_conf_threshold) and secondary_domain):
+            # Reduce confidence using penalty
+            evidence_strength *= mixed_conf_penalty
 
-        logger.info(f"{model_type.capitalize()} model classified domain: {primary_domain.value} (confidence: {…
+        logger.info(f"{model_type.capitalize()} model classified domain: {primary_domain.value} (confidence: {evidence_strength:.3f})")
 
-        return DomainPrediction(primary_domain   = …,
-                                secondary_domain = …,
-                                confidence       = …,
-                                domain_scores    = …,
+        return DomainPrediction(primary_domain    = primary_domain,
+                                secondary_domain  = secondary_domain,
+                                evidence_strength = evidence_strength,
+                                domain_scores     = avg_domain_scores,
                                 )
@@ -231,10 +236,12 @@ class DomainClassifier:
         """
         Preprocess text for classification
         """
-        # Truncate to reasonable length
-        …
+        # Truncate to reasonable length using constant
+        max_words = domain_classification_params.MAX_WORDS_FOR_CLASSIFICATION
+        words     = text.split()
+
+        if (len(words) > max_words):
+            text = ' '.join(words[:max_words])
 
         # Clean up text
         text = text.strip()
@@ -248,10 +255,10 @@ class DomainClassifier:
         """
         Get default prediction when classification fails
         """
-        return DomainPrediction(primary_domain   = …,
-                                secondary_domain = …,
-                                confidence       = …,
-                                domain_scores    = …,
+        return DomainPrediction(primary_domain    = Domain.GENERAL,
+                                secondary_domain  = None,
+                                evidence_strength = 0.5,
+                                domain_scores     = {Domain.GENERAL.value: 1.0},
                                 )
@@ -259,9 +266,15 @@ class DomainClassifier:
         """
         Get adaptive thresholds based on domain prediction
         """
-        …
+        # Use constants for threshold decisions
+        high_conf_threshold = domain_classification_params.HIGH_CONFIDENCE_THRESHOLD
+        med_conf_threshold  = domain_classification_params.MEDIUM_CONFIDENCE_THRESHOLD
+
+        # High confidence, single domain - use domain-specific thresholds
+        if ((domain_prediction.evidence_strength > high_conf_threshold) and (not domain_prediction.secondary_domain)):
             return get_threshold_for_domain(domain_prediction.primary_domain)
 
+        # Mixed domains - interpolate between primary and secondary
         if domain_prediction.secondary_domain:
             primary_score   = domain_prediction.domain_scores.get(domain_prediction.primary_domain.value, 0)
             secondary_score = domain_prediction.domain_scores.get(domain_prediction.secondary_domain.value, 0)
@@ -270,19 +283,21 @@ class DomainClassifier:
                 weight1 = primary_score / (primary_score + secondary_score)
 
             else:
-                weight1 = domain_prediction.confidence
+                weight1 = domain_prediction.evidence_strength
 
-            return interpolate_thresholds(domain1 = …,
-                                          domain2 = …,
-                                          weight1 = …,
+            return interpolate_thresholds(domain1 = domain_prediction.primary_domain,
+                                          domain2 = domain_prediction.secondary_domain,
+                                          weight1 = weight1,
                                           )
 
-        …
+        # Low/medium confidence - blend with general domain
+        if (domain_prediction.evidence_strength < med_conf_threshold):
             return interpolate_thresholds(domain1 = domain_prediction.primary_domain,
                                           domain2 = Domain.GENERAL,
-                                          weight1 = domain_prediction.confidence,
+                                          weight1 = domain_prediction.evidence_strength,
                                           )
 
+        # Default: use domain-specific thresholds
         return get_threshold_for_domain(domain_prediction.primary_domain)
@@ -295,8 +310,130 @@ class DomainClassifier:
         self.is_initialized = False
 
 
+def quick_classify(text: str, **kwargs) -> DomainPrediction:
+    """
+    Quick domain classification with default settings
+
+    Arguments:
+    ----------
+    text { str } : Input text
+
+    **kwargs     : Override settings
+
+    Returns:
+    --------
+    { DomainPrediction } : DomainPrediction object
+    """
+    classifier = DomainClassifier()
+    classifier.initialize()
+    return classifier.classify(text, **kwargs)
+
+
+def get_domain_name(domain: Domain) -> str:
+    """
+    Get human-readable domain name
+
+    Arguments:
+    ----------
+    domain { Domain } : Domain enum value
+
+    Returns:
+    --------
+    { str } : Human-readable domain name
+    """
+    domain_names = {Domain.ACADEMIC      : "Academic",
+                    Domain.CREATIVE      : "Creative Writing",
+                    Domain.AI_ML         : "AI/ML",
+                    Domain.SOFTWARE_DEV  : "Software Development",
+                    Domain.TECHNICAL_DOC : "Technical Documentation",
+                    Domain.ENGINEERING   : "Engineering",
+                    Domain.SCIENCE       : "Science",
+                    Domain.BUSINESS      : "Business",
+                    Domain.JOURNALISM    : "Journalism",
+                    Domain.SOCIAL_MEDIA  : "Social Media",
+                    Domain.BLOG_PERSONAL : "Personal Blog",
+                    Domain.LEGAL         : "Legal",
+                    Domain.MEDICAL       : "Medical",
+                    Domain.MARKETING     : "Marketing",
+                    Domain.TUTORIAL      : "Tutorial",
+                    Domain.GENERAL       : "General",
+                    }
+
+    return domain_names.get(domain, "Unknown")
+
+
+def is_technical_domain(domain: Domain) -> bool:
+    """
+    Check if domain is technical in nature
+
+    Arguments:
+    ----------
+    domain { Domain } : Domain enum value
+
+    Returns:
+    --------
+    { bool } : True if technical domain
+    """
+    technical_domains = {Domain.AI_ML,
+                         Domain.SOFTWARE_DEV,
+                         Domain.TECHNICAL_DOC,
+                         Domain.ENGINEERING,
+                         Domain.SCIENCE,
+                         }
+
+    return domain in technical_domains
+
+
+def is_creative_domain(domain: Domain) -> bool:
+    """
+    Check if domain is creative in nature
+
+    Arguments:
+    ----------
+    domain { Domain } : Domain enum value
+
+    Returns:
+    --------
+    { bool } : True if creative domain
+    """
+    creative_domains = {Domain.CREATIVE,
+                        Domain.JOURNALISM,
+                        Domain.SOCIAL_MEDIA,
+                        Domain.BLOG_PERSONAL,
+                        Domain.MARKETING,
+                        }
+
+    return domain in creative_domains
+
+
+def is_formal_domain(domain: Domain) -> bool:
+    """
+    Check if domain is formal in nature
+
+    Arguments:
+    ----------
+    domain { Domain } : Domain enum value
+
+    Returns:
+    --------
+    { bool } : True if formal domain
+    """
+    formal_domains = {Domain.ACADEMIC,
+                      Domain.LEGAL,
+                      Domain.MEDICAL,
+                      Domain.BUSINESS,
+                      }
+
+    return domain in formal_domains
+
+
 # Export
-__all__ = ["…
+__all__ = ["Domain",
+           "DomainClassifier",
            "DomainPrediction",
-           …
+           "quick_classify",
+           "get_domain_name",
+           "is_technical_domain",
+           "is_creative_domain",
+           "is_formal_domain",
+           ]
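To make the mixed-domain adjustment concrete, here is a worked example with illustrative threshold values; the real numbers live in `domain_classification_params` and may differ:

```python
# Worked example of the mixed-domain confidence adjustment above.
primary_score      = 0.48   # below an assumed HIGH_CONFIDENCE_THRESHOLD of 0.7
secondary_score    = 0.36   # above an assumed MIXED_DOMAIN_SECONDARY_MIN of 0.3
mixed_ratio_thresh = 0.6    # "secondary is at least 60% of primary"
mixed_conf_penalty = 0.85   # assumed penalty factor

score_ratio = secondary_score / primary_score               # 0.75 > 0.6 -> mixed domain
evidence_strength = (primary_score + secondary_score) / 2 * mixed_conf_penalty

print(round(score_ratio, 2), round(evidence_strength, 3))   # 0.75 0.357
```

Averaging the two scores and then applying the penalty deliberately reports lower evidence when the classifier cannot cleanly separate domains, which is what later triggers interpolated thresholds.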
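The new module-level helpers give a one-call entry point. A usage sketch with a made-up text sample; all names are taken from the diff above:

```python
from processors.domain_classifier import quick_classify, get_domain_name, is_technical_domain

prediction = quick_classify("We fine-tuned a transformer model on a labeled corpus of research abstracts.")

print(get_domain_name(prediction.primary_domain))       # e.g. "AI/ML"
print(is_technical_domain(prediction.primary_domain))   # True for AI/ML
print(round(prediction.evidence_strength, 3))
```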
processors/language_detector.py
CHANGED

@@ -2,13 +2,16 @@
 import re
 import torch
 import string
-from enum import Enum
 from typing import Dict
 from typing import List
 from typing import Tuple
 from loguru import logger
 from typing import Optional
+from config.enums import Script
 from dataclasses import dataclass
+from config.enums import Language
+from config.schemas import LanguageDetectionResult
+from config.constants import language_detection_params
 
 
 # Try to import optional libraries
@@ -32,152 +35,50 @@ except ImportError:
     MODEL_MANAGER_AVAILABLE = False
 
 
-class Language(Enum):
-    """
-    ISO 639-1 language codes for supported languages
-    """
-    ENGLISH    = "en"
-    SPANISH    = "es"
-    FRENCH     = "fr"
-    GERMAN     = "de"
-    ITALIAN    = "it"
-    PORTUGUESE = "pt"
-    RUSSIAN    = "ru"
-    CHINESE    = "zh"
-    JAPANESE   = "ja"
-    KOREAN     = "ko"
-    ARABIC     = "ar"
-    HINDI      = "hi"
-    DUTCH      = "nl"
-    POLISH     = "pl"
-    TURKISH    = "tr"
-    SWEDISH    = "sv"
-    VIETNAMESE = "vi"
-    INDONESIAN = "id"
-    THAI       = "th"
-    GREEK      = "el"
-    HEBREW     = "he"
-    CZECH      = "cs"
-    ROMANIAN   = "ro"
-    DANISH     = "da"
-    FINNISH    = "fi"
-    NORWEGIAN  = "no"
-    UNKNOWN    = "unknown"
-
-
-class Script(Enum):
-    """
-    Writing scripts
-    """
-    LATIN      = "latin"
-    CYRILLIC   = "cyrillic"
-    ARABIC     = "arabic"
-    CHINESE    = "chinese"
-    JAPANESE   = "japanese"
-    KOREAN     = "korean"
-    DEVANAGARI = "devanagari"
-    GREEK      = "greek"
-    HEBREW     = "hebrew"
-    THAI       = "thai"
-    MIXED      = "mixed"
-    UNKNOWN    = "unknown"
-
-
-@dataclass
-class LanguageDetectionResult:
-    """
-    Result of language detection
-    """
-    primary_language : Language
-    confidence       : float
-    all_languages    : Dict[str, float]  # language_code -> confidence
-    script           : Script
-    is_multilingual  : bool
-    detection_method : str
-    char_count       : int
-    word_count       : int
-    warnings         : List[str]
-
-
-    def to_dict(self) -> Dict:
-        """
-        Convert to dictionary
-        """
-        return {"primary_language" : self.primary_language.value,
-                "confidence"       : round(self.confidence, 4),
-                "all_languages"    : {k: round(v, 4) for k, v in self.all_languages.items()},
-                "script"           : self.script.value,
-                "is_multilingual"  : self.is_multilingual,
-                "detection_method" : self.detection_method,
-                "char_count"       : self.char_count,
-                "word_count"       : self.word_count,
-                "warnings"         : self.warnings,
-                }
-
-
 class LanguageDetector:
     """
     Detects the language of input text using multiple strategies with fallbacks.
 
     Features:
-    - …
-    - …
-    - …
-    - …
-    - Multi-language detection
-    - Script detection (Latin, Cyrillic, Arabic, etc.)
+    - Learned language representations (when available)
+    - Statistical language probability estimation
+    - Script and character distribution analysis
+    - Multi-signal aggregation with fallbacks
 
     Supported Languages:
-    - …
-    - …
+    - Broad multilingual coverage via learned language representations
+    - Deterministic support via script and statistical analysis
     """
-    # …
-    MIN_TEXT_LENGTH = …
-
-    # Language name mappings
-    LANGUAGE_NAMES = {"en": "English",
-                      "es": "Spanish",
-                      "fr": "French",
-                      "de": "German",
-                      "it": "Italian",
-                      "pt": "Portuguese",
-                      "ru": "Russian",
-                      "zh": "Chinese",
-                      "ja": "Japanese",
-                      "ko": "Korean",
-                      "ar": "Arabic",
-                      "hi": "Hindi",
-                      }
+    # Use constants from config
+    MIN_TEXT_LENGTH = language_detection_params.MINIMUM_TEXT_LENGTH
+    LANGUAGE_NAMES  = language_detection_params.LANGUAGE_NAMES
 
-    # …
-    SCRIPT_RANGES = {Script.LATIN      : […
-                     Script.CYRILLIC   : […
-                     Script.ARABIC     : […
-                     Script.CHINESE    : […
-                     Script.JAPANESE   : […
-                     Script.KOREAN     : […
-                     Script.DEVANAGARI : […
-                     Script.GREEK      : […
-                     Script.HEBREW     : […
-                     Script.THAI       : […
+    # Map Script enum to string keys for SCRIPT_RANGES
+    SCRIPT_RANGES = {Script.LATIN      : language_detection_params.SCRIPT_RANGES["latin"],
+                     Script.CYRILLIC   : language_detection_params.SCRIPT_RANGES["cyrillic"],
+                     Script.ARABIC     : language_detection_params.SCRIPT_RANGES["arabic"],
+                     Script.CHINESE    : language_detection_params.SCRIPT_RANGES["chinese"],
+                     Script.JAPANESE   : language_detection_params.SCRIPT_RANGES["japanese"],
+                     Script.KOREAN     : language_detection_params.SCRIPT_RANGES["korean"],
+                     Script.DEVANAGARI : language_detection_params.SCRIPT_RANGES["devanagari"],
+                     Script.GREEK      : language_detection_params.SCRIPT_RANGES["greek"],
+                     Script.HEBREW     : language_detection_params.SCRIPT_RANGES["hebrew"],
+                     Script.THAI       : language_detection_params.SCRIPT_RANGES["thai"],
                      }
 
 
-    def __init__(self, use_model: bool = True, …
+    def __init__(self, use_model: bool = True):
         """
         Initialize language detector
 
         Arguments:
         ----------
-        use_model …
-
-        min_confidence : Minimum confidence threshold
+        use_model : Use ML model for detection (more accurate)
         """
-        self.use_model = …
-        self.…
-        self.…
-        self.…
-        self.is_initialized = False
+        self.use_model      = use_model and MODEL_MANAGER_AVAILABLE
+        self.model_manager  = None
+        self.classifier     = None
+        self.is_initialized = False
 
         logger.info(f"LanguageDetector initialized (use_model={self.use_model})")
 
@@ -251,44 +152,45 @@ class LanguageDetector:
         # Try detection methods in order
         result = None
 
-        # Method 1
+        # Method 1: ML Model
         if self.use_model and self.is_initialized:
             try:
                 result                  = self._detect_with_model(text = cleaned_text)
-                result.detection_method = "…
+                result.detection_method = "learned-language-representation"
 
             except Exception as e:
                 logger.warning(f"Model detection failed: {repr(e)}, trying fallback")
                 warnings.append("Model detection failed, using fallback")
 
-        # Method 2
+        # Method 2: langdetect library
         if result is None and LANGDETECT_AVAILABLE:
             try:
                 result                  = self._detect_with_langdetect(text = cleaned_text)
-                result.detection_method = "…
+                result.detection_method = "statistical-language-estimation"
 
             except Exception as e:
                 logger.warning(f"langdetect failed: {repr(e)}, trying heuristics")
                 warnings.append("langdetect failed, using heuristics")
 
-        # Method 3
+        # Method 3: Character-based heuristics
         if result is None:
             result                  = self._detect_with_heuristics(cleaned_text, script)
-            result.detection_method = "character-…
+            result.detection_method = "character-distribution-analysis"
 
         # Add metadata
         result.script     = script
         result.char_count = char_count
         result.word_count = word_count
         result.warnings.extend(warnings)
 
-        # Check for multilingual content
-        …
+        # Check for multilingual content using constant
+        threshold = language_detection_params.MULTILINGUAL_THRESHOLD
+
+        if len([v for v in result.all_languages.values() if v > threshold]) > 1:
             result.is_multilingual = True
             warnings.append("Text appears to contain multiple languages")
 
-        logger.info(f"Detected language: {result.primary_language.value} (…
+        logger.info(f"Detected language: {result.primary_language.value} (evidence_strength: {result.evidence_strength:.2f}, method: {result.detection_method})")
 
         return result
 
@@ -328,7 +230,7 @@ class LanguageDetector:
             raise
 
 
-    def _split_text_into_chunks(self, text: str, max_chunk_length: int = …
+    def _split_text_into_chunks(self, text: str, max_chunk_length: int = language_detection_params.MAX_CHUNK_LENGTH, min_chunk_length: int = language_detection_params.MIN_CHUNK_LENGTH) -> List[str]:
         """
         Split text into meaningful chunks for language detection
 
@@ -352,12 +254,11 @@ class LanguageDetector:
         sentences = [s.strip() for s in sentences if s.strip()]
 
         chunks        = list()
-
         current_chunk = ""
 
         for sentence in sentences:
             # If adding this sentence doesn't exceed max length
-            if len(current_chunk) + len(sentence) + 1 <= max_chunk_length:
+            if ((len(current_chunk) + len(sentence) + 1) <= max_chunk_length):
                 if current_chunk:
                     current_chunk += " " + sentence
 
@@ -366,7 +267,7 @@ class LanguageDetector:
 
             else:
                 # Current chunk is full, save it
-                if current_chunk and len(current_chunk) >= min_chunk_length:
+                if (current_chunk and (len(current_chunk) >= min_chunk_length)):
                     chunks.append(current_chunk)
 
                 # Start new chunk with current sentence
@@ -377,29 +278,32 @@ class LanguageDetector:
             chunks.append(current_chunk)
 
         # Strategy 2: If sentence splitting didn't work well, use fixed-length chunks
-        if ((len(chunks) == 0) or ((len(chunks) == 1…
+        if ((len(chunks) == 0) or ((len(chunks) == 1) and (len(chunks[0]) > max_chunk_length))):
             chunks = self._split_fixed_length(text, max_chunk_length)
 
         logger.debug(f"Split {len(text)} chars into {len(chunks)} chunks: {[len(c) for c in chunks]}")
         return chunks
 
 
-    def _split_fixed_length(self, text: str, chunk_size: int = …
+    def _split_fixed_length(self, text: str, chunk_size: int = language_detection_params.FIXED_CHUNK_SIZE) -> List[str]:
         """
         Fallback: Split text into fixed-length chunks
         """
-        chunks = …
+        chunks              = list()
+        word_boundary_ratio = language_detection_params.WORD_BOUNDARY_RATIO
 
         for i in range(0, len(text), chunk_size):
             chunk = text[i:i + chunk_size]
+
             # Try to break at word boundaries
             if ((i + chunk_size) < len(text)):
                 last_space = chunk.rfind(' ')
-                # If we found a space in the last 30%
-                if (last_space > chunk_size * 0.7):
+                # If we found a space in the last 30% (using word_boundary_ratio)
+                if (last_space > chunk_size * word_boundary_ratio):
                     chunk = chunk[:last_space].strip()
 
             chunks.append(chunk)
+
         return chunks
 
 
@@ -408,32 +312,34 @@ class LanguageDetector:
         Process a single chunk through the language detection model
         """
         # Get the tokenizer from the pipeline
-        tokenizer = …
-
-        # Tokenize with explicit length limits
-        inputs = tokenizer(chunk, …
+        tokenizer = self.classifier.tokenizer
+
+        # Tokenize with explicit length limits using constant
+        max_length = language_detection_params.MODEL_MAX_LENGTH
+        inputs     = tokenizer(chunk,
+                               return_tensors     = "pt",
+                               truncation         = True,
+                               max_length         = max_length,
+                               padding            = True,
+                               add_special_tokens = True,
+                               )
 
         # Get model from pipeline
-        model  = …
-        device = …
+        model  = self.classifier.model
+        device = next(model.parameters()).device
 
         # Move inputs to correct device
-        inputs = …
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
         with torch.no_grad():
             outputs     = model(**inputs)
             predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
 
-        # Get top predictions for this chunk
-        top_predictions = …
+        # Get top predictions for this chunk using constant
+        top_k           = language_detection_params.TOP_K_PREDICTIONS
+        top_predictions = torch.topk(predictions[0], k=top_k)
 
-        chunk_results = …
+        chunk_results = dict()
 
         for i in range(len(top_predictions.indices)):
             lang_idx = top_predictions.indices[i].item()
@@ -451,6 +357,19 @@ class LanguageDetector:
         return chunk_results
 
 
+    def _map_language_code(self, code: str) -> Language:
+        """
+        Map language code string to Language enum
+        """
+        code = code.lower()
+
+        for lang in Language:
+            if (lang.value == code):
+                return lang
+
+        return Language.UNKNOWN
+
+
     def _aggregate_chunk_results(self, chunk_results: List[Dict]) -> LanguageDetectionResult:
         """
         Aggregate results from multiple chunks using weighted averaging
@@ -460,9 +379,9 @@ class LanguageDetector:
         chunk_weights = list()
 
         for chunk_result in chunk_results:
-            # Calculate chunk weight based on confidence and diversity
+            # Calculate chunk weight based on evidence_strength and diversity
             top_score = max(chunk_result.values()) if chunk_result else 0
-            # Weight by confidence
+            # Weight by evidence_strength
             chunk_weight = top_score
 
             chunk_weights.append(chunk_weight)
@@ -480,12 +399,12 @@ class LanguageDetector:
             if (len(scores) != len(chunk_weights)):
                 # Use simple average if weight mismatch
                 weighted_scores[lang_code] = sum(scores) / len(scores)
+
             else:
                 # Weighted average
                 weighted_sum = sum(score * weight for score, weight in zip(scores, chunk_weights))
                 total_weight = sum(chunk_weights)
-                weighted_scores[lang_code] = weighted_sum / total_weight if total_weight > 0 else sum(scores) / len(scores)
+                weighted_scores[lang_code] = (weighted_sum / total_weight if total_weight > 0 else sum(scores) / len(scores))
 
         # Find primary language
         primary_lang = None
@@ -493,13 +412,13 @@ class LanguageDetector:
 
         for lang_code, score in weighted_scores.items():
             if (score > primary_conf):
-                primary_conf = score
-                primary_lang = lang_code
+                primary_evidence_strength = score
+                primary_lang              = lang_code
 
         # Convert to Language enum
         try:
-            primary_language = Language(primary_lang)
-
+            primary_language = self._map_language_code(code = primary_lang)
+
         except ValueError:
             primary_language = Language.UNKNOWN
 
@@ -508,21 +427,21 @@ class LanguageDetector:
 
         warnings = list()
 
-        if detection_quality.get('…', False):
-            warnings.append("Low …")
+        if detection_quality.get('evidence_strength', False):
+            warnings.append("Low evidence_strength across multiple chunks")
 
         if detection_quality.get('inconsistent', False):
             warnings.append("Inconsistent language detection across chunks")
 
-        return LanguageDetectionResult(primary_language = …,
-                                       confidence       = …,
-                                       all_languages    = …,
-                                       script           = …,
-                                       is_multilingual  = …,
-                                       detection_method = …,
-                                       char_count       = …,
-                                       word_count       = …,
-                                       warnings         = …,
+        return LanguageDetectionResult(primary_language  = primary_language,
+                                       evidence_strength = primary_evidence_strength,
+                                       all_languages     = weighted_scores,
+                                       script            = Script.UNKNOWN,
+                                       is_multilingual   = detection_quality.get('multilingual', False),
+                                       detection_method  = "model-chunked",
+                                       char_count        = 0,
+                                       word_count        = 0,
+                                       warnings          = warnings,
                                        )
 
@@ -530,18 +449,21 @@ class LanguageDetector:
         """
         Assess the quality and consistency of language detection across chunks
         """
-        quality_metrics = {'low_confidence' : False,
-                           'inconsistent'   : False,
-                           'multilingual'   : False,
+        quality_metrics = {'low_evidence_strength' : False,
+                           'inconsistent'          : False,
+                           'multilingual'          : False,
                            }
 
         if not chunk_results:
             return quality_metrics
 
-        # Check for low confidence
-        …
+        # Check for low evidence_strength using constant
+        avg_top_evidence_strength = sum(max(chunk.values()) for chunk in chunk_results) / len(chunk_results)
+
+        low_evidence_strength_threshold = language_detection_params.LOW_CONFIDENCE_THRESHOLD
+
+        if (avg_top_evidence_strength < low_evidence_strength_threshold):
+            quality_metrics['low_evidence_strength'] = True
 
         # Check for inconsistency (different primary languages across chunks)
         chunk_primaries = list()
@@ -554,8 +476,10 @@ class LanguageDetector:
         if (len(set(chunk_primaries)) > 1):
             quality_metrics['inconsistent'] = True
 
-        # Check for multilingual content
-        …
+        # Check for multilingual content using constant
+        multilingual_threshold = language_detection_params.MULTILINGUAL_THRESHOLD
+        strong_languages       = [lang for lang, score in final_scores.items() if score > multilingual_threshold]
+
         if (len(strong_languages) > 1):
             quality_metrics['multilingual'] = True
 
@@ -575,63 +499,63 @@ class LanguageDetector:
         all_languages[prob.lang] = prob.prob
 
         # Primary language
-        primary = …
+        primary = lang_probs[0]
 
         try:
-            primary_language = Language(primary.lang)
-
+            primary_language = self._map_language_code(code = primary.lang)
+
         except ValueError:
             primary_language = Language.UNKNOWN
 
-        return LanguageDetectionResult(primary_language = …,
-                                       confidence       = …,
-                                       all_languages    = …,
-                                       script           = …,
-                                       is_multilingual  = …,
-                                       detection_method = …,
-                                       char_count       = …,
-                                       word_count       = …,
-                                       warnings         = …,
+        return LanguageDetectionResult(primary_language  = primary_language,
+                                       evidence_strength = primary.prob,
+                                       all_languages     = all_languages,
+                                       script            = Script.UNKNOWN,
+                                       is_multilingual   = False,
+                                       detection_method  = "langdetect",
+                                       char_count        = 0,
+                                       word_count        = 0,
+                                       warnings          = [],
                                        )
-
+
 
     def _detect_with_heuristics(self, text: str, script: Script) -> LanguageDetectionResult:
         """
         Detect language using character-based heuristics
         """
         # Script-based language mapping
-        script_to_language = {Script.CHINESE    : …,
-                              Script.JAPANESE   : …,
-                              Script.KOREAN     : …,
-                              Script.ARABIC     : …,
-                              Script.CYRILLIC   : …,
-                              Script.DEVANAGARI : …,
-                              Script.GREEK      : …,
-                              Script.HEBREW     : …,
-                              Script.THAI       : …,
+        script_to_language = {Script.CHINESE    : Language.CHINESE,
+                              Script.JAPANESE   : Language.JAPANESE,
+                              Script.KOREAN     : Language.KOREAN,
+                              Script.ARABIC     : Language.ARABIC,
+                              Script.CYRILLIC   : Language.RUSSIAN,
+                              Script.DEVANAGARI : Language.HINDI,
+                              Script.GREEK      : Language.GREEK,
+                              Script.HEBREW     : Language.HEBREW,
+                              Script.THAI       : Language.THAI,
                               }
 
         # If script clearly indicates language
         if script in script_to_language:
-            primary_language = …
-            # Moderate confidence for heuristics
-            confidence       = …
+            primary_language  = script_to_language[script]
+            # Moderate evidence_strength for heuristics
+            evidence_strength = 0.7
 
         else:
             # For Latin script, check common words
-            primary_language = …
-            # Lower confidence
-            confidence       = …
-
-        return LanguageDetectionResult(primary_language = …,
-                                       confidence       = …,
-                                       all_languages    = …,
-                                       script           = …,
-                                       is_multilingual  = …,
-                                       detection_method = …,
-                                       char_count       = …,
-                                       word_count       = …,
-                                       warnings         = …,
+            primary_language  = self._detect_latin_language(text)
+            # Lower evidence_strength
+            evidence_strength = 0.5
+
+        return LanguageDetectionResult(primary_language  = primary_language,
+                                       evidence_strength = evidence_strength,
+                                       all_languages     = {primary_language.value: evidence_strength},
+                                       script            = script,
+                                       is_multilingual   = False,
+                                       detection_method  = "heuristics",
+                                       char_count        = 0,
+                                       word_count        = 0,
+                                       warnings          = ["Detection using heuristics, accuracy may be limited"],
                                        )
 
@@ -660,9 +584,9 @@ class LanguageDetector:
 
         # Return language with highest score
         if scores:
-            best_lang = max(scores.items(), key = lambda x: x[1])
+            best_lang = max(scores.items(), key=lambda x: x[1])
             # At least 3 matches
-            if (best_lang[1] > 2):
+            if (best_lang[1] > 2):
                 return best_lang[0]
 
         # Default to English for Latin script
@@ -697,10 +621,11 @@ class LanguageDetector:
         # Calculate percentages
         script_percentages = {script: count / total_chars for script, count in script_counts.items() if count > 0}
 
-        # Check if mixed
+        # Check if mixed using constant
+        dominance_threshold = language_detection_params.SCRIPT_DOMINANCE_THRESHOLD
         if (len(script_percentages) > 1):
             max_percentage = max(script_percentages.values())
-            if (max_percentage < …
+            if (max_percentage < dominance_threshold):
                 return Script.MIXED
 
         # Return dominant script
@@ -731,19 +656,20 @@ class LanguageDetector:
         """
        Create result for unknown language
        """
-        return LanguageDetectionResult(primary_language = …,
-                                       confidence       = …,
-                                       all_languages    = …,
-                                       script           = …,
-                                       is_multilingual  = …,
-                                       detection_method = …,
-                                       char_count       = …,
-                                       word_count       = …,
-                                       warnings         = …,
+        return LanguageDetectionResult(primary_language  = Language.UNKNOWN,
+                                       evidence_strength = 0.0,
+                                       all_languages     = {},
+                                       script            = Script.UNKNOWN,
+                                       …
                                        )
 
 
-    def is_language(self, text: str, target_language: Language, threshold: float = …
+    def is_language(self, text: str, target_language: Language, threshold: float = …
        """
        Check if text is in a specific language
 
@@ -753,14 +679,14 @@ class LanguageDetector:
 
        target_language : Language to check for
 
-        threshold : Minimum …
+        threshold : Minimum evidence_strength required
 
        Returns:
        --------
-        { bool } : True if text is in target language with sufficient …
+        { bool } : True if text is in target language with sufficient evidence_strength
        """
        result = self.detect(text)
-        return ((result.primary_language == target_language) and (result.confidence >= threshold))
+        return ((result.primary_language == target_language) and (result.evidence_strength >= threshold))
 
 
    def get_supported_languages(self) -> List[str]:
@@ -778,7 +704,7 @@ class LanguageDetector:
        self.is_initialized = False
 
 
-
 def quick_detect(text: str, **kwargs) -> LanguageDetectionResult:
    """
    Quick language detection with default settings
@@ -786,7 +712,6 @@ def quick_detect(text: str, **kwargs) -> LanguageDetectionResult:
    Arguments:
    ----------
    text     : Input text
-
    **kwargs : Override settings
 
    Returns:
@@ -801,17 +726,16 @@ def quick_detect(text: str, **kwargs) -> LanguageDetectionResult:
    return detector.detect(text)
 
 
-def is_english(text: str, threshold: float = …
    """
    Quick check if text is English
    """
-    detector   = LanguageDetector(use_model = …
    is_english = detector.is_language(text, Language.ENGLISH, threshold)
 
    return is_english
 
 
-
 # Export
 __all__ = ['Script',
            'Language',
@@ -819,4 +743,4 @@ __all__ = ['Script',
            'quick_detect',
            'LanguageDetector',
            'LanguageDetectionResult',
-           ]
+           ]
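The `_aggregate_chunk_results` logic above weights each chunk by its own top score. A standalone illustration with invented chunk scores; unlike the diff, this simplified variant scores a language as 0.0 in chunks where it was not predicted:

```python
# Weighted aggregation of per-chunk language scores (simplified sketch).
chunk_results = [{"en": 0.92, "fr": 0.05},
                 {"en": 0.81, "de": 0.12},
                 {"fr": 0.55, "en": 0.40}]

# Each chunk's weight is its own top score, as in the diff
chunk_weights = [max(chunk.values()) for chunk in chunk_results]   # [0.92, 0.81, 0.55]

weighted_scores = {}
for lang in {lang for chunk in chunk_results for lang in chunk}:
    scores       = [chunk.get(lang, 0.0) for chunk in chunk_results]
    weighted_sum = sum(s * w for s, w in zip(scores, chunk_weights))
    weighted_scores[lang] = weighted_sum / sum(chunk_weights)

print(max(weighted_scores, key=weighted_scores.get))   # "en" (~0.76 vs ~0.15 for "fr")
```

Weighting by top score means confident chunks dominate the aggregate, so one noisy low-confidence chunk cannot flip the overall language call.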
| 2 |
import re
|
| 3 |
import torch
|
| 4 |
import string
|
|
|
|
| 5 |
from typing import Dict
|
| 6 |
from typing import List
|
| 7 |
from typing import Tuple
|
| 8 |
from loguru import logger
|
| 9 |
from typing import Optional
|
| 10 |
+
from config.enums import Script
|
| 11 |
from dataclasses import dataclass
|
| 12 |
+
from config.enums import Language
|
| 13 |
+
from config.schemas import LanguageDetectionResult
|
| 14 |
+
from config.constants import language_detection_params
|
| 15 |
|
| 16 |
|
| 17 |
# Try to import optional libraries
|
|
|
|
| 35 |
MODEL_MANAGER_AVAILABLE = False
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
class LanguageDetector:
|
| 39 |
"""
|
| 40 |
Detects the language of input text using multiple strategies with fallbacks.
|
| 41 |
|
| 42 |
Features:
|
| 43 |
+
- Learned language representations (when available)
|
| 44 |
+
- Statistical language probability estimation
|
| 45 |
+
- Script and character distribution analysis
|
| 46 |
+
- Multi-signal aggregation with fallbacks
|
|
|
|
|
|
|
| 47 |
|
| 48 |
Supported Languages:
|
| 49 |
+
- Broad multilingual coverage via learned language representations
|
| 50 |
+
- Deterministic support via script and statistical analysis
|
| 51 |
"""
|
| 52 |
+
# Use constants from config
|
| 53 |
+
MIN_TEXT_LENGTH = language_detection_params.MINIMUM_TEXT_LENGTH
|
| 54 |
+
LANGUAGE_NAMES = language_detection_params.LANGUAGE_NAMES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
# Map Script enum to string keys for SCRIPT_RANGES
|
| 57 |
+
SCRIPT_RANGES = {Script.LATIN : language_detection_params.SCRIPT_RANGES["latin"],
|
| 58 |
+
Script.CYRILLIC : language_detection_params.SCRIPT_RANGES["cyrillic"],
|
| 59 |
+
Script.ARABIC : language_detection_params.SCRIPT_RANGES["arabic"],
|
| 60 |
+
Script.CHINESE : language_detection_params.SCRIPT_RANGES["chinese"],
|
| 61 |
+
Script.JAPANESE : language_detection_params.SCRIPT_RANGES["japanese"],
|
| 62 |
+
Script.KOREAN : language_detection_params.SCRIPT_RANGES["korean"],
|
| 63 |
+
Script.DEVANAGARI : language_detection_params.SCRIPT_RANGES["devanagari"],
|
| 64 |
+
Script.GREEK : language_detection_params.SCRIPT_RANGES["greek"],
|
| 65 |
+
Script.HEBREW : language_detection_params.SCRIPT_RANGES["hebrew"],
|
| 66 |
+
Script.THAI : language_detection_params.SCRIPT_RANGES["thai"],
|
| 67 |
}
|
| 68 |
|
| 69 |
|
| 70 |
+
def __init__(self, use_model: bool = True):
|
| 71 |
"""
|
| 72 |
Initialize language detector
|
| 73 |
|
| 74 |
Arguments:
|
| 75 |
----------
|
| 76 |
+
use_model : Use ML model for detection (more accurate)
|
|
|
|
|
|
|
| 77 |
"""
|
| 78 |
+
self.use_model = use_model and MODEL_MANAGER_AVAILABLE
|
| 79 |
+
self.model_manager = None
|
| 80 |
+
self.classifier = None
|
| 81 |
+
self.is_initialized = False
|
|
|
|
| 82 |
|
| 83 |
logger.info(f"LanguageDetector initialized (use_model={self.use_model})")
|
| 84 |
|
|
|
|
| 152 |
# Try detection methods in order
|
| 153 |
result = None
|
| 154 |
|
| 155 |
+
# Method 1: ML Model
|
| 156 |
if self.use_model and self.is_initialized:
|
| 157 |
try:
|
| 158 |
result = self._detect_with_model(text = cleaned_text)
|
| 159 |
+
result.detection_method = "learned-language-representation"
|
| 160 |
|
| 161 |
except Exception as e:
|
| 162 |
logger.warning(f"Model detection failed: {repr(e)}, trying fallback")
|
| 163 |
warnings.append("Model detection failed, using fallback")
|
| 164 |
|
| 165 |
+
# Method 2: langdetect library
|
| 166 |
if result is None and LANGDETECT_AVAILABLE:
|
| 167 |
try:
|
| 168 |
result = self._detect_with_langdetect(text = cleaned_text)
|
| 169 |
+
result.detection_method = "statistical-language-estimation"
|
| 170 |
|
| 171 |
except Exception as e:
|
| 172 |
logger.warning(f"langdetect failed: {repr(e)}, trying heuristics")
|
| 173 |
warnings.append("langdetect failed, using heuristics")
|
| 174 |
|
| 175 |
+
# Method 3: Character-based heuristics
|
| 176 |
if result is None:
|
| 177 |
result = self._detect_with_heuristics(cleaned_text, script)
|
| 178 |
+
result.detection_method = "character-distribution-analysis"
|
| 179 |
|
| 180 |
# Add metadata
|
| 181 |
result.script = script
|
| 182 |
result.char_count = char_count
|
| 183 |
result.word_count = word_count
|
|
|
|
| 184 |
result.warnings.extend(warnings)
|
| 185 |
|
| 186 |
+
# Check for multilingual content using constant
|
| 187 |
+
threshold = language_detection_params.MULTILINGUAL_THRESHOLD
|
| 188 |
+
|
| 189 |
+
if len([v for v in result.all_languages.values() if v > threshold]) > 1:
|
| 190 |
result.is_multilingual = True
|
| 191 |
warnings.append("Text appears to contain multiple languages")
|
| 192 |
|
| 193 |
+
logger.info(f"Detected language: {result.primary_language.value} (evidence_strength: {result.evidence_strength:.2f}, method: {result.detection_method})")
|
| 194 |
|
| 195 |
return result
|
| 196 |
|
|
|
|
| 230 |
raise
|
| 231 |
|
| 232 |
|
| 233 |
+
def _split_text_into_chunks(self, text: str, max_chunk_length: int = language_detection_params.MAX_CHUNK_LENGTH, min_chunk_length: int = language_detection_params.MIN_CHUNK_LENGTH) -> List[str]:
|
| 234 |
"""
|
| 235 |
Split text into meaningful chunks for language detection
|
| 236 |
|
|
|
|
| 254 |
sentences = [s.strip() for s in sentences if s.strip()]
|
| 255 |
|
| 256 |
chunks = list()
|
|
|
|
| 257 |
current_chunk = ""
|
| 258 |
|
| 259 |
for sentence in sentences:
|
| 260 |
# If adding this sentence doesn't exceed max length
|
| 261 |
+
if ((len(current_chunk) + len(sentence) + 1) <= max_chunk_length):
|
| 262 |
if current_chunk:
|
| 263 |
current_chunk += " " + sentence
|
| 264 |
|
|
|
|
| 267 |
|
| 268 |
else:
|
| 269 |
# Current chunk is full, save it
|
| 270 |
+
if (current_chunk and (len(current_chunk) >= min_chunk_length)):
|
| 271 |
chunks.append(current_chunk)
|
| 272 |
|
| 273 |
# Start new chunk with current sentence
|
|
|
|
| 278 |
chunks.append(current_chunk)
|
| 279 |
|
| 280 |
# Strategy 2: If sentence splitting didn't work well, use fixed-length chunks
|
| 281 |
+
if ((len(chunks) == 0) or ((len(chunks) == 1) and (len(chunks[0]) > max_chunk_length))):
|
| 282 |
chunks = self._split_fixed_length(text, max_chunk_length)
|
| 283 |
|
| 284 |
logger.debug(f"Split {len(text)} chars into {len(chunks)} chunks: {[len(c) for c in chunks]}")
|
| 285 |
return chunks
|
| 286 |
|
| 287 |
|
| 288 |
+
def _split_fixed_length(self, text: str, chunk_size: int = language_detection_params.FIXED_CHUNK_SIZE) -> List[str]:
|
| 289 |
"""
|
| 290 |
Fallback: Split text into fixed-length chunks
|
| 291 |
"""
|
| 292 |
+
chunks = list()
|
| 293 |
+
word_boundary_ratio = language_detection_params.WORD_BOUNDARY_RATIO
|
| 294 |
|
| 295 |
for i in range(0, len(text), chunk_size):
|
| 296 |
chunk = text[i:i + chunk_size]
|
| 297 |
+
|
| 298 |
# Try to break at word boundaries
|
| 299 |
if ((i + chunk_size) < len(text)):
|
| 300 |
last_space = chunk.rfind(' ')
|
| 301 |
+
# If we found a space in the last 30% (using word_boundary_ratio)
|
| 302 |
+
if (last_space > chunk_size * word_boundary_ratio):
|
| 303 |
chunk = chunk[:last_space].strip()
|
| 304 |
|
| 305 |
chunks.append(chunk)
|
| 306 |
+
|
| 307 |
return chunks
|
| 308 |
|
| 309 |
|
|
|
|
| 312 |
Process a single chunk through the language detection model
|
| 313 |
"""
|
| 314 |
# Get the tokenizer from the pipeline
|
| 315 |
+
tokenizer = self.classifier.tokenizer
|
| 316 |
+
|
| 317 |
+
# Tokenize with explicit length limits using constant
|
| 318 |
+
max_length = language_detection_params.MODEL_MAX_LENGTH
|
| 319 |
+
inputs = tokenizer(chunk,
|
| 320 |
+
return_tensors = "pt",
|
| 321 |
+
truncation = True,
|
| 322 |
+
max_length = max_length,
|
| 323 |
+
padding = True,
|
| 324 |
+
add_special_tokens = True,
|
| 325 |
+
)
|
| 326 |
|
| 327 |
# Get model from pipeline
|
| 328 |
+
model = self.classifier.model
|
| 329 |
+
device = next(model.parameters()).device
|
| 330 |
|
| 331 |
# Move inputs to correct device
|
| 332 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 333 |
|
| 334 |
with torch.no_grad():
|
| 335 |
outputs = model(**inputs)
|
| 336 |
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
|
| 337 |
|
| 338 |
+
# Get top predictions for this chunk using constant
|
| 339 |
+
top_k = language_detection_params.TOP_K_PREDICTIONS
|
| 340 |
+
top_predictions = torch.topk(predictions[0], k=top_k)
|
| 341 |
|
| 342 |
+
chunk_results = dict()
|
| 343 |
|
| 344 |
for i in range(len(top_predictions.indices)):
|
| 345 |
lang_idx = top_predictions.indices[i].item()
|
|
|
|
| 357 |
return chunk_results
|
| 358 |
|
| 359 |
|
| 360 |
+
def _map_language_code(self, code: str) -> Language:
|
| 361 |
+
"""
|
| 362 |
+
Map language code string to Language enum
|
| 363 |
+
"""
|
| 364 |
+
code = code.lower()
|
| 365 |
+
|
| 366 |
+
for lang in Language:
|
| 367 |
+
if (lang.value == code):
|
| 368 |
+
return lang
|
| 369 |
+
|
| 370 |
+
return Language.UNKNOWN
|
| 371 |
+
|
| 372 |
+
|
| 373 |
def _aggregate_chunk_results(self, chunk_results: List[Dict]) -> LanguageDetectionResult:
|
| 374 |
"""
|
| 375 |
Aggregate results from multiple chunks using weighted averaging
|
|
|
|
| 379 |
chunk_weights = list()
|
| 380 |
|
| 381 |
for chunk_result in chunk_results:
|
| 382 |
+
# Calculate chunk weight based on evidence_strength and diversity
|
| 383 |
top_score = max(chunk_result.values()) if chunk_result else 0
|
| 384 |
+
# Weight by evidence_strength
|
| 385 |
chunk_weight = top_score
|
| 386 |
|
| 387 |
chunk_weights.append(chunk_weight)
|
|
|
|
| 399 |
if (len(scores) != len(chunk_weights)):
|
| 400 |
# Use simple average if weight mismatch
|
| 401 |
weighted_scores[lang_code] = sum(scores) / len(scores)
|
| 402 |
+
|
| 403 |
else:
|
| 404 |
# Weighted average
|
| 405 |
weighted_sum = sum(score * weight for score, weight in zip(scores, chunk_weights))
|
| 406 |
total_weight = sum(chunk_weights)
|
| 407 |
+
weighted_scores[lang_code] = (weighted_sum / total_weight if total_weight > 0 else sum(scores) / len(scores))
|
| 408 |
|
| 409 |
# Find primary language
|
| 410 |
primary_lang = None
|
|
|
|
| 412 |
|
| 413 |
for lang_code, score in weighted_scores.items():
|
| 414 |
if (score > primary_conf):
|
| 415 |
+
primary_evidence_strength = score
|
| 416 |
+
primary_lang = lang_code
|
| 417 |
|
| 418 |
# Convert to Language enum
|
| 419 |
try:
|
| 420 |
+
primary_language = self._map_language_code(code = primary_lang)
|
| 421 |
+
|
| 422 |
except ValueError:
|
| 423 |
primary_language = Language.UNKNOWN
|
| 424 |
|
|
|
|
| 427 |
|
| 428 |
warnings = list()
|
| 429 |
|
| 430 |
+
if detection_quality.get('evidence_strength', False):
|
| 431 |
+
warnings.append("Low evidence_strength across multiple chunks")
|
| 432 |
|
| 433 |
if detection_quality.get('inconsistent', False):
|
| 434 |
warnings.append("Inconsistent language detection across chunks")
|
| 435 |
|
| 436 |
+
return LanguageDetectionResult(primary_language = primary_language,
|
| 437 |
+
evidence_strength = primary_evidence_strength,
|
| 438 |
+
all_languages = weighted_scores,
|
| 439 |
+
script = Script.UNKNOWN,
|
| 440 |
+
is_multilingual = detection_quality.get('multilingual', False),
|
| 441 |
+
detection_method = "model-chunked",
|
| 442 |
+
char_count = 0,
|
| 443 |
+
word_count = 0,
|
| 444 |
+
warnings = warnings,
|
| 445 |
)
|
| 446 |
|
| 447 |
|
|
|
|
...
    """
    Assess the quality and consistency of language detection across chunks
    """
    quality_metrics = {'low_evidence_strength' : False,
                       'inconsistent'          : False,
                       'multilingual'          : False,
                       }

    if not chunk_results:
        return quality_metrics

    # Check for low evidence_strength using constant
    avg_top_evidence_strength       = sum(max(chunk.values()) for chunk in chunk_results) / len(chunk_results)

    low_evidence_strength_threshold = language_detection_params.LOW_CONFIDENCE_THRESHOLD

    if (avg_top_evidence_strength < low_evidence_strength_threshold):
        quality_metrics['low_evidence_strength'] = True

    # Check for inconsistency (different primary languages across chunks)
    chunk_primaries = list()
    ...
    if (len(set(chunk_primaries)) > 1):
        quality_metrics['inconsistent'] = True

    # Check for multilingual content using constant
    multilingual_threshold = language_detection_params.MULTILINGUAL_THRESHOLD
    strong_languages       = [lang for lang, score in final_scores.items() if score > multilingual_threshold]

    if (len(strong_languages) > 1):
        quality_metrics['multilingual'] = True
...
        all_languages[prob.lang] = prob.prob

    # Primary language
    primary = lang_probs[0]

    try:
        primary_language = self._map_language_code(code = primary.lang)

    except ValueError:
        primary_language = Language.UNKNOWN

    return LanguageDetectionResult(primary_language  = primary_language,
                                   evidence_strength = primary.prob,
                                   all_languages     = all_languages,
                                   script            = Script.UNKNOWN,
                                   is_multilingual   = False,
                                   detection_method  = "langdetect",
                                   char_count        = 0,
                                   word_count        = 0,
                                   warnings          = [],
                                   )


def _detect_with_heuristics(self, text: str, script: Script) -> LanguageDetectionResult:
    """
    Detect language using character-based heuristics
    """
    # Script-based language mapping
    script_to_language = {Script.CHINESE    : Language.CHINESE,
                          Script.JAPANESE   : Language.JAPANESE,
                          Script.KOREAN     : Language.KOREAN,
                          Script.ARABIC     : Language.ARABIC,
                          Script.CYRILLIC   : Language.RUSSIAN,
                          Script.DEVANAGARI : Language.HINDI,
                          Script.GREEK      : Language.GREEK,
                          Script.HEBREW     : Language.HEBREW,
                          Script.THAI       : Language.THAI,
                          }

    # If script clearly indicates language
    if script in script_to_language:
        primary_language  = script_to_language[script]
        # Moderate evidence_strength for heuristics
        evidence_strength = 0.7

    else:
        # For Latin script, check common words
        primary_language  = self._detect_latin_language(text)
        # Lower evidence_strength
        evidence_strength = 0.5

    return LanguageDetectionResult(primary_language  = primary_language,
                                   evidence_strength = evidence_strength,
                                   all_languages     = {primary_language.value: evidence_strength},
                                   script            = script,
                                   is_multilingual   = False,
                                   detection_method  = "heuristics",
                                   char_count        = 0,
                                   word_count        = 0,
                                   warnings          = ["Detection using heuristics, accuracy may be limited"],
                                   )
...
    # Return language with highest score
    if scores:
        best_lang = max(scores.items(), key = lambda x: x[1])
        # At least 3 matches
        if (best_lang[1] > 2):
            return best_lang[0]

    # Default to English for Latin script
    ...
    # Calculate percentages
    script_percentages = {script: count / total_chars for script, count in script_counts.items() if count > 0}

    # Check if mixed using constant
    dominance_threshold = language_detection_params.SCRIPT_DOMINANCE_THRESHOLD

    if (len(script_percentages) > 1):
        max_percentage = max(script_percentages.values())

        if (max_percentage < dominance_threshold):
            return Script.MIXED

    # Return dominant script
    ...
    """
    Create result for unknown language
    """
    return LanguageDetectionResult(primary_language  = Language.UNKNOWN,
                                   evidence_strength = 0.0,
                                   all_languages     = {},
                                   script            = Script.UNKNOWN,
                                   is_multilingual   = False,
                                   detection_method  = "none",
                                   char_count        = len(text),
                                   word_count        = len(text.split()),
                                   warnings          = warnings,
                                   )


def is_language(self, text: str, target_language: Language, threshold: float = language_detection_params.LANGUAGE_MATCH_THRESHOLD) -> bool:
    """
    Check if text is in a specific language

    Arguments:
    ----------
    ...
    target_language : Language to check for

    threshold       : Minimum evidence_strength threshold

    Returns:
    --------
    { bool } : True if text is in target language with sufficient evidence_strength
    """
    result = self.detect(text)
    return ((result.primary_language == target_language) and (result.evidence_strength >= threshold))


def get_supported_languages(self) -> List[str]:
    ...
    self.is_initialized = False


def quick_detect(text: str, **kwargs) -> LanguageDetectionResult:
    """
    Quick language detection with default settings

    Arguments:
    ----------
    text     : Input text

    **kwargs : Override settings

    Returns:
    ...
    return detector.detect(text)


def is_english(text: str, threshold: float = language_detection_params.LANGUAGE_MATCH_THRESHOLD) -> bool:
    """
    Quick check if text is English
    """
    detector   = LanguageDetector(use_model = True)
    is_english = detector.is_language(text, Language.ENGLISH, threshold)

    return is_english


# Export
__all__ = ['Script',
           'Language',
           ...
           'quick_detect',
           'LanguageDetector',
           'LanguageDetectionResult',
           ]
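A quick usage sketch of the helpers above. The import path, the `use_model` flag, and `Language.ENGLISH` are taken from this diff; an installed langdetect-style backend for the model-based path is an assumption.

```python
# Hypothetical usage; assumes the repo root is on PYTHONPATH and that a
# langdetect-style backend is available for the model-based path
from processors.language_detector import Language, LanguageDetector, quick_detect, is_english

result = quick_detect("The quick brown fox jumps over the lazy dog.")
print(result.primary_language, result.evidence_strength, result.detection_method)

# is_english() wraps LanguageDetector.is_language() with the configured
# LANGUAGE_MATCH_THRESHOLD as the minimum evidence strength
print(is_english("Bonjour tout le monde"))   # expected: False

detector = LanguageDetector(use_model = True)
print(detector.is_language("Hello world", Language.ENGLISH, threshold = 0.5))
```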
processors/text_processor.py CHANGED
@@ -7,51 +7,13 @@ from typing import Dict
 from typing import Tuple
 from loguru import logger
 from typing import Optional
-from dataclasses import dataclass
-
-
-@dataclass
-class ProcessedText:
-    """
-    Container for processed text with metadata
-    """
-    original_text      : str
-    cleaned_text       : str
-    sentences          : List[str]
-    words              : List[str]
-    paragraphs         : List[str]
-    char_count         : int
-    word_count         : int
-    sentence_count     : int
-    paragraph_count    : int
-    avg_sentence_length: float
-    avg_word_length    : float
-    is_valid           : bool
-    validation_errors  : List[str]
-    metadata           : Dict[str, Any]
-
-
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Convert to dictionary for JSON serialization
-        """
-        return {"original_length"     : len(self.original_text),
-                "cleaned_length"      : len(self.cleaned_text),
-                "char_count"          : self.char_count,
-                "word_count"          : self.word_count,
-                "sentence_count"      : self.sentence_count,
-                "paragraph_count"     : self.paragraph_count,
-                "avg_sentence_length" : round(self.avg_sentence_length, 2),
-                "avg_word_length"     : round(self.avg_word_length, 2),
-                "is_valid"            : self.is_valid,
-                "validation_errors"   : self.validation_errors,
-                "metadata"            : self.metadata,
-                }
+from config.schemas import ProcessedText
+from config.constants import text_processing_params
 
 
 class TextProcessor:
     """
-    Handles text cleaning, normalization, sentence splitting, and preprocessing for …
+    Handles text cleaning, normalization, sentence splitting, and preprocessing for downstream text analysis and authentication signals
 
     Features::
     - Unicode normalization
@@ -63,10 +25,6 @@ class TextProcessor:
     - Text validation
     - Chunk creation for long texts
     """
-
-    # Common abbreviations that shouldn't trigger sentence breaks
-    ABBREVIATIONS = {'dr', 'mr', 'mrs', 'ms', 'prof', 'sr', 'jr', 'ph.d', 'inc', 'ltd', 'corp', 'co', 'vs', 'etc', 'e.g', 'i.e', 'al', 'fig', 'vol', 'no', 'approx', 'est', 'min', 'max', 'avg', 'dept', 'assoc', 'bros', 'u.s', 'u.k', 'a.m', 'p.m', 'b.c', 'a.d', 'st', 'ave', 'blvd'}
-
     # Patterns for sentence splitting
     SENTENCE_ENDINGS = r'[.!?]+(?=\s+[A-Z]|$)'
@@ -74,37 +32,21 @@ class TextProcessor:
     MULTIPLE_SPACES   = re.compile(r'\s+')
     MULTIPLE_NEWLINES = re.compile(r'\n{3,}')
 
-
-    def __init__(self, min_text_length: int = 50, max_text_length: int = 500000, preserve_formatting: bool = False, remove_urls: bool = True, remove_emails: bool = True,
-                 normalize_unicode: bool = True, fix_encoding: bool = True):
+    def __init__(self):
         """
         Initialize text processor
-
-        Arguments:
-        ----------
-        min_text_length     : Minimum acceptable text length
-        max_text_length     : Maximum text length to process
-        preserve_formatting : Keep original line breaks and spacing
-        remove_urls         : Remove URLs from text
-        remove_emails       : Remove email addresses
-        normalize_unicode   : Normalize Unicode characters
-        fix_encoding        : Fix common encoding issues
         """
-        self.min_text_length     = min_text_length
-        self.max_text_length     = max_text_length
-        self.preserve_formatting = preserve_formatting
-        self.remove_urls         = remove_urls
-        self.remove_emails       = remove_emails
-        self.normalize_unicode   = normalize_unicode
-        self.fix_encoding        = fix_encoding
-
-
+        self.min_text_length      = text_processing_params.MINIMUM_TEXT_LENGTH
+        self.max_text_length      = text_processing_params.MAXIMUM_TEXT_LENGTH
+        self.preserve_formatting  = text_processing_params.PRESERVE_FORMATTING
+        self.remove_urls          = text_processing_params.REMOVE_URLS
+        self.remove_emails        = text_processing_params.REMOVE_EMAILS
+        self.normalize_unicode    = text_processing_params.NORMALIZE_UNICODE
+        self.fix_encoding         = text_processing_params.FIX_ENCODING
+        self.minimum_word_count   = text_processing_params.MINIMUM_WORD_COUNT
+        self.common_abbreviations = text_processing_params.COMMON_ABBREVIATIONS
+
+        logger.info(f"TextProcessor initialized with min_length={self.min_text_length}, max_length={self.max_text_length}")
 
 
     def process(self, text: str, **kwargs) -> ProcessedText:
@@ -170,15 +112,15 @@ class TextProcessor:
         sent_count = len(sentences)
         para_count = len(paragraphs)
 
-        avg_sent_len = word_count / sent_count if sent_count > 0 else 0
+        avg_sent_len = word_count / sent_count if (sent_count > 0) else 0
         avg_word_len = sum(len(w) for w in words) / word_count if word_count > 0 else 0
 
         # Additional validation
         if (sent_count == 0):
             validation_errors.append("No valid sentences found")
 
-        if (word_count < …):
-            validation_errors.append(f"Too few words: {word_count} (minimum: …)")
+        if (word_count < self.minimum_word_count):
+            validation_errors.append(f"Too few words: {word_count} (minimum: {self.minimum_word_count})")
 
         # Create metadata
         metadata = {"has_special_chars" : self._has_special_characters(text),
@@ -227,7 +169,7 @@ class TextProcessor:
         # Protect abbreviations
         protected_text = text
 
-        for abbr in self.ABBREVIATIONS:
+        for abbr in self.common_abbreviations:
             # Replace abbreviation periods with placeholder
             protected_text = re.sub(pattern = rf'\b{re.escape(abbr)}\.',
                                     repl    = abbr.replace('.', '<DOT>'),
@@ -417,8 +359,8 @@ class TextProcessor:
         text = unicodedata.normalize('NFKC', text)
 
         # Replace smart quotes and apostrophes
-        text = text.replace('“', '"').replace('”', '"')
-        text = text.replace('‘', "'").replace('’', "'")
+        text = text.replace('“', '"').replace('”', '"')
+        text = text.replace('‘', "'").replace('’', "'")
         text = text.replace('—', '-').replace('–', '-')
 
         return text
@@ -492,9 +434,6 @@ class TextProcessor:
         )
 
-
-# Convenience Functions
-
 def quick_process(text: str, **kwargs) -> ProcessedText:
     """
     Quick processing with default settings
@@ -535,47 +474,4 @@ __all__ = ['TextProcessor',
            'quick_process',
            'extract_sentences',
            'extract_words',
-           ]
-
-
-# ==================== Testing ====================
-if __name__ == "__main__":
-    # Test cases
-    test_texts = [
-        # Normal text
-        "This is a test. Dr. Smith works at the U.S. Department of Education. "
-        "He published a paper on AI detection in 2024.",
-
-        # Text with encoding issues
-        "This text’s got some “weird†characters that need fixing.",
-
-        # Text with URLs and emails
-        "Check out https://example.com or email me at [email protected] for more info.",
-
-        # Short text (should fail validation)
-        "Too short.",
-
-        # Text with numbers and special characters
-        "The price is $19.99 for version 2.0. Contact us at (555) 123-4567!",
-        ]
-
-    processor = TextProcessor(min_text_length=20)
-
-    for i, text in enumerate(test_texts, 1):
-        print(f"\n{'='*70}")
-        print(f"TEST CASE {i}")
-        print(f"{'='*70}")
-        print(f"Input: {text[:100]}...")
-
-        result = processor.process(text)
-
-        print(f"\nValid: {result.is_valid}")
-        if not result.is_valid:
-            print(f"Errors: {result.validation_errors}")
-
-        print(f"Word count: {result.word_count}")
-        print(f"Sentence count: {result.sentence_count}")
-        print(f"Avg sentence length: {result.avg_sentence_length:.2f}")
-        print(f"\nSentences:")
-        for j, sent in enumerate(result.sentences[:3], 1):
-            print(f"  {j}. {sent}")
+           ]
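The net effect of this refactor is that `TextProcessor` no longer takes constructor arguments; a hypothetical usage sketch (import path and attribute names come from the diff itself):

```python
# Hypothetical post-commit usage: limits now come from
# config.constants.text_processing_params instead of constructor arguments
from processors.text_processor import TextProcessor

processor = TextProcessor()
result    = processor.process("Dr. Smith works at the U.S. Department of Education. "
                              "He published a paper on AI detection in 2024.")

print(result.is_valid, result.word_count, result.sentence_count)
print(f"{result.avg_sentence_length:.2f}")

# Pre-commit, the same limits were per-instance, e.g. TextProcessor(min_text_length=20);
# tuning them now means editing MINIMUM_TEXT_LENGTH and friends in config/constants.py
```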
reporter/__init__.py CHANGED
@@ -1,10 +0,0 @@
-# DEPENDENCIES
-from reporter.report_generator import ReportGenerator
-from reporter.reasoning_generator import DetailedReasoning
-from reporter.reasoning_generator import ReasoningGenerator
-
-
-__all__ = ["ReasoningGenerator",
-           "DetailedReasoning",
-           "ReportGenerator",
-           ]
reporter/report_generator.py CHANGED
@@ -8,26 +8,10 @@ from pathlib import Path
 from loguru import logger
 from typing import Optional
 from datetime import datetime
-from …
-from …
-from …
-from …
-from reporter.reasoning_generator import ReasoningGenerator
-
-
-@dataclass
-class DetailedMetric:
-    """
-    Metric data structure with sub-metrics
-    """
-    name              : str
-    ai_probability    : float
-    human_probability : float
-    confidence        : float
-    verdict           : str
-    description       : str
-    detailed_metrics  : Dict[str, float]
-    weight            : float
+from config.schemas import DetectionResult
+from config.schemas import DetailedMetricResult
+from config.schemas import DetailedReasoningResult
+from services.reasoning_generator import ReasoningGenerator
 
 
 class ReportGenerator:
@@ -59,8 +43,8 @@ class ReportGenerator:
         logger.info(f"ReportGenerator initialized (output_dir={self.output_dir})")
 
 
-    def generate_complete_report(self, detection_result: DetectionResult, …
-                                 …
+    def generate_complete_report(self, detection_result: DetectionResult, highlighted_sentences: Optional[List] = None, formats: List[str] = ["json", "pdf"],
+                                 filename_prefix: str = "text_authenticity_report") -> Dict[str, str]:
         """
         Generate comprehensive report in JSON and PDF formats with detailed metrics
@@ -68,8 +52,6 @@ class ReportGenerator:
         ----------
         detection_result : Detection analysis result
 
-        attribution_result : Model attribution result (optional)
-
         highlighted_sentences : List of highlighted sentences (optional)
 
         formats : List of formats to generate (json, pdf)
@@ -93,11 +75,10 @@ class ReportGenerator:
             logger.info("Using detection_dict directly")
 
         # Generate detailed reasoning
-        reasoning = self.reasoning_generator.generate(ensemble_result …
-                                                      metric_results …
-                                                      domain …
-                                                      …
-                                                      text_length = detection_result.processed_text.word_count,
+        reasoning = self.reasoning_generator.generate(ensemble_result = detection_result.ensemble_result,
+                                                      metric_results  = detection_result.metric_results,
+                                                      domain          = detection_result.domain_prediction.primary_domain,
+                                                      text_length     = detection_result.processed_text.word_count,
                                                       )
 
         # Extract detailed metrics from ACTUAL detection results
@@ -113,8 +94,7 @@ class ReportGenerator:
             json_path = self._generate_json_report(detection_data        = detection_data,
                                                    detection_dict_full   = detection_dict,
                                                    reasoning             = reasoning,
-                                                   detailed_metrics      = detailed_metrics,
-                                                   attribution_result    = attribution_result,
+                                                   detailed_metrics      = detailed_metrics,
                                                    highlighted_sentences = highlighted_sentences,
                                                    filename              = f"{filename_prefix}_{timestamp}.json",
                                                    )
@@ -125,8 +105,7 @@ class ReportGenerator:
             pdf_path = self._generate_pdf_report(detection_data        = detection_data,
                                                  detection_dict_full   = detection_dict,
                                                  reasoning             = reasoning,
-                                                 detailed_metrics      = detailed_metrics,
-                                                 attribution_result    = attribution_result,
+                                                 detailed_metrics      = detailed_metrics,
                                                  highlighted_sentences = highlighted_sentences,
                                                  filename              = f"{filename_prefix}_{timestamp}.pdf",
                                                  )
@@ -141,7 +120,7 @@ class ReportGenerator:
         return generated_files
 
 
-    def _extract_detailed_metrics(self, detection_data: Dict) -> List[DetailedMetric]:
+    def _extract_detailed_metrics(self, detection_data: Dict) -> List[DetailedMetricResult]:
         """
         Extract detailed metrics with sub-metrics from ACTUAL detection result
         """
@@ -163,33 +142,33 @@ class ReportGenerator:
                 continue
 
             # Get actual probabilities and confidence
-            ai_prob    = metric_result.get("ai_probability", 0)
-            human_prob = metric_result.get("human_probability", 0)
+            synthetic_prob = metric_result.get("synthetic_probability", 0)
+            authentic_prob = metric_result.get("authentic_probability", 0)
             confidence = metric_result.get("confidence", 0)
 
             # Determine verdict based on actual probability
-            if (human_prob >= 0.6):
-                verdict = "Human-Written"
+            if (authentic_prob >= 0.6):
+                verdict = "Authentically-Written"
 
-            elif (ai_prob >= 0.6):
-                verdict = "AI-Generated"
+            elif (synthetic_prob >= 0.6):
+                verdict = "Synthetically-Generated"
 
-            elif (…):
-                verdict = "Mixed"
+            elif (synthetic_prob > 0.4 and synthetic_prob < 0.6):
+                verdict = "Hybrid"
 
-            elif (…):
-                verdict = "Mixed"
+            elif (authentic_prob > 0.4 and authentic_prob < 0.6):
+                verdict = "Hybrid"
 
             else:
                 # If both low, check which is higher
-                if (human_prob > ai_prob):
-                    verdict = "Human-Written"
+                if (authentic_prob > synthetic_prob):
+                    verdict = "Authentically-Written"
 
-                elif (ai_prob > human_prob):
-                    verdict = "AI-Generated"
+                elif (synthetic_prob > authentic_prob):
+                    verdict = "Synthetically-Generated"
 
                 else:
-                    verdict = "Mixed"
+                    verdict = "Hybrid"
 
             # Get actual weight or use default
             weight = 0.0
@@ -204,15 +183,15 @@ class ReportGenerator:
             # Get description based on metric type
             description = self._get_metric_description(metric_name = metric_name)
 
-            detailed_metrics.append(DetailedMetric(…
-                                                   …
+            detailed_metrics.append(DetailedMetricResult(name                  = metric_name,
+                                                         synthetic_probability = synthetic_prob * 100,   # Convert to percentage
+                                                         authentic_probability = authentic_prob * 100,   # Convert to percentage
+                                                         confidence            = confidence * 100,       # Convert to percentage
+                                                         verdict               = verdict,
+                                                         description           = description,
+                                                         detailed_metrics      = detailed_metrics_data,
+                                                         weight                = weight * 100,           # Convert to percentage
+                                                         )
                                     )
 
         logger.info(f"Extracted {len(detailed_metrics)} detailed metrics")
@@ -252,8 +231,8 @@ class ReportGenerator:
 
         # If no details available, provide basic calculated values
         if not details:
-            details = {"ai_probability"    : metric_result.get("ai_probability", 0) * 100,
-                       "human_probability" : metric_result.get("human_probability", 0) * 100,
+            details = {"synthetic_probability" : metric_result.get("synthetic_probability", 0) * 100,
+                       "authentic_probability" : metric_result.get("authentic_probability", 0) * 100,
                        "confidence"            : metric_result.get("confidence", 0) * 100,
                        "score"                 : metric_result.get("raw_score", 0) * 100,
                        }
@@ -276,8 +255,8 @@ class ReportGenerator:
         return descriptions.get(metric_name, "Advanced text analysis metric.")
 
 
-    def _generate_json_report(self, detection_data: Dict, detection_dict_full: Dict, reasoning: …
-                              …
+    def _generate_json_report(self, detection_data: Dict, detection_dict_full: Dict, reasoning: DetailedReasoningResult, detailed_metrics: List[DetailedMetricResult],
+                              highlighted_sentences: Optional[List] = None, filename: str = None) -> Path:
         """
         Generate JSON format report with detailed metrics
         """
@@ -286,8 +265,8 @@ class ReportGenerator:
 
         for metric in detailed_metrics:
             metrics_data.append({"name"                  : metric.name,
-                                 "ai_probability"        : metric.ai_probability,
-                                 "human_probability"     : metric.human_probability,
+                                 "synthetic_probability" : metric.synthetic_probability,
+                                 "authentic_probability" : metric.authentic_probability,
                                  "confidence"            : metric.confidence,
                                  "verdict"               : metric.verdict,
                                  "description"           : metric.description,
@@ -303,24 +282,11 @@ class ReportGenerator:
 
         for sent in highlighted_sentences:
             highlighted_data.append({"text"                  : sent.text,
-                                     "ai_probability"        : sent.ai_probability,
+                                     "synthetic_probability" : sent.synthetic_probability,
                                      "confidence"            : sent.confidence,
                                      "color_class"           : sent.color_class,
                                      "index"                 : sent.index,
                                      })
-
-        # Attribution data
-        attribution_data = None
-
-        if attribution_result:
-            attribution_data = {"predicted_model"      : attribution_result.predicted_model.value,
-                                "confidence"           : attribution_result.confidence,
-                                "model_probabilities"  : attribution_result.model_probabilities,
-                                "reasoning"            : attribution_result.reasoning,
-                                "fingerprint_matches"  : attribution_result.fingerprint_matches,
-                                "domain_used"          : attribution_result.domain_used.value,
-                                "metric_contributions" : attribution_result.metric_contributions,
-                                }
 
         # Use detection results from dictionary
         ensemble_data = detection_data.get("ensemble", {})
@@ -333,17 +299,17 @@ class ReportGenerator:
                                      "format"    : "json",
                                      "report_id" : filename.replace('.json', ''),
                                      },
-            "overall_results"     : {"final_verdict"      : …,
-                                     "ai_probability"     : …,
-                                     "human_probability"  : …,
-                                     "mixed_probability"  : …,
-                                     "overall_confidence" : …,
-                                     "uncertainty_score"  : …,
-                                     "consensus_level"    : …,
-                                     "domain"             : …,
-                                     "domain_confidence"  : …,
-                                     "text_length"        : …,
-                                     "sentence_count"     : …,
+            "overall_results"     : {"final_verdict"         : ensemble_data.get("final_verdict", "Unknown"),
+                                     "synthetic_probability" : ensemble_data.get("synthetic_probability", 0),
+                                     "authentic_probability" : ensemble_data.get("authentic_probability", 0),
+                                     "hybrid_probability"    : ensemble_data.get("hybrid_probability", 0),
+                                     "overall_confidence"    : ensemble_data.get("overall_confidence", 0),
+                                     "uncertainty_score"     : ensemble_data.get("uncertainty_score", 0),
+                                     "consensus_level"       : ensemble_data.get("consensus_level", 0),
+                                     "domain"                : analysis_data.get("domain", "general"),
+                                     "domain_confidence"     : analysis_data.get("domain_confidence", 0),
+                                     "text_length"           : analysis_data.get("text_length", 0),
+                                     "sentence_count"        : analysis_data.get("sentence_count", 0),
                                      },
             "ensemble_analysis"   : {"method_used"    : "confidence_calibrated",
                                      "metric_weights" : ensemble_data.get("metric_contributions", {}),
@@ -362,7 +328,6 @@ class ReportGenerator:
                                      "recommendations" : reasoning.recommendations,
                                      },
             "highlighted_text"    : highlighted_data,
-            "model_attribution"   : attribution_data,
             "performance_metrics" : {"total_processing_time"  : performance_data.get("total_time", 0),
                                      "metrics_execution_time" : performance_data.get("metrics_time", {}),
                                      "warnings"               : detection_data.get("warnings", []),
@@ -384,8 +349,8 @@ class ReportGenerator:
         return output_path
 
 
-    def _generate_pdf_report(self, detection_data: Dict, detection_dict_full: Dict, reasoning: …
-                             …
+    def _generate_pdf_report(self, detection_data: Dict, detection_dict_full: Dict, reasoning: DetailedReasoningResult, detailed_metrics: List[DetailedMetricResult],
+                             highlighted_sentences: Optional[List] = None, filename: str = None) -> Path:
         """
         Generate PDF format report with detailed metrics
         """
@@ -570,8 +535,6 @@ class ReportGenerator:
                                        textColor = GRAY_DARK,
                                        alignment = TA_CENTER,
                                        )
-
-        print (detection_dict_full.keys())
 
         # Use detection results from detection_data
         ensemble_data = detection_data.get("ensemble", {})
@@ -585,23 +548,23 @@ class ReportGenerator:
         original_filename = file_info.get("filename", "Unknown")
 
         # Extract values - handle different data formats
-        ai_prob     = …
-        human_prob  = …
-        mixed_prob  = …
-        confidence  = ensemble_data.get("overall_confidence", 0) * 100
-        uncertainty = ensemble_data.get("uncertainty_score", 0) * 100
-        consensus   = ensemble_data.get("consensus_level", 0) * 100
+        synthetic_prob = ensemble_data.get("synthetic_probability", 0) * 100   # Convert to percentage
+        authentic_prob = ensemble_data.get("authentic_probability", 0) * 100   # Convert to percentage
+        hybrid_prob    = ensemble_data.get("hybrid_probability", 0) * 100      # Convert to percentage
+        confidence     = ensemble_data.get("overall_confidence", 0) * 100      # Convert to percentage
+        uncertainty    = ensemble_data.get("uncertainty_score", 0) * 100       # Convert to percentage
+        consensus      = ensemble_data.get("consensus_level", 0) * 100         # Convert to percentage
         final_verdict = ensemble_data.get("final_verdict", "Unknown")
         total_time    = performance_data.get("total_time", 0)
 
         # Determine colors based on verdict
-        if ("Human-Written".lower() in final_verdict.lower()):
+        if ("Authentically-Written".lower() in final_verdict.lower()):
             verdict_color = SUCCESS_COLOR
 
-        elif ("AI-Generated".lower() in final_verdict.lower()):
+        elif ("Synthetically-Generated".lower() in final_verdict.lower()):
             verdict_color = DANGER_COLOR
 
-        elif ("Mixed".lower() in final_verdict.lower()):
+        elif ("Hybrid".lower() in final_verdict.lower()):
             verdict_color = WARNING_COLOR
 
         else:
@@ -617,7 +580,7 @@ class ReportGenerator:
                                        alignment = TA_RIGHT,
                                        )
 
-        elements.append(Paragraph("…", header_style))
+        elements.append(Paragraph("TEXT AUTHENTICATION ANALYTICS", header_style))
 
         elements.append(HRFlowable(width     = "100%",
                                    thickness = 1,
@@ -627,7 +590,7 @@ class ReportGenerator:
                                    )
 
         # Title and main sections
-        elements.append(Paragraph("…", title_style))
+        elements.append(Paragraph("Text Authentication Analysis Report", title_style))
         elements.append(Paragraph(f"Generated on {datetime.now().strftime('%B %d, %Y at %I:%M %p')}", subtitle_style))
 
         # Add original filename
@@ -645,8 +608,8 @@ class ReportGenerator:
                                    )
 
         # Quick Stats Banner
-        stats_data = [['…', 'AI', 'Human', 'Mixed'],
-                      ['Probability', f"{…}%", f"{…}%", f"{…}%"]
+        stats_data = [['Classification', 'Synthetic', 'Authentic', 'Hybrid'],
+                      ['Probability', f"{synthetic_prob:.1f}%", f"{authentic_prob:.1f}%", f"{hybrid_prob:.1f}%"]
                       ]
 
         stats_table = Table(stats_data, colWidths = [1.5*inch, 1*inch, 1*inch, 1*inch])
@@ -673,7 +636,7 @@ class ReportGenerator:
         # Main Verdict Section
         elements.append(Paragraph("DETECTION VERDICT", section_style))
 
-        verdict_box_data = [[Paragraph(f"<font size=…", …),
+        verdict_box_data = [[Paragraph(f"<font size=10 color='{verdict_color}'><b>{final_verdict.upper()}</b></font>", ParagraphStyle('VerdictText', alignment=TA_CENTER)),
                              Paragraph(f"<font size=12>Confidence: <b>{confidence:.1f}%</b></font><br/>"
                                        f"<font size=10>Uncertainty: {uncertainty:.1f}% | Consensus: {consensus:.1f}%</font>",
                                        ParagraphStyle('VerdictDetails', alignment=TA_CENTER))
@@ -699,7 +662,7 @@ class ReportGenerator:
         elements.append(Paragraph("DETECTION REASONING", section_style))
 
         # Process summary text and convert to bullet points
-        summary_text …
+        summary_text = reasoning.summary if hasattr(reasoning, 'summary') else "No reasoning summary available."
 
         # Fix extra spaces first
         summary_text = ' '.join(summary_text.split())
@@ -906,71 +869,6 @@ class ReportGenerator:
 
         elements.append(PageBreak())
 
-        # PAGE 6: Model Attribution & Recommendations
-        # AI MODEL ATTRIBUTION
-        if attribution_result:
-            elements.append(Paragraph("AI MODEL ATTRIBUTION", section_style))
-            elements.append(Spacer(1, 0.1*inch))
-
-            predicted_model        = getattr(attribution_result.predicted_model, 'value', str(attribution_result.predicted_model))
-            predicted_model        = predicted_model.replace("_", " ").title()
-            attribution_confidence = getattr(attribution_result, 'confidence', 0) * 100
-            domain_used            = getattr(attribution_result.domain_used, 'value', 'Unknown').upper()
-
-            # Professional attribution table
-            attribution_data = [[Paragraph("<b>Predicted Model</b>", bold_style), Paragraph(f"<font color='{INFO_COLOR}'><b>{predicted_model}</b></font>", bold_style)],
-                                [Paragraph("<b>Attribution Confidence</b>", bold_style), Paragraph(f"<b>{attribution_confidence:.1f}%</b>", bold_style)],
-                                [Paragraph("<b>Domain Used</b>", bold_style), Paragraph(f"<b>{domain_used}</b>", bold_style)]
-                                ]
-
-            attribution_table = Table(attribution_data, colWidths = [2.5*inch, 4*inch])
-
-            attribution_table.setStyle(TableStyle([('BACKGROUND', (0, 0), (0, -1), GRAY_LIGHT),
-                                                   ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
-                                                   ('FONTSIZE', (0, 0), (-1, -1), 11),
-                                                   ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
-                                                   ('TOPPADDING', (0, 0), (-1, -1), 8),
-                                                   ('GRID', (0, 0), (-1, -1), 0.5, GRAY_MEDIUM),
-                                                   ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
-                                                   ])
-                                       )
-
-            elements.append(attribution_table)
-            elements.append(Spacer(1, 0.2*inch))
-
-            # MODEL PROBABILITY DISTRIBUTION
-            model_probs = getattr(attribution_result, 'model_probabilities', {})
-            if (model_probs and (len(model_probs) > 0)):
-                elements.append(Paragraph("MODEL PROBABILITY DISTRIBUTION", subsection_style))
-                elements.append(Spacer(1, 0.05*inch))
-
-                # Get top models
-                sorted_models = sorted(model_probs.items(), key = lambda x: x[1], reverse = True)[:10]
-
-                prob_data = [['LANGUAGE MODEL NAME', 'ATTRIBUTION PROBABILITY']]
-
-                for model_name, probability in sorted_models:
-                    display_name = model_name.replace("_", " ").replace("-", " ").title()
-                    prob_data.append([Paragraph(display_name, bold_style), Paragraph(f"{probability:.1%}", bold_style)])
-
-                # Table Columns Setup
-                prob_table = Table(prob_data, colWidths = [4*inch, 2.5*inch])
-
-                prob_table.setStyle(TableStyle([('BACKGROUND', (0, 0), (-1, 0), INFO_COLOR),
-                                                ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
-                                                ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
-                                                ('ALIGN', (1, 0), (1, -1), 'RIGHT'),
-                                                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
-                                                ('FONTSIZE', (0, 0), (-1, -1), 11),
-                                                ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
-                                                ('TOPPADDING', (0, 0), (-1, -1), 6),
-                                                ('GRID', (0, 0), (-1, -1), 0.5, GRAY_MEDIUM),
-                                                ('BACKGROUND', (1, 1), (1, -1), GRAY_LIGHT),
-                                                ])
-                                    )
-
-                elements.append(prob_table)
-                elements.append(Spacer(1, 0.3*inch))
 
         # RECOMMENDATIONS
         if ((hasattr(reasoning, 'recommendations')) and reasoning.recommendations):
@@ -1014,12 +912,12 @@ class ReportGenerator:
         # Extract report ID from filename
         report_id = filename.replace('.pdf', '')
 
-        footer_text = (f"Generated by …"
+        footer_text = (f"Generated by Text Authenticator v1.0 | "
                        f"Processing Time: {total_time:.2f}s | "
                        f"Report ID: {report_id}")
 
         elements.append(Paragraph(footer_text, footer_style))
-        elements.append(Paragraph("Confidential Analysis Report • © 2025 …",
+        elements.append(Paragraph("Confidential Analysis Report • © 2025 Text Authentication Analytics",
                                   ParagraphStyle('Copyright', parent = footer_style, fontSize = 8, textColor = GRAY_MEDIUM)))
 
         # Build PDF
@@ -1043,11 +941,11 @@ class ReportGenerator:
         from reportlab.lib.enums import TA_LEFT
 
         # Determine metric color based on verdict
-        if (metric.verdict == "…"):
+        if (metric.verdict == "Authentic Text"):
             metric_color = SUCCESS_COLOR
             prob_color   = SUCCESS_COLOR
 
-        elif (metric.verdict == "…"):
+        elif (metric.verdict == "Synthetic Text"):
             metric_color = DANGER_COLOR
             prob_color   = DANGER_COLOR
 
@@ -1062,7 +960,7 @@ class ReportGenerator:
         subsection_style = ParagraphStyle('SubsectionStyle',
                                           parent      = ParagraphStyle('Normal'),
                                           fontName    = 'Helvetica-Bold',
-                                          fontSize    = …,
+                                          fontSize    = 12,
                                           textColor   = PRIMARY_COLOR,
                                           spaceAfter  = 8,
                                           spaceBefore = 16,
@@ -1075,7 +973,7 @@ class ReportGenerator:
 
         # Key metrics in a clean table
         key_metrics_data = [[Paragraph("<b>Verdict</b>", bold_style), Paragraph(f"<font color='{metric_color}'><b>{metric.verdict}</b></font>", bold_style), Paragraph("<b>Weight</b>", bold_style), Paragraph(f"<b>{metric.weight:.1f}%</b>", bold_style)],
-                            [Paragraph("<b>AI Probability</b>", bold_style), …]
+                            [Paragraph("<b>Synthetic Probability</b>", bold_style), Paragraph(f"<font color='{prob_color}'><b>{metric.synthetic_probability:.1f}%</b></font>", bold_style), Paragraph("<b>Confidence</b>", bold_style), Paragraph(f"<b>{metric.confidence:.1f}%</b>", bold_style)]
                             ]
 
         key_metrics_table = Table(key_metrics_data, colWidths = [1.5*inch, 1.5*inch, 1.5*inch, 1.5*inch])
@@ -1095,7 +993,7 @@ class ReportGenerator:
         # Detailed metrics in a compact table
         if metric.detailed_metrics and len(metric.detailed_metrics) > 0:
             # Create table with all metrics
-            detailed_data = …
+            detailed_data = list()
 
             # Sort metrics alphabetically
             sorted_items = sorted(metric.detailed_metrics.items())
@@ -1180,6 +1078,4 @@ class ReportGenerator:
 
 
 # Export
-__all__ = ["ReportGenerator",
-           "DetailedMetric",
-           ]
+__all__ = ["ReportGenerator"]
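The per-metric verdict logic in `_extract_detailed_metrics` above is easier to read as a standalone function. This is an illustrative restatement; `bucket_verdict` is an invented name, while the thresholds and labels come from the diff itself:

```python
# Illustrative restatement of the verdict bucketing above;
# bucket_verdict() is not part of the codebase.
def bucket_verdict(synthetic_prob: float, authentic_prob: float) -> str:
    if authentic_prob >= 0.6:
        return "Authentically-Written"
    if synthetic_prob >= 0.6:
        return "Synthetically-Generated"
    # The 0.4-0.6 band on either probability is treated as mixed authorship
    if 0.4 < synthetic_prob < 0.6 or 0.4 < authentic_prob < 0.6:
        return "Hybrid"
    # Both probabilities low: fall back to whichever is higher
    if authentic_prob > synthetic_prob:
        return "Authentically-Written"
    if synthetic_prob > authentic_prob:
        return "Synthetically-Generated"
    return "Hybrid"


assert bucket_verdict(0.70, 0.20) == "Synthetically-Generated"
assert bucket_verdict(0.50, 0.30) == "Hybrid"
assert bucket_verdict(0.30, 0.30) == "Hybrid"
```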
run.sh DELETED
@@ -1,56 +0,0 @@
-#!/bin/bash
-
-echo "Starting Text Auth AI Detection System..."
-
-# Check if Conda is installed
-if ! command -v conda &> /dev/null; then
-    echo "Conda is required but not installed. Please install Miniconda or Anaconda."
-    exit 1
-fi
-
-# Check if Python is installed and is version 3.10+
-if ! command -v python3 &> /dev/null; then
-    echo "Python 3 is required but not installed. Please install Python 3.10 or higher."
-    exit 1
-fi
-python3 -c "import sys; assert sys.version_info >= (3.10,), 'Python 3.10 or higher is required.'" || exit 1
-
-# Conda environment name
-CONDA_ENV_NAME="text_auth_env"
-
-# Check if conda environment exists, create if not
-if ! conda info --envs | grep -q "$CONDA_ENV_NAME"; then
-    echo "Creating Conda environment '$CONDA_ENV_NAME' with Python 3.10..."
-    conda create -n "$CONDA_ENV_NAME" python=3.10 -y
-fi
-
-# Activate conda environment
-echo "Activating Conda environment '$CONDA_ENV_NAME'..."
-source $(conda info --base)/etc/profile.d/conda.sh
-conda activate "$CONDA_ENV_NAME"
-
-# Install requirements
-echo "Installing dependencies..."
-pip install -r requirements.txt || { echo "Failed to install dependencies."; exit 1; }
-
-# Create necessary directories
-mkdir -p logs
-mkdir -p data/uploads
-mkdir -p data/reports
-mkdir -p models/cache
-
-# Set environment variables
-export PYTHONPATH=$PYTHONPATH:$(pwd)
-export LOG_LEVEL=${LOG_LEVEL:-INFO}
-export MODEL_CACHE_DIR=$(pwd)/models/cache
-
-# Start the FastAPI application
-echo "Starting FastAPI server..."
-echo "Access the application at: http://localhost:8000"
-echo "API documentation at: http://localhost:8000/docs"
-echo "Press Ctrl+C to stop the server"
-
-# Deactivate conda environment on exit
-trap 'conda deactivate' EXIT
-
-uvicorn app:app --reload --host 0.0.0.0 --port 8000
services/__init__.py ADDED
File without changes
detector/ensemble.py → services/ensemble_classifier.py
RENAMED
|
@@ -1,64 +1,19 @@
|
|
| 1 |
# DEPENDENCIES
|
| 2 |
import numpy as np
|
| 3 |
-
from typing import Any
|
| 4 |
from typing import List
|
| 5 |
from typing import Dict
|
| 6 |
from loguru import logger
|
| 7 |
-
from
|
| 8 |
-
from
|
| 9 |
-
from config.
|
| 10 |
-
from config.
|
| 11 |
-
from metrics.base_metric import MetricResult
|
| 12 |
-
from sklearn.ensemble import RandomForestClassifier
|
| 13 |
from config.threshold_config import get_threshold_for_domain
|
| 14 |
from config.threshold_config import get_active_metric_weights
|
| 15 |
|
| 16 |
|
| 17 |
-
@dataclass
|
| 18 |
-
class EnsembleResult:
|
| 19 |
-
"""
|
| 20 |
-
Result from ensemble classification
|
| 21 |
-
"""
|
| 22 |
-
final_verdict : str # "AI-Generated", "Human-Written", or "Mixed"
|
| 23 |
-
ai_probability : float
|
| 24 |
-
human_probability : float
|
| 25 |
-
mixed_probability : float
|
| 26 |
-
overall_confidence : float
|
| 27 |
-
domain : Domain
|
| 28 |
-
metric_results : Dict[str, MetricResult]
|
| 29 |
-
metric_weights : Dict[str, float]
|
| 30 |
-
weighted_scores : Dict[str, float]
|
| 31 |
-
reasoning : List[str]
|
| 32 |
-
uncertainty_score : float
|
| 33 |
-
consensus_level : float
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def to_dict(self) -> Dict[str, Any]:
|
| 37 |
-
"""
|
| 38 |
-
Convert to dictionary for JSON serialization
|
| 39 |
-
"""
|
| 40 |
-
return {"final_verdict" : self.final_verdict,
|
| 41 |
-
"ai_probability" : round(self.ai_probability, 4),
|
| 42 |
-
"human_probability" : round(self.human_probability, 4),
|
| 43 |
-
"mixed_probability" : round(self.mixed_probability, 4),
|
| 44 |
-
"overall_confidence" : round(self.overall_confidence, 4),
|
| 45 |
-
"domain" : self.domain.value,
|
| 46 |
-
"uncertainty_score" : round(self.uncertainty_score, 4),
|
| 47 |
-
"consensus_level" : round(self.consensus_level, 4),
|
| 48 |
-
"metric_contributions" : {name: {"weight" : round(self.metric_weights.get(name, 0.0), 4),
|
| 49 |
-
"weighted_score" : round(self.weighted_scores.get(name, 0.0), 4),
|
| 50 |
-
"ai_prob" : round(result.ai_probability, 4),
|
| 51 |
-
"confidence" : round(result.confidence, 4),
|
| 52 |
-
}
|
| 53 |
-
for name, result in self.metric_results.items()
|
| 54 |
-
},
|
| 55 |
-
"reasoning" : self.reasoning,
|
| 56 |
-
}
|
| 57 |
-
|
| 58 |
-
|
| 59 |
class EnsembleClassifier:
|
| 60 |
"""
|
| 61 |
-
|
| 62 |
|
| 63 |
Features:
|
| 64 |
- Domain-aware dynamic weighting
|
|
@@ -66,29 +21,26 @@ class EnsembleClassifier:
|
|
| 66 |
- Uncertainty quantification
|
| 67 |
- Consensus analysis
|
| 68 |
- Fallback strategies
|
| 69 |
-
- Feature-based ML ensemble (optional)
|
| 70 |
"""
|
| 71 |
-
def __init__(self, primary_method: str = "confidence_calibrated", fallback_method: str = "domain_weighted",
|
| 72 |
"""
|
| 73 |
Initialize advanced ensemble classifier
|
| 74 |
|
| 75 |
Arguments:
|
| 76 |
----------
|
| 77 |
-
primary_method : Primary aggregation method : "confidence_calibrated", "
|
| 78 |
|
| 79 |
fallback_method : Fallback method if primary fails : "domain_weighted", "confidence_weighted", "simple_average"
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
min_metrics_required : Minimum number of valid metrics required
|
| 84 |
"""
|
| 85 |
self.primary_method = primary_method
|
| 86 |
self.fallback_method = fallback_method
|
| 87 |
-
self.
|
| 88 |
-
self.
|
| 89 |
-
self.
|
| 90 |
|
| 91 |
-
logger.info(f"
|
| 92 |
|
| 93 |
|
| 94 |
def predict(self, metric_results: Dict[str, MetricResult], domain: Domain = Domain.GENERAL) -> EnsembleResult:
|
|
@@ -106,8 +58,8 @@ class EnsembleClassifier:
|
|
| 106 |
{ EnsembleResult } : EnsembleResult object with final prediction
|
| 107 |
"""
|
| 108 |
try:
|
| 109 |
-
# Filter
|
| 110 |
-
valid_results
|
| 111 |
|
| 112 |
if (len(valid_results) < self.min_metrics_required):
|
| 113 |
logger.warning(f"Insufficient valid metrics: {len(valid_results)}/{self.min_metrics_required}")
|
|
@@ -117,11 +69,11 @@ class EnsembleClassifier:
|
|
| 117 |
enabled_metrics = {name: True for name in valid_results.keys()}
|
| 118 |
base_weights = get_active_metric_weights(domain, enabled_metrics)
|
| 119 |
|
| 120 |
-
# Try primary aggregation method
|
| 121 |
calculated_weights = dict()
|
| 122 |
-
aggregated = {"
|
| 123 |
-
"
|
| 124 |
-
"
|
| 125 |
}
|
| 126 |
|
| 127 |
try:
|
|
@@ -131,29 +83,15 @@ class EnsembleClassifier:
|
|
| 131 |
domain = domain,
|
| 132 |
)
|
| 133 |
|
| 134 |
-
elif (self.primary_method == "domain_adaptive"):
|
| 135 |
-
aggregated, calculated_weights = self._domain_adaptive_aggregation(results = valid_results,
|
| 136 |
-
base_weights = base_weights,
|
| 137 |
-
domain = domain,
|
| 138 |
-
)
|
| 139 |
-
|
| 140 |
elif (self.primary_method == "consensus_based"):
|
| 141 |
aggregated, calculated_weights = self._consensus_based_aggregation(results = valid_results,
|
| 142 |
base_weights = base_weights,
|
| 143 |
-
domain = domain,
|
| 144 |
)
|
| 145 |
|
| 146 |
-
elif ((self.primary_method == "ml_ensemble") and self.use_ml_ensemble):
|
| 147 |
-
aggregated, calculated_weights = self._ml_ensemble_aggregation(results = valid_results,
|
| 148 |
-
base_weights = base_weights,
|
| 149 |
-
domain = domain,
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
else:
|
| 153 |
# Fallback to domain weighted
|
| 154 |
aggregated, calculated_weights = self._domain_weighted_aggregation(results = valid_results,
|
| 155 |
base_weights = base_weights,
|
| 156 |
-
domain = domain,
|
| 157 |
)
|
| 158 |
|
| 159 |
except Exception as e:
|
|
@@ -165,20 +103,18 @@ class EnsembleClassifier:
             # Start with the calculated weights (from valid_results)
             final_metric_weights = calculated_weights.copy()

-            # …
             for original_metric_name in metric_results.keys():
-                if original_metric_name not in final_metric_weights:
                     final_metric_weights[original_metric_name] = 0.0

-            # Calculate advanced metrics
-            overall_confidence = self._calculate_advanced_confidence(…)

             uncertainty_score = self._calculate_uncertainty(results    = valid_results,
-                                                            weights    = calculated_weights,
                                                             aggregated = aggregated,
                                                             )

@@ -191,65 +127,53 @@ class EnsembleClassifier:
                                                  uncertainty = uncertainty_score,
                                                  )

-            # Generate …
-            reasoning = self.…

-            # Calculate weighted scores
-            weighted_scores = {name: result.…}

-            return EnsembleResult(final_verdict      = …,
-                                  …
-                                  overall_confidence = …,
-                                  domain             = …,
-                                  metric_results     = …,
-                                  metric_weights     = …,
-                                  weighted_scores    = …,
-                                  reasoning          = …,
-                                  uncertainty_score  = …,
-                                  consensus_level    = …,
                                   )

         except Exception as e:
-            logger.error(f"Error in …")
             return self._create_fallback_result(domain, metric_results, str(e))


-    def …(self, results):
         """
-        …
         """
-        valid_results   = …
-        validation_info = {'failed_metrics'          : [],
-                           'low_confidence_metrics'  : [],
-                           'high_confidence_metrics' : [],
-                           }

         for name, result in results.items():
             if result.error is not None:
-                validation_info['failed_metrics'].append(name)
                 continue

-            …
-                validation_info['low_confidence_metrics'].append(name)
-                # Still include but with lower weight consideration
-                valid_results[name] = result

-            elif (result.confidence > 0.7):
-                validation_info['high_confidence_metrics'].append(name)
-                valid_results[name] = result

-            else:
-                valid_results[name] = result

-        return valid_results
     def _confidence_calibrated_aggregation(self, results: Dict[str, MetricResult], base_weights: Dict[str, float], domain: Domain) -> tuple:

@@ -266,10 +190,7 @@ class EnsembleClassifier:
             confidence_weights[name] = base_weight * confidence_factor

         # Normalize weights
-        total_weight = sum(confidence_weights.values())
-
-        if (total_weight > 0):
-            confidence_weights = {name: w / total_weight for name, w in confidence_weights.items()}

         # Domain-specific calibration
         domain_calibration = self._get_domain_calibration(domain = domain)

@@ -281,34 +202,16 @@ class EnsembleClassifier:
         return self._weighted_aggregation(calibrated_results, confidence_weights), confidence_weights


-    def _domain_adaptive_aggregation(self, results: Dict[str, MetricResult], base_weights: Dict[str, float], domain: Domain) -> tuple:
-        """
-        Domain-adaptive aggregation considering metric performance per domain
-        """
-        # Get domain-specific performance weights
-        domain_weights = self._get_domain_performance_weights(domain, list(results.keys()))
-
-        # Combine with base weights
-        combined_weights = dict()
-        for name in results.keys():
-            domain_weight = domain_weights.get(name, 1.0)
-            base_weight   = base_weights.get(name, 0.0)
-            combined_weights[name] = base_weight * domain_weight
-
-        # Normalize
-        total_weight = sum(combined_weights.values())
-        if (total_weight > 0):
-            combined_weights = {name: w / total_weight for name, w in combined_weights.items()}
-
-        return self._weighted_aggregation(results, combined_weights), combined_weights


     def _consensus_based_aggregation(self, results: Dict[str, MetricResult], base_weights: Dict[str, float]) -> tuple:
         """
         Consensus-based aggregation that rewards metric agreement
         """
         # Calculate consensus scores
-        consensus_weights = self._calculate_consensus_weights(results, …)

         aggregations = self._weighted_aggregation(results = results,
                                                   weights = consensus_weights,
@@ -316,42 +219,6 @@ class EnsembleClassifier:
         return aggregations, consensus_weights


-    def _ml_ensemble_aggregation(self, results: Dict[str, MetricResult], base_weights: Dict[str, float]) -> tuple:
-        """
-        Machine learning-based ensemble aggregation
-        """
-        if self.ml_model is None:
-            logger.warning("ML model not available, falling back to weighted average")
-            return self._weighted_aggregation(results, base_weights), base_weights
-
-        try:
-            # Extract features from metric results
-            features = self._extract_ml_features(results = results)
-
-            # Predict using ML model
-            prediction = self.ml_model.predict_proba([features])[0]
-
-            # For now, assume binary classification [human_prob, ai_prob]
-            if (len(prediction) == 2):
-                ai_prob, human_prob = prediction[1], prediction[0]
-                mixed_prob = 0.0
-
-            else:
-                # Multi-class - adjust accordingly
-                ai_prob, human_prob, mixed_prob = prediction
-
-            aggregated = {"ai_probability"    : ai_prob,
-                          "human_probability" : human_prob,
-                          "mixed_probability" : mixed_prob,
-                          }
-
-            return aggregated, base_weights
-
-        except Exception as e:
-            logger.warning(f"ML ensemble failed: {e}, using fallback")
-            return self._weighted_aggregation(results, base_weights), base_weights


     def _domain_weighted_aggregation(self, results: Dict[str, MetricResult], base_weights: Dict[str, float]) -> tuple:
         """
         Simple domain-weighted aggregation (fallback method)
@@ -364,56 +231,56 @@ class EnsembleClassifier:
         Apply fallback aggregation method
         """
         if (self.fallback_method == "confidence_weighted"):
-            return self._confidence_weighted_aggregation(results), base_weights

         elif (self.fallback_method == "simple_average"):
-            return self._simple_average_aggregation(results), base_weights

         else:
-            return self._domain_weighted_aggregation(results, base_weights), base_weights


     def _weighted_aggregation(self, results: Dict[str, MetricResult], weights: Dict[str, float]) -> Dict[str, float]:
         """
         Core weighted aggregation logic
         """
-        …
-        total_weight …

         for name, result in results.items():
             weight = weights.get(name, 0.0)

             if (weight > 0):
-                …

                 total_weight += weight

         if (total_weight == 0):
-            return {"ai_probability"    : …,
-                    "human_probability" : …,
-                    "mixed_probability" : …,
                     }

         # Calculate weighted averages
-        …

-        # Normalize …
-        total …

         if (total > 0):
-            …

-        return {"ai_probability"    : …,
-                "human_probability" : …,
-                "mixed_probability" : …,
                 }
@@ -421,7 +288,9 @@ class EnsembleClassifier:
         """
         Confidence-weighted aggregation
         """
-        …


     def _simple_average_aggregation(self, results: Dict[str, MetricResult]) -> Dict[str, float]:

@@ -435,8 +304,8 @@ class EnsembleClassifier:
         """
         Non-linear confidence adjustment using sigmoid
         """
-        # Sigmoid that emphasizes differences around …
-        return 1.0 / (1.0 + np.exp(-…))


     def _get_domain_calibration(self, domain: Domain) -> Dict[str, float]:

@@ -444,7 +313,7 @@ class EnsembleClassifier:
         """
         Get domain-specific calibration factors
         """
         # This would typically come from validation data
-        # For now, return neutral calibration
         return {}

@@ -453,233 +322,86 @@ class EnsembleClassifier:
         Calibrate probabilities based on domain performance
         """
         calibrated = dict()
         for name, result in results.items():
-            cal_factor …
-            # Simple calibration
-            …
         return calibrated


-    def _get_domain_performance_weights(self, domain: Domain, metric_names: List[str]) -> Dict[str, float]:
-        """
-        Get domain-specific performance weights (would come from validation data)
-        """
-        # Placeholder - in practice, this would be based on historical performance per domain : FUTURE WORK
-        performance_weights = {'structural' : 1.0, 'entropy' : 1.0, 'semantic_analysis' : 1.0,
-                               'linguistic' : 1.0, 'perplexity' : 1.0, 'multi_perturbation_stability' : 1.0}
-
-        # Domain-specific adjustments for all 16 domains
-        domain_adjustments = {
-            Domain.GENERAL       : {'structural': 1.0, 'perplexity': 1.0, 'entropy': 1.0, 'semantic_analysis': 1.0, 'linguistic': 1.0, 'multi_perturbation_stability': 1.0},
-            Domain.ACADEMIC      : {'structural': 1.2, 'perplexity': 1.3, 'entropy': 0.9, 'semantic_analysis': 1.1, 'linguistic': 1.3, 'multi_perturbation_stability': 0.8},
-            Domain.CREATIVE      : {'structural': 0.9, 'perplexity': 1.1, 'entropy': 1.2, 'semantic_analysis': 1.0, 'linguistic': 1.1, 'multi_perturbation_stability': 0.9},
-            Domain.AI_ML         : {'structural': 1.2, 'perplexity': 1.3, 'entropy': 0.9, 'semantic_analysis': 1.1, 'linguistic': 1.2, 'multi_perturbation_stability': 0.8},
-            Domain.SOFTWARE_DEV  : {'structural': 1.2, 'perplexity': 1.3, 'entropy': 0.9, 'semantic_analysis': 1.1, 'linguistic': 1.2, 'multi_perturbation_stability': 0.8},
-            Domain.TECHNICAL_DOC : {'structural': 1.3, 'perplexity': 1.3, 'entropy': 0.9, 'semantic_analysis': 1.2, 'linguistic': 1.2, 'multi_perturbation_stability': 0.8},
-            Domain.ENGINEERING   : {'structural': 1.2, 'perplexity': 1.3, 'entropy': 0.9, 'semantic_analysis': 1.1, 'linguistic': 1.2, 'multi_perturbation_stability': 0.8},
-            Domain.SCIENCE       : {'structural': 1.2, 'perplexity': 1.3, 'entropy': 0.9, 'semantic_analysis': 1.1, 'linguistic': 1.2, 'multi_perturbation_stability': 0.8},
-            Domain.BUSINESS      : {'structural': 1.1, 'perplexity': 1.2, 'entropy': 1.0, 'semantic_analysis': 1.1, 'linguistic': 1.1, 'multi_perturbation_stability': 0.9},
-            Domain.LEGAL         : {'structural': 1.3, 'perplexity': 1.3, 'entropy': 0.9, 'semantic_analysis': 1.2, 'linguistic': 1.3, 'multi_perturbation_stability': 0.8},
-            Domain.MEDICAL       : {'structural': 1.2, 'perplexity': 1.3, 'entropy': 0.9, 'semantic_analysis': 1.2, 'linguistic': 1.2, 'multi_perturbation_stability': 0.8},
-            Domain.JOURNALISM    : {'structural': 1.1, 'perplexity': 1.2, 'entropy': 1.0, 'semantic_analysis': 1.1, 'linguistic': 1.1, 'multi_perturbation_stability': 0.8},
-            Domain.MARKETING     : {'structural': 1.0, 'perplexity': 1.1, 'entropy': 1.1, 'semantic_analysis': 1.0, 'linguistic': 1.2, 'multi_perturbation_stability': 0.8},
-            Domain.SOCIAL_MEDIA  : {'structural': 0.8, 'perplexity': 1.0, 'entropy': 1.3, 'semantic_analysis': 0.9, 'linguistic': 0.7, 'multi_perturbation_stability': 0.9},
-            Domain.BLOG_PERSONAL : {'structural': 0.9, 'perplexity': 1.1, 'entropy': 1.2, 'semantic_analysis': 1.0, 'linguistic': 1.0, 'multi_perturbation_stability': 0.8},
-            Domain.TUTORIAL      : {'structural': 1.1, 'perplexity': 1.2, 'entropy': 1.0, 'semantic_analysis': 1.1, 'linguistic': 1.1, 'multi_perturbation_stability': 0.8},
-        }
-
-        adjustments = domain_adjustments.get(domain, {})
-
-        return {name: performance_weights.get(name, 1.0) * adjustments.get(name, 1.0) for name in metric_names}
     def _calculate_consensus_weights(self, results: Dict[str, MetricResult], base_weights: Dict[str, float]) -> Dict[str, float]:
         """
         Calculate weights based on metric consensus
         """
-        # Calculate average …
-        …

-        consensus_weights …

         for name, result in results.items():
             base_weight = base_weights.get(name, 0.0)
             # Reward metrics that agree with consensus
-            agreement = 1.0 - abs(result.…)
             consensus_weights[name] = base_weight * (0.5 + 0.5 * agreement)   # 0.5-1.0 range

-        # Normalize
-        total_weight = sum(consensus_weights.values())
-        if (total_weight > 0):
-            consensus_weights = {name: w / total_weight for name, w in consensus_weights.items()}

         return consensus_weights


-    def _extract_ml_features(…):
-        """
-        Extract features for ML ensemble
-        """
-        features = list()
-        for name in sorted(results.keys()):   # Ensure consistent order
-            result = results[name]
-            features.extend([result.ai_probability,
-                             result.human_probability,
-                             result.mixed_probability,
-                             result.confidence
-                             ])
-
-        return features


-    def _calculate_advanced_confidence(self, results: Dict[str, MetricResult], weights: Dict[str, float], aggregated: Dict[str, float]) -> float:
         """
-        Calculate …
         """
         # Base confidence from metric confidences
         base_confidence = sum(result.confidence * weights.get(name, 0.0) for name, result in results.items())

         # Agreement factor
-        …
-        agreement = 1.0 - min(1.0, np.std(…))

         # Certainty factor (how far from 0.5)
-        certainty = 1.0 - 2.0 * abs(aggregated["…"] - 0.5)

         # Metric quality factor
-        high_confidence_metrics = sum(1 for r in results.values() if r.confidence > …)
         quality_factor = high_confidence_metrics / len(results) if results else 0.0

         # Combined confidence
-        confidence = (base_confidence * …)

         return max(0.0, min(1.0, confidence))


-    def _calculate_uncertainty(self, results: Dict[str, MetricResult], …) -> float:
         """
         Calculate uncertainty score
         """
         # Variance in predictions
-        …
-        variance_uncertainty = np.var(…)

         # Confidence uncertainty
         avg_confidence = np.mean([r.confidence for r in results.values()])
         confidence_uncertainty = 1.0 - avg_confidence

         # Decision uncertainty (how close to 0.5)
-        decision_uncertainty = 1.0 - 2.0 * abs(aggregated["…"] - 0.5)

         # Combined uncertainty
-        uncertainty = (variance_uncertainty * …)

         return max(0.0, min(1.0, uncertainty))
@@ -692,11 +414,11 @@ class EnsembleClassifier:
             # Perfect consensus with only one metric
             return 1.0

-        …
-        std_dev …

         # Convert to consensus level (1.0 = perfect consensus, 0.0 = no consensus)
-        consensus …

         return consensus

@@ -705,42 +427,43 @@ class EnsembleClassifier:
         """
         Apply adaptive threshold considering uncertainty
         """
-        …

         # Adjust threshold based on uncertainty : Higher uncertainty requires more confidence
-        adjusted_threshold = base_threshold + (uncertainty * …)

-        # Check for …
-        …

         # Apply adjusted threshold
-        if (…):
-            return "…"

-        elif (…):
-            return "…"

         else:
             return "Uncertain"


-    def …(self, results, weights, aggregated,
-          verdict: str, uncertainty: float, consensus: float) -> List[str]:
         """
-        Generate …
         """
-        reasoning …

         # Overall assessment
-        …

         reasoning.append(f"## Ensemble Analysis Result")
         reasoning.append(f"**Final Verdict**: {verdict}")
-        reasoning.append(f"**…")
-        reasoning.append(f"**Confidence Level**: {self._get_confidence_label(…)}")
         reasoning.append(f"**Uncertainty**: {uncertainty:.1%}")
         reasoning.append(f"**Consensus**: {consensus:.1%}")
@@ -751,9 +474,17 @@ class EnsembleClassifier:
         for name, result in sorted_metrics:
             weight = weights.get(name, 0.0)
-            contribution = "High" if (weight > 0.15) else "Medium" if (weight > 0.08) else "Low"

-            …
                              f"(Confidence: {result.confidence:.1%}, "
                              f"Contribution: {contribution})")

@@ -771,49 +502,64 @@ class EnsembleClassifier:
         if (top_metric and (weights.get(top_metric[0], 0.0) > 0.2)):
             reasoning.append(f"🎯 **Dominant metric** - {top_metric[0]} had strongest influence")

-        if (…):
-            reasoning.append("🔀 **Mixed signals** - Content shows characteristics of both …")

         return reasoning


-    def _get_confidence_label(self, …) -> str:
         """
-        Get human-readable confidence label
         """
-        …
             return "Very High"

-        …
             return "High"

-        …
             return "Moderate"

         else:
             return "Low"


     def _create_fallback_result(self, domain: Domain, metric_results: Dict[str, MetricResult], error: str) -> EnsembleResult:
         """
         Create fallback result when ensemble cannot make a confident decision
         """
-        return EnsembleResult(final_verdict      = …,
-                              …
-                              overall_confidence = …,
-                              domain             = …,
-                              metric_results     = …,
-                              metric_weights     = …,
-                              weighted_scores    = …,
-                              reasoning          = …,
-                              uncertainty_score  = …,
-                              consensus_level    = …,
                               )

 # Export
-    __all__ = ["…",
-               "EnsembleClassifier",
-               ]
 # DEPENDENCIES
 import numpy as np
 from typing import List
 from typing import Dict
 from loguru import logger
+from config.enums import Domain
+from config.schemas import MetricResult
+from config.schemas import EnsembleResult
+from config.constants import metrics_ensemble_params
 from config.threshold_config import get_threshold_for_domain
 from config.threshold_config import get_active_metric_weights


 class EnsembleClassifier:
     """
+    Ensemble classifier with multiple aggregation strategies

     Features:
     - Domain-aware dynamic weighting
     - Uncertainty quantification
     - Consensus analysis
     - Fallback strategies
     """
+    def __init__(self, primary_method: str = "confidence_calibrated", fallback_method: str = "domain_weighted", min_metrics_required: int = None, execution_mode = "parallel"):
         """
         Initialize advanced ensemble classifier

         Arguments:
         ----------
+        primary_method : Primary aggregation method : "confidence_calibrated", "consensus_based"

         fallback_method : Fallback method if primary fails : "domain_weighted", "confidence_weighted", "simple_average"

+        min_metrics_required : Minimum number of valid metrics required (overrides default)
         """
         self.primary_method  = primary_method
         self.fallback_method = fallback_method
+        self.min_metrics_required = min_metrics_required or metrics_ensemble_params.MIN_METRICS_REQUIRED
+        self.params         = metrics_ensemble_params
+        self.execution_mode = execution_mode

+        logger.info(f"EnsembleClassifier initialized (primary={primary_method}, fallback={fallback_method})")
     def predict(self, metric_results: Dict[str, MetricResult], domain: Domain = Domain.GENERAL) -> EnsembleResult:

         { EnsembleResult } : EnsembleResult object with final prediction
         """
         try:
+            # Filter out metrics with errors
+            valid_results = self._filter_valid_metrics(results = metric_results)

             if (len(valid_results) < self.min_metrics_required):
                 logger.warning(f"Insufficient valid metrics: {len(valid_results)}/{self.min_metrics_required}")

             enabled_metrics = {name: True for name in valid_results.keys()}
             base_weights    = get_active_metric_weights(domain, enabled_metrics)

+            # Try primary aggregation method
             calculated_weights = dict()
+            aggregated = {"synthetic_probability" : self.params.DEFAULT_SYNTHETIC_PROB,
+                          "authentic_probability" : self.params.DEFAULT_AUTHENTIC_PROB,
+                          "hybrid_probability"    : self.params.DEFAULT_HYBRID_PROB,
                           }

             try:

                                                                                        domain = domain,
                                                                                        )

             elif (self.primary_method == "consensus_based"):
                 aggregated, calculated_weights = self._consensus_based_aggregation(results      = valid_results,
                                                                                    base_weights = base_weights,
                                                                                    )

             else:
                 # Fallback to domain weighted
                 aggregated, calculated_weights = self._domain_weighted_aggregation(results      = valid_results,
                                                                                    base_weights = base_weights,
                                                                                    )

             except Exception as e:

             # Start with the calculated weights (from valid_results)
             final_metric_weights = calculated_weights.copy()

+            # Assign zero weight to any original metrics that weren't included in valid_results
             for original_metric_name in metric_results.keys():
+                if (original_metric_name not in final_metric_weights):
                     final_metric_weights[original_metric_name] = 0.0

+            # Calculate advanced metrics
+            overall_confidence = self._calculate_confidence(results    = valid_results,
+                                                            weights    = calculated_weights,
+                                                            aggregated = aggregated,
+                                                            )

             uncertainty_score = self._calculate_uncertainty(results    = valid_results,
                                                             aggregated = aggregated,
                                                             )

                                                             uncertainty = uncertainty_score,
                                                             )

+            # Generate reasoning
+            reasoning = self._generate_reasoning(results     = valid_results,
+                                                 weights     = calculated_weights,
+                                                 aggregated  = aggregated,
+                                                 verdict     = final_verdict,
+                                                 uncertainty = uncertainty_score,
+                                                 consensus   = consensus_level,
+                                                 )

+            # Calculate weighted scores
+            weighted_scores = {name: result.synthetic_probability * calculated_weights.get(name, 0.0)
+                               for name, result in valid_results.items()}

+            return EnsembleResult(final_verdict         = final_verdict,
+                                  synthetic_probability = aggregated["synthetic_probability"],
+                                  authentic_probability = aggregated["authentic_probability"],
+                                  hybrid_probability    = aggregated["hybrid_probability"],
+                                  overall_confidence    = overall_confidence,
+                                  domain                = domain,
+                                  metric_results        = metric_results,
+                                  metric_weights        = final_metric_weights,
+                                  weighted_scores       = weighted_scores,
+                                  reasoning             = reasoning,
+                                  uncertainty_score     = uncertainty_score,
+                                  consensus_level       = consensus_level,
+                                  execution_mode        = self.execution_mode,
+                                  )

         except Exception as e:
+            logger.error(f"Error in ensemble prediction: {e}")
             return self._create_fallback_result(domain, metric_results, str(e))
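For orientation, a minimal usage sketch of the renamed classifier follows. It assumes the package layout from this commit is importable and that `MetricResult` accepts the keyword fields shown in this diff (with `details` and `error` defaulting to `None`); the probabilities are toy values.

```python
# A minimal usage sketch (toy values), assuming the commit's package layout
# is importable and MetricResult takes the fields shown in this diff.
from config.enums import Domain
from config.schemas import MetricResult
from services.ensemble_classifier import EnsembleClassifier

classifier = EnsembleClassifier(primary_method = "consensus_based")

metric_results = {"perplexity": MetricResult(metric_name           = "perplexity",
                                             synthetic_probability = 0.82,
                                             authentic_probability = 0.18,
                                             hybrid_probability    = 0.00,
                                             confidence            = 0.90),
                  "entropy"   : MetricResult(metric_name           = "entropy",
                                             synthetic_probability = 0.74,
                                             authentic_probability = 0.26,
                                             hybrid_probability    = 0.00,
                                             confidence            = 0.80)}

result = classifier.predict(metric_results = metric_results, domain = Domain.ACADEMIC)
print(result.final_verdict, f"{result.synthetic_probability:.1%}", f"{result.uncertainty_score:.2f}")
```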
+    def _filter_valid_metrics(self, results: Dict[str, MetricResult]) -> Dict[str, MetricResult]:
         """
+        Filter out failed metrics (error != None).
+        Confidence is handled during aggregation, not validation.
         """
+        valid_results = dict()

         for name, result in results.items():
             if result.error is not None:
                 continue

+            valid_results[name] = result

+        return valid_results
     def _confidence_calibrated_aggregation(self, results: Dict[str, MetricResult], base_weights: Dict[str, float], domain: Domain) -> tuple:

             confidence_weights[name] = base_weight * confidence_factor

         # Normalize weights
+        confidence_weights = self._normalize_weights(confidence_weights)

         # Domain-specific calibration
         domain_calibration = self._get_domain_calibration(domain = domain)

         return self._weighted_aggregation(calibrated_results, confidence_weights), confidence_weights
     def _consensus_based_aggregation(self, results: Dict[str, MetricResult], base_weights: Dict[str, float]) -> tuple:
         """
         Consensus-based aggregation that rewards metric agreement
         """
         # Calculate consensus scores
+        consensus_weights = self._calculate_consensus_weights(results      = results,
+                                                              base_weights = base_weights,
+                                                              )

+        consensus_weights = self._normalize_weights(consensus_weights)

         aggregations = self._weighted_aggregation(results = results,
                                                   weights = consensus_weights,
                                                   )

         return aggregations, consensus_weights
     def _domain_weighted_aggregation(self, results: Dict[str, MetricResult], base_weights: Dict[str, float]) -> tuple:
         """
         Simple domain-weighted aggregation (fallback method)

         Apply fallback aggregation method
         """
         if (self.fallback_method == "confidence_weighted"):
+            return self._confidence_weighted_aggregation(results = results), base_weights

         elif (self.fallback_method == "simple_average"):
+            return self._simple_average_aggregation(results = results), base_weights

         else:
+            return self._domain_weighted_aggregation(results = results, base_weights = base_weights), base_weights
     def _weighted_aggregation(self, results: Dict[str, MetricResult], weights: Dict[str, float]) -> Dict[str, float]:
         """
         Core weighted aggregation logic
         """
+        synthetic_scores = list()
+        authentic_scores = list()
+        hybrid_scores    = list()
+        total_weight     = 0.0

         for name, result in results.items():
             weight = weights.get(name, 0.0)

             if (weight > 0):
+                synthetic_scores.append(result.synthetic_probability * weight)
+                authentic_scores.append(result.authentic_probability * weight)
+                hybrid_scores.append(result.hybrid_probability * weight)

                 total_weight += weight

         if (total_weight == 0):
+            return {"synthetic_probability" : self.params.DEFAULT_SYNTHETIC_PROB,
+                    "authentic_probability" : self.params.DEFAULT_AUTHENTIC_PROB,
+                    "hybrid_probability"    : self.params.DEFAULT_HYBRID_PROB,
                     }

         # Calculate weighted averages
+        synthetic_prob = sum(synthetic_scores) / total_weight
+        authentic_prob = sum(authentic_scores) / total_weight
+        hybrid_prob    = sum(hybrid_scores) / total_weight

+        # Normalize probabilities to sum to 1.0
+        total = synthetic_prob + authentic_prob + hybrid_prob

         if (total > 0):
+            synthetic_prob /= total
+            authentic_prob /= total
+            hybrid_prob    /= total

+        return {"synthetic_probability" : synthetic_prob,
+                "authentic_probability" : authentic_prob,
+                "hybrid_probability"    : hybrid_prob,
                 }


     def _confidence_weighted_aggregation(self, results: Dict[str, MetricResult]) -> Dict[str, float]:
         """
         Confidence-weighted aggregation
         """
+        weights = {name: result.confidence for name, result in results.items()}
+        weights = self._normalize_weights(weights)
+        return self._weighted_aggregation(results, weights)
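The weighted-average-then-renormalize step above is easy to sanity-check by hand. A standalone sketch with toy numbers (plain dicts stand in for the `MetricResult` objects):

```python
# Standalone sketch of the core aggregation step with toy values.
weights   = {"perplexity": 0.6, "entropy": 0.4}
synthetic = {"perplexity": 0.82, "entropy": 0.74}
authentic = {"perplexity": 0.18, "entropy": 0.26}
hybrid    = {"perplexity": 0.00, "entropy": 0.00}

total_weight = sum(weights.values())
syn = sum(synthetic[m] * w for m, w in weights.items()) / total_weight   # 0.788
aut = sum(authentic[m] * w for m, w in weights.items()) / total_weight   # 0.212
hyb = sum(hybrid[m] * w for m, w in weights.items()) / total_weight      # 0.0

# Renormalize so the three probabilities sum to 1.0 (a no-op here)
total = syn + aut + hyb
syn, aut, hyb = syn / total, aut / total, hyb / total
print(round(syn, 3), round(aut, 3), round(hyb, 3))   # 0.788 0.212 0.0
```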
     def _simple_average_aggregation(self, results: Dict[str, MetricResult]) -> Dict[str, float]:

         """
         Non-linear confidence adjustment using sigmoid
         """
+        # Sigmoid that emphasizes differences around the center
+        return 1.0 / (1.0 + np.exp(-self.params.SIGMOID_CONFIDENCE_SCALE * (confidence - self.params.SIGMOID_CENTER)))
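The sigmoid's behavior depends on the two constants from `config.constants`, whose real values are not visible in this diff; the sketch below assumes a scale of 10 and a center of 0.5 purely for illustration:

```python
import numpy as np

# Sketch of the sigmoid confidence adjustment; SCALE and CENTER are
# assumed stand-ins for SIGMOID_CONFIDENCE_SCALE / SIGMOID_CENTER.
SCALE, CENTER = 10.0, 0.5

def confidence_factor(confidence: float) -> float:
    # Low confidences are pushed toward 0, high confidences toward 1,
    # with the steepest change around the center point
    return 1.0 / (1.0 + np.exp(-SCALE * (confidence - CENTER)))

for c in (0.3, 0.5, 0.7, 0.9):
    print(c, round(confidence_factor(c), 3))
# 0.3 0.119 / 0.5 0.5 / 0.7 0.881 / 0.9 0.982
```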
     def _get_domain_calibration(self, domain: Domain) -> Dict[str, float]:

         Get domain-specific calibration factors
         """
         # This would typically come from validation data
+        # For now, return neutral calibration
         return {}


         Calibrate probabilities based on domain performance
         """
         calibrated = dict()

         for name, result in results.items():
+            cal_factor = calibration.get(name, 1.0)
+            # Simple calibration
+            new_synthetic_prob = min(1.0, max(0.0, result.synthetic_probability * cal_factor))

+            calibrated[name] = MetricResult(metric_name           = result.metric_name,
+                                            synthetic_probability = new_synthetic_prob,
+                                            authentic_probability = 1.0 - new_synthetic_prob,
+                                            hybrid_probability    = result.hybrid_probability,
+                                            confidence            = result.confidence,
+                                            details               = result.details
+                                            )
         return calibrated
     def _calculate_consensus_weights(self, results: Dict[str, MetricResult], base_weights: Dict[str, float]) -> Dict[str, float]:
         """
         Calculate weights based on metric consensus
         """
+        # Calculate average synthetic probability
+        avg_synthetic_prob = np.mean([r.synthetic_probability for r in results.values()])

+        consensus_weights = dict()

         for name, result in results.items():
             base_weight = base_weights.get(name, 0.0)
             # Reward metrics that agree with consensus
+            agreement = 1.0 - abs(result.synthetic_probability - avg_synthetic_prob)
             consensus_weights[name] = base_weight * (0.5 + 0.5 * agreement)   # 0.5-1.0 range

         return consensus_weights
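A standalone sketch of the consensus-weighting rule with toy numbers: a metric whose synthetic probability sits far from the group mean keeps only part of its base weight, while a perfectly agreeing metric keeps all of it.

```python
import numpy as np

# Toy values: structural disagrees with the other two metrics.
base_weights    = {"perplexity": 0.5, "entropy": 0.3, "structural": 0.2}
synthetic_probs = {"perplexity": 0.80, "entropy": 0.75, "structural": 0.30}

avg = np.mean(list(synthetic_probs.values()))             # ≈ 0.617
consensus_weights = {}
for name, base in base_weights.items():
    agreement = 1.0 - abs(synthetic_probs[name] - avg)    # 1.0 = perfect agreement
    consensus_weights[name] = base * (0.5 + 0.5 * agreement)

print({k: round(v, 3) for k, v in consensus_weights.items()})
# {'perplexity': 0.454, 'entropy': 0.28, 'structural': 0.168}
```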
+    def _calculate_confidence(self, results: Dict[str, MetricResult], weights: Dict[str, float], aggregated: Dict[str, float]) -> float:
         """
+        Calculate confidence considering multiple factors
         """
         # Base confidence from metric confidences
         base_confidence = sum(result.confidence * weights.get(name, 0.0) for name, result in results.items())

         # Agreement factor
+        synthetic_probs = [r.synthetic_probability for r in results.values()]
+        agreement = 1.0 - min(1.0, np.std(synthetic_probs) * self.params.CONSENSUS_STD_SCALING)

         # Certainty factor (how far from 0.5)
+        certainty = 1.0 - 2.0 * abs(aggregated["synthetic_probability"] - 0.5)

         # Metric quality factor
+        high_confidence_metrics = sum(1 for r in results.values() if r.confidence > self.params.HIGH_CONFIDENCE_THRESHOLD)
         quality_factor = high_confidence_metrics / len(results) if results else 0.0

         # Combined confidence
+        confidence = (base_confidence * self.params.CONFIDENCE_WEIGHT_BASE +
+                      agreement * self.params.CONFIDENCE_WEIGHT_AGREEMENT +
+                      certainty * self.params.CONFIDENCE_WEIGHT_CERTAINTY +
+                      quality_factor * self.params.CONFIDENCE_WEIGHT_QUALITY)

         return max(0.0, min(1.0, confidence))
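The four factors are blended linearly. The `CONFIDENCE_WEIGHT_*` and `CONSENSUS_STD_SCALING` constants live in `config.constants` and are not visible in this diff, so the sketch below assumes illustrative values (0.4/0.2/0.2/0.2 and 2.0):

```python
import numpy as np

# Assumed stand-ins for the CONFIDENCE_WEIGHT_* constants.
W_BASE, W_AGREE, W_CERTAIN, W_QUALITY = 0.4, 0.2, 0.2, 0.2

confidences     = [0.9, 0.8, 0.6]
weights         = [0.5, 0.3, 0.2]
synthetic_probs = [0.82, 0.74, 0.78]
aggregated_syn  = 0.79

base_confidence = sum(c * w for c, w in zip(confidences, weights))       # 0.81
agreement       = 1.0 - min(1.0, float(np.std(synthetic_probs)) * 2.0)   # assumed scaling of 2.0
certainty       = 1.0 - 2.0 * abs(aggregated_syn - 0.5)                  # 0.42
quality         = sum(1 for c in confidences if c > 0.7) / len(confidences)

confidence = (base_confidence * W_BASE + agreement * W_AGREE +
              certainty * W_CERTAIN + quality * W_QUALITY)
print(round(max(0.0, min(1.0, confidence)), 3))                          # ≈ 0.73
```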
+    def _calculate_uncertainty(self, results: Dict[str, MetricResult], aggregated: Dict[str, float]) -> float:
         """
         Calculate uncertainty score
         """
         # Variance in predictions
+        synthetic_probs = [r.synthetic_probability for r in results.values()]
+        variance_uncertainty = np.var(synthetic_probs) if len(synthetic_probs) > 1 else 0.0

         # Confidence uncertainty
         avg_confidence = np.mean([r.confidence for r in results.values()])
         confidence_uncertainty = 1.0 - avg_confidence

         # Decision uncertainty (how close to 0.5)
+        decision_uncertainty = 1.0 - 2.0 * abs(aggregated["synthetic_probability"] - 0.5)

         # Combined uncertainty
+        uncertainty = (variance_uncertainty * self.params.UNCERTAINTY_WEIGHT_VARIANCE +
+                       confidence_uncertainty * self.params.UNCERTAINTY_WEIGHT_CONFIDENCE +
+                       decision_uncertainty * self.params.UNCERTAINTY_WEIGHT_DECISION)

         return max(0.0, min(1.0, uncertainty))


             # Perfect consensus with only one metric
             return 1.0

+        synthetic_probs = [r.synthetic_probability for r in results.values()]
+        std_dev = np.std(synthetic_probs)

         # Convert to consensus level (1.0 = perfect consensus, 0.0 = no consensus)
+        consensus = 1.0 - min(1.0, std_dev * self.params.CONSENSUS_STD_SCALING)

         return consensus


         """
         Apply adaptive threshold considering uncertainty
         """
+        synthetic_prob = aggregated.get("synthetic_probability", self.params.DEFAULT_SYNTHETIC_PROB)
+        hybrid_prob    = aggregated.get("hybrid_probability", self.params.DEFAULT_HYBRID_PROB)

         # Adjust threshold based on uncertainty : Higher uncertainty requires more confidence
+        adjusted_threshold = base_threshold + (uncertainty * self.params.UNCERTAINTY_THRESHOLD_ADJUSTMENT)

+        # Check for hybrid content
+        # Case 1: Explicit hybrid probability from metrics
+        # Case 2: High uncertainty + ambiguous synthetic score
+        if ((hybrid_prob > self.params.HYBRID_PROB_THRESHOLD) or ((uncertainty > self.params.HYBRID_UNCERTAINTY_THRESHOLD) and (self.params.HYBRID_SYNTHETIC_RANGE_LOW < synthetic_prob < self.params.HYBRID_SYNTHETIC_RANGE_HIGH))):
+            return "Hybrid"

         # Apply adjusted threshold
+        if (synthetic_prob >= adjusted_threshold):
+            return "Synthetically-Generated"

+        elif (synthetic_prob <= (1.0 - adjusted_threshold)):
+            return "Authentically-Written"

         else:
             return "Uncertain"
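The verdict logic above reads as a small decision rule. A self-contained sketch with assumed constants (the real `HYBRID_*` and `UNCERTAINTY_THRESHOLD_ADJUSTMENT` values come from `config.constants`):

```python
# Sketch of the uncertainty-adjusted verdict rule; the numeric constants
# below are assumed stand-ins for the config.constants values.
def verdict(synthetic_prob: float, hybrid_prob: float, uncertainty: float,
            base_threshold: float = 0.7) -> str:
    adjusted = base_threshold + uncertainty * 0.1          # assumed adjustment factor
    # Hybrid: explicit hybrid signal, or high uncertainty in the ambiguous band
    if hybrid_prob > 0.3 or (uncertainty > 0.6 and 0.35 < synthetic_prob < 0.65):
        return "Hybrid"
    if synthetic_prob >= adjusted:
        return "Synthetically-Generated"
    if synthetic_prob <= 1.0 - adjusted:
        return "Authentically-Written"
    return "Uncertain"

print(verdict(0.85, 0.05, 0.2))   # Synthetically-Generated (0.85 >= 0.72)
print(verdict(0.55, 0.05, 0.7))   # Hybrid (high uncertainty, ambiguous band)
print(verdict(0.60, 0.05, 0.2))   # Uncertain (between the two cutoffs)
```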
+    def _generate_reasoning(self, results: Dict[str, MetricResult], weights: Dict[str, float], aggregated: Dict[str, float], verdict: str, uncertainty: float, consensus: float) -> List[str]:
         """
+        Generate reasoning for the prediction
         """
+        reasoning = list()

         # Overall assessment
+        synthetic_prob = aggregated.get("synthetic_probability", self.params.DEFAULT_SYNTHETIC_PROB)
+        hybrid_prob    = aggregated.get("hybrid_probability", self.params.DEFAULT_HYBRID_PROB)

         reasoning.append(f"## Ensemble Analysis Result")
         reasoning.append(f"**Final Verdict**: {verdict}")
+        reasoning.append(f"**Synthetic Probability**: {synthetic_prob:.1%}")
+        reasoning.append(f"**Confidence Level**: {self._get_confidence_label(synthetic_prob)}")
         reasoning.append(f"**Uncertainty**: {uncertainty:.1%}")
         reasoning.append(f"**Consensus**: {consensus:.1%}")

         for name, result in sorted_metrics:
             weight = weights.get(name, 0.0)

+            if (weight > self.params.CONTRIBUTION_HIGH):
+                contribution = "High"

+            elif (weight > self.params.CONTRIBUTION_MEDIUM):
+                contribution = "Medium"

+            else:
+                contribution = "Low"

+            reasoning.append(f"**{name}**: {result.synthetic_probability:.1%} synthetic probability "
                              f"(Confidence: {result.confidence:.1%}, "
                              f"Contribution: {contribution})")

         if (top_metric and (weights.get(top_metric[0], 0.0) > 0.2)):
             reasoning.append(f"🎯 **Dominant metric** - {top_metric[0]} had strongest influence")

+        if (hybrid_prob > self.params.HYBRID_PROB_THRESHOLD):
+            reasoning.append("🔀 **Mixed signals** - Content shows characteristics of both synthetic and authentic writing")

         return reasoning
+    def _get_confidence_label(self, synthetic_prob: float) -> str:
         """
+        Get human-readable confidence label based on distance from decision boundaries
         """
+        # Very high confidence: very clear synthetic or very clear authentic
+        if ((synthetic_prob > 0.9) or (synthetic_prob < 0.1)):
             return "Very High"

+        # High confidence: strongly synthetic or strongly authentic
+        elif ((synthetic_prob > 0.8) or (synthetic_prob < 0.2)):
             return "High"

+        # Moderate confidence: leaning synthetic or leaning authentic
+        elif ((synthetic_prob > 0.7) or (synthetic_prob < 0.3)):
             return "Moderate"

+        # Low confidence: close to decision boundary
         else:
             return "Low"
+    def _normalize_weights(self, weights: Dict[str, float]) -> Dict[str, float]:
+        """
+        Normalize weights to sum to 1.0
+        """
+        total = sum(weights.values())

+        if (total > 0):
+            return {k: v / total for k, v in weights.items()}

+        return weights
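The normalization helper is small enough to reproduce standalone; note that the all-zero edge case is returned unchanged rather than raising:

```python
# Standalone sketch of the weight-normalization helper.
def normalize_weights(weights: dict) -> dict:
    total = sum(weights.values())
    if total > 0:
        return {k: v / total for k, v in weights.items()}
    return weights   # all-zero weights pass through untouched

print(normalize_weights({"perplexity": 2.0, "entropy": 1.0, "structural": 1.0}))
# {'perplexity': 0.5, 'entropy': 0.25, 'structural': 0.25}
```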
     def _create_fallback_result(self, domain: Domain, metric_results: Dict[str, MetricResult], error: str) -> EnsembleResult:
         """
         Create fallback result when ensemble cannot make a confident decision
         """
+        return EnsembleResult(final_verdict         = "Uncertain",
+                              synthetic_probability = self.params.DEFAULT_SYNTHETIC_PROB,
+                              authentic_probability = self.params.DEFAULT_AUTHENTIC_PROB,
+                              hybrid_probability    = self.params.DEFAULT_HYBRID_PROB,
+                              overall_confidence    = 0.0,
+                              domain                = domain,
+                              metric_results        = metric_results,
+                              metric_weights        = {},
+                              weighted_scores       = {},
+                              reasoning             = [f"Ensemble analysis inconclusive", f"Reason: {error}"],
+                              uncertainty_score     = 1.0,
+                              consensus_level       = 0.0,
+                              execution_mode        = self.execution_mode,
+                              )


 # Export
+__all__ = ["EnsembleClassifier"]
{detector → services}/highlighter.py
RENAMED

@@ -5,37 +5,19 @@ from typing import Dict
 from typing import Tuple
 from loguru import logger
 from typing import Optional
-from …
-from config.…
-from …
-from detector.ensemble import EnsembleResult
-from detector.ensemble import EnsembleClassifier
 from processors.text_processor import TextProcessor
 from config.threshold_config import ConfidenceLevel
 from config.threshold_config import MetricThresholds
 from config.threshold_config import get_confidence_level
 from config.threshold_config import get_threshold_for_domain
 from config.threshold_config import get_active_metric_weights


-@dataclass
-class HighlightedSentence:
-    """
-    A sentence with highlighting information
-    """
-    text              : str
-    ai_probability    : float
-    human_probability : float
-    mixed_probability : float
-    confidence        : float
-    confidence_level  : ConfidenceLevel
-    color_class       : str
-    tooltip           : str
-    index             : int
-    is_mixed_content  : bool
-    metric_breakdown  : Optional[Dict[str, float]] = None


 class TextHighlighter:
     """
     Generates sentence-level highlighting with ensemble results integration

@@ -43,35 +25,19 @@ class TextHighlighter:
     FEATURES:
     - Sentence-level highlighting with confidence scores
     - Domain-aware calibration
-    - Ensemble-…
-    - …
     - Explainable tooltips
-    - Highlighting metrics calculation
     """
-    # Color thresholds
-    COLOR_THRESHOLDS …
-        …
-        (0.75, 0.90, "high-ai", "#fed7aa", "Likely AI-generated"),
-        (0.90, 1.00, "very-high-ai", "#fecaca", "Very likely AI-generated"),
-        ]

-    # …

-    # Risk weights
-    RISK_WEIGHTS = {'very-high-ai'    : 1.0,
-                    'high-ai'         : 0.8,
-                    'medium-ai'       : 0.6,
-                    'uncertain'       : 0.4,
-                    'medium-human'    : 0.2,
-                    'high-human'      : 0.1,
-                    'very-high-human' : 0.0,
-                    'mixed-content'   : 0.7,
-                    }

     def __init__(self, domain: Domain = Domain.GENERAL, ensemble_classifier: Optional[EnsembleClassifier] = None):
         """
@@ -99,20 +65,21 @@ class TextHighlighter:
             )
         except Exception as e:
             logger.warning(f"Failed to create default ensemble: {e}. Using fallback mode.")
-            …


     def generate_highlights(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult] = None,
-                            enabled_metrics: Optional[Dict[str, bool]] = None, use_sentence_level: bool = True) -> List[…]:
         """
         Generate sentence-level highlights with ensemble integration

         Arguments:
         ----------
         text { str } : Original text

-        metric_results { dict } : Results from all …

         ensemble_result { EnsembleResult } : Optional document-level ensemble result

@@ -122,12 +89,15 @@ class TextHighlighter:
         Returns:
         --------
-        { list } : List of …
         """
         try:
             # Validate inputs
             if not text or not text.strip():
-                return self._handle_empty_text(text, …)

             # Get domain-appropriate weights for enabled metrics
             if enabled_metrics is None:

@@ -136,7 +106,7 @@ class TextHighlighter:
             weights = get_active_metric_weights(self.domain, enabled_metrics)

             # Split text into sentences with error handling
-            sentences = self._split_sentences_with_fallback(text)

             if not sentences:
                 return self._handle_no_sentences(text, metric_results, ensemble_result)

@@ -147,61 +117,64 @@ class TextHighlighter:
             for idx, sentence in enumerate(sentences):
                 try:
                     if use_sentence_level:
-                        # Use …
-                        …
                     else:
                         # Use document-level ensemble probabilities
-                        …

                     # Apply domain-specific adjustments with limits
-                    …
                         sentence_length = len(sentence.split()),
                         )

-                    # Determine if this is …
-                    …

                     # Get confidence level
                     confidence_level = get_confidence_level(confidence)

-                    # Get color class (consider …)
-                    color_class, color_hex, tooltip_base = self._get_color_for_probability(…
                         )

                     # Generate enhanced tooltip
-                    tooltip = self._generate_ensemble_tooltip(sentence, …,
-                                                              confidence,
-                                                              confidence_level,
-                                                              tooltip_base,
-                                                              breakdown,
                                                               )

-                    highlighted_sentences.append(…
                         )

                 except Exception as e:

@@ -216,72 +189,72 @@ class TextHighlighter:
             return self._create_error_fallback(text, metric_results)


-    def _handle_empty_text(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult]) -> List[…]:
         """
         Handle empty input text
         """
         if ensemble_result:
-            return [self._create_fallback_sentence(text, …,
-                                                   index = …,
-                                                   )
                     ]

         return [self._create_fallback_sentence("No text content", 0)]


-    def _handle_no_sentences(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult]) -> List[…]:
         """
         Handle case where no sentences could be extracted
         """
-        if …
             # Treat entire text as one sentence
             return [self._create_fallback_sentence(text.strip(), 0)]

         return [self._create_fallback_sentence("No processable content", 0)]


-    def _create_fallback_sentence(self, text: str, index: int, …):
         """
         Create a fallback sentence when processing fails
         """
         confidence_level = get_confidence_level(0.3)
-        color_class, _, tooltip_base = self._get_color_for_probability(…
             )

-        return …


-    def _create_error_fallback(self, text: str, metric_results: Dict[str, MetricResult]) -> List[…]:
         """
         Create fallback when entire processing fails
         """
-        return […
         ]


@@ -291,7 +264,7 @@ class TextHighlighter:
         """
         try:
             sentences = self.text_processor.split_sentences(text)
-            filtered_sentences = [s.strip() for s in sentences if len(s.strip()) >= 3]

             if filtered_sentences:
                 return filtered_sentences
@@ -315,79 +288,78 @@ class TextHighlighter:
         return [text] if text.strip() else []


-    def _calculate_sentence_ensemble_probability(self, sentence: str, metric_results: Dict[str, MetricResult], weights: Dict[str, float],
-                                                 ensemble_result: Optional[EnsembleResult] = None) -> Tuple[float, float, float, float, Dict[str, float]]:
         """
         Calculate sentence probabilities using ensemble methods with domain calibration
         """
         sentence_length = len(sentence.split())

-        # Handling short sentences - don't force neutral
-        if (sentence_length < 3):
-            # Return probabilities with lower confidence for very short sentences
-            base_ai_prob = 0.5

-            …

-            breakdown = {"short_sentence" : base_ai_prob}

-            # Try to get some signal from available metrics
             for name, result in metric_results.items():
-                if (…):
-                    breakdown[name] …
                     break

-            return …

         sentence_metric_results = dict()
         breakdown = dict()

         for name, doc_result in metric_results.items():
-            if doc_result.error is None:
-                …

-        # Use ensemble to combine sentence-level metrics
         if sentence_metric_results:
             try:
                 ensemble_sentence_result = self.ensemble.predict(metric_results = sentence_metric_results,
                                                                  domain         = self.domain,
                                                                  )

-                return (ensemble_sentence_result.…,
-                        ensemble_sentence_result.…,
-                        ensemble_sentence_result.…,
-                        ensemble_sentence_result.overall_confidence,
-                        breakdown,
-                        )

             except Exception as e:
                 logger.warning(f"Sentence ensemble failed: {e}")

-        # Fallback: weighted average
-        return self.…
@@ -397,44 +369,46 @@ class TextHighlighter:
         # Get domain-specific threshold for this metric
         metric_thresholds = getattr(self.domain_thresholds, metric_name, None)
         if not metric_thresholds:
-            return result.…

         # Base probability from document-level result
-        base_prob = result.…

         # Apply domain-aware sentence-level adjustments
-        adjusted_prob = self._apply_metric_specific_adjustments(metric_name     = metric_name,
-                                                                sentence        = sentence,
-                                                                base_prob       = base_prob,
-                                                                sentence_length = sentence_length,
                                                                 thresholds      = metric_thresholds,
                                                                 )

         return adjusted_prob


-    def _create_sentence_metric_result(self, metric_name: str, …):
         """
         Create sentence-level MetricResult from document-level result
         """
-        # …
-        sentence_confidence = self._calculate_sentence_confidence(doc_result.confidence,
-                                                                  …
-                                                                  )


     def _calculate_sentence_confidence(self, doc_confidence: float, sentence_length: int) -> float:
         """
-        …
         """
         base_reduction = 0.8
         # Scale confidence penalty with sentence length

@@ -443,65 +417,68 @@ class TextHighlighter:
         return max(0.1, doc_confidence * base_reduction * length_penalty)


-    def …(self, metric_results, weights, breakdown = None):
         """
         Fallback weighted probability calculation
         """
-        …
-        confidences  = …
-        total_weight = …

         for name, result in metric_results.items():
-            if …
             weight = weights.get(name, 0.0)

             if (weight > 0):
-                …
                 confidences.append(result.confidence)
                 total_weight += weight

-        if …
             return 0.5, 0.5, 0.0, 0.5, breakdown or {}

         avg_confidence = sum(confidences) / len(confidences) if confidences else 0.5

-        return …


-    def _get_document_ensemble_probability(self, ensemble_result: Optional[EnsembleResult], metric_results: Dict[str, MetricResult],
-                                           weights: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
         """
         Get document-level ensemble probability
         """
         if ensemble_result:
             # Use existing ensemble result
-            breakdown = {name: result.… for …}
-            return (ensemble_result.…,
-                    ensemble_result.…,
-                    …)
         else:
             # Calculate from metrics
-            return self.…


-    def _apply_domain_specific_adjustments(self, sentence: str, …):
         """
-        Apply domain-specific adjustments to …
         """
-        original_prob = …
         adjustments = list()
         sentence_lower = sentence.lower()

         # Technical & AI/ML domains
-        if …
             if self._has_technical_terms(sentence_lower):
-                adjustments.append(1.1)
             elif self._has_code_like_patterns(sentence):
                 adjustments.append(1.15)
@@ -509,34 +486,34 @@ class TextHighlighter:
                adjustments.append(1.05)

        # Creative & informal domains
-       elif …
            if self._has_informal_language(sentence_lower):
-               adjustments.append(0.7)
-
            elif self._has_emotional_language(sentence):
                adjustments.append(0.8)
-
-           elif …
                adjustments.append(0.8)

        # Academic & formal domains
-       elif …
            if self._has_citation_patterns(sentence):
-               adjustments.append(0.8)
-
            elif self._has_technical_terms(sentence_lower):
                adjustments.append(1.1)
-
            elif (sentence_length > 40):
                adjustments.append(1.1)

        # Business & professional domains
-       elif …
            if self._has_business_jargon(sentence_lower):
-               adjustments.append(1.05)

            elif self._has_ambiguous_phrasing(sentence_lower):
-               adjustments.append(0.9)

            elif (15 <= sentence_length <= 25):
                adjustments.append(0.9)
@@ -544,8 +521,8 @@ class TextHighlighter:
        # Tutorial & educational domains
        elif (self.domain == Domain.TUTORIAL):
            if self._has_instructional_language(sentence_lower):
-               adjustments.append(0.85)
-
            elif self._has_step_by_step_pattern(sentence):
                adjustments.append(0.8)
@@ -564,15 +541,16 @@ class TextHighlighter:
        if adjustments:
            # Sort by impact (farthest from 1.0)
            adjustments.sort(key = lambda x: abs(x - 1.0), reverse = True)

            # Limit to 2 strongest
-           strongest_adjustments = adjustments[:2]

            for adjustment in strongest_adjustments:
-               …

-       # Ensure probability stays within bounds and doesn't change too drastically
-       max_change = 0.3
-       bounded_prob = max(original_prob - max_change, min(original_prob + max_change, …

        return max(0.0, min(1.0, bounded_prob))
@@ -585,7 +563,7 @@ class TextHighlighter:
        if (metric_name == "perplexity"):
            if (sentence_length < 8):
                return min(1.0, base_prob * 1.2)
-
            elif (sentence_length > 25):
                return max(0.0, base_prob * 0.8)
@@ -630,40 +608,55 @@ class TextHighlighter:
        return base_prob


-   def …
        """
-       …
        """
-       # …
-       if (…
-           return …

-       # …
-       if (…
-           return …

-       …
        for min_thresh, max_thresh, color_class, color_hex, tooltip in self.COLOR_THRESHOLDS:
-           if (min_thresh <= …
                return color_class, color_hex, tooltip

-       # Fallback for …
-       return "…

-   def _generate_ensemble_tooltip(self, sentence: str, …
-                                  tooltip_base: str, breakdown: Optional[Dict[str, float]] = None, …
        """
        Generate enhanced tooltip with ENSEMBLE information
        """
        tooltip = f"{tooltip_base}\n"

-       if …
-           tooltip += "🔀 …

-       tooltip += f"…
-       tooltip += f"…
-       tooltip += f"…
        tooltip += f"Confidence: {confidence:.1%} ({confidence_level.value.replace('_', ' ').title()})\n"
        tooltip += f"Domain: {self.domain.value.replace('_', ' ').title()}\n"
        tooltip += f"Length: {len(sentence.split())} words"
@@ -671,7 +664,7 @@ class TextHighlighter:
        if breakdown:
            tooltip += "\n\nMetric Breakdown:"
            # Show top 4 metrics
-           for metric, prob in list(breakdown.items())[:4]:
                tooltip += f"\n• {metric}: {prob:.1%}"

        tooltip += f"\n\nEnsemble Method: {getattr(self.ensemble, 'primary_method', 'fallback')}"
@@ -684,7 +677,6 @@ class TextHighlighter:
        Check for academic citation patterns
        """
        citation_indicators = ['et al.', 'ibid.', 'cf.', 'e.g.', 'i.e.', 'vol.', 'pp.', 'ed.', 'trans.', 'reference', 'cited', 'according to']
-
        return any(indicator in sentence.lower() for indicator in citation_indicators)


@@ -693,7 +685,6 @@ class TextHighlighter:
        Check for informal language patterns
        """
        informal_indicators = ['lol', 'omg', 'btw', 'imo', 'tbh', 'afaik', 'smh', '👋', '😂', '❤️', 'haha', 'wow', 'awesome']
-
        return any(indicator in sentence.lower() for indicator in informal_indicators)


@@ -713,7 +704,6 @@ class TextHighlighter:
        Check for ambiguous phrasing that might indicate human writing
        """
        ambiguous_indicators = ['perhaps', 'maybe', 'possibly', 'likely', 'appears to', 'seems to', 'might be', 'could be']
-
        return any(indicator in sentence.lower() for indicator in ambiguous_indicators)


@@ -726,7 +716,6 @@ class TextHighlighter:
            return False

        complex_indicators = ['which', 'that', 'although', 'because', 'while', 'when', 'if', 'however', 'therefore']
-
        return any(indicator in sentence.lower() for indicator in complex_indicators)


@@ -735,7 +724,6 @@ class TextHighlighter:
        Check for emotional or subjective language
        """
        emotional_indicators = ['feel', 'believe', 'think', 'wonder', 'hope', 'wish', 'love', 'hate', 'frustrating', 'exciting']
-
        return any(indicator in sentence.lower() for indicator in emotional_indicators)


@@ -744,7 +732,6 @@ class TextHighlighter:
        Check for business jargon
        """
        jargon_indicators = ['synergy', 'leverage', 'bandwidth', 'circle back', 'touch base', 'value add', 'core competency']
-
        return any(indicator in sentence.lower() for indicator in jargon_indicators)


@@ -753,7 +740,6 @@ class TextHighlighter:
        Check for instructional language patterns
        """
        instructional_indicators = ['step by step', 'firstly', 'secondly', 'finally', 'note that', 'remember to', 'make sure']
-
        return any(indicator in sentence.lower() for indicator in instructional_indicators)


@@ -762,7 +748,6 @@ class TextHighlighter:
        Check for step-by-step instructions
        """
        step_patterns = ['step 1', 'step 2', 'step 3', 'step one', 'step two', 'first step', 'next step']
-
        return any(pattern in sentence.lower() for pattern in step_patterns)


@@ -771,7 +756,6 @@ class TextHighlighter:
        Check for example indicators
        """
        example_indicators = ['for example', 'for instance', 'such as', 'e.g.', 'as an example']
-
        return any(indicator in sentence.lower() for indicator in example_indicators)


@@ -780,7 +764,6 @@ class TextHighlighter:
        Check for code-like patterns in technical domains
        """
        code_patterns = ['function', 'variable', 'class', 'method', 'import', 'def ', 'void ', 'public ', 'private ']
-
        return any(pattern in sentence for pattern in code_patterns)
@@ -802,19 +785,19 @@ class TextHighlighter:
        elif (len(words) > 25):
            score += 0.5

-       indicator_count …
-       score …

-       clause_indicators …
-       clause_count …
-       score …

        return min(1.0, score)


    def _has_repetition(self, sentence: str) -> bool:
        """
-       Check if sentence has word repetition (common in …
        """
        words = sentence.lower().split()
        if (len(words) < 6):
@@ -827,48 +810,29 @@ class TextHighlighter:
                word_counts[word] = word_counts.get(word, 0) + 1

        repeated_words = [word for word, count in word_counts.items() if count > 2]
-
-       return len(repeated_words) > 0


-   def …
-       """
-       Split the text chunk into multiple sentences
-       """
-       sentences = self.text_processor.split_sentences(text)
-       filtered_sentences = list()
-
-       for sentence in sentences:
-           clean_sentence = sentence.strip()
-
-           if (len(clean_sentence) >= 3):
-               filtered_sentences.append(clean_sentence)
-
-       return filtered_sentences
-
-
-   def generate_html(self, highlighted_sentences: List[HighlightedSentence], include_legend: bool = False, include_metrics: bool = True) -> str:
        """
        Generate HTML with highlighted sentences

        Arguments:
        ----------
-       highlighted_sentences { List[…
-
-       include_legend …
-
-       include_metrics { bool } : Whether to include metrics summary

        Returns:
        --------
-       { str } …
        """
        html_parts = list()

        # Add CSS
-       html_parts.append(self.…

-       # …
        if include_legend:
            html_parts.append(self._generate_legend_html())
@@ -876,33 +840,28 @@ class TextHighlighter:
        html_parts.append('<div class="highlighted-text">')

        for sent in highlighted_sentences:
-           extra_class = "…
            html_parts.append(f'<span class="highlight {sent.color_class}{extra_class}" '
-                             f'data-…
-                             f'data-…
-                             f'data-…
                              f'data-confidence="{sent.confidence:.4f}" '
                              f'data-confidence-level="{sent.confidence_level.value}" '
                              f'data-domain="{self.domain.value}" '
                              f'data-sentence-idx="{sent.index}" '
-                             f'data-is-…
                              f'title="{sent.tooltip}">'
                              f'{sent.text}'
-                             f'</span> '
-                             )

        html_parts.append('</div>')

-       # Add metrics summary if requested (separate from legend)
-       if include_metrics and highlighted_sentences:
-           html_parts.append(self._generate_metrics_summary(highlighted_sentences))
-
        return '\n'.join(html_parts)


-   def …
        """
-       Generate CSS for highlighting for …
        """
        return """
        <style>
@@ -936,277 +895,65 @@ class TextHighlighter:
            text-shadow: 0 1px 1px rgba(255,255,255,0.8);
        }

-       /* …
-       .…
-           background-color: #…
-           border-bottom-color: #…
-       }
-
-       .high-ai {
-           background-color: #fed7aa;
-           border-bottom-color: #f97316;
        }

-       …
            background-color: #fef3c7;
            border-bottom-color: #f59e0b;
        }

-       /* …
-       .…
-           background-color: #fef9c3;
-           border-bottom-color: #fbbf24;
-       }
-
-       /* Human indicators - Lighter backgrounds */
-       .medium-human {
-           background-color: #ecfccb;
-           border-bottom-color: #a3e635;
-       }
-
-       .high-human {
-           background-color: #bbf7d0;
-           border-bottom-color: #4ade80;
-       }
-
-       .very-high-human {
-           background-color: #dcfce7;
-           border-bottom-color: #22c55e;
-       }
-
-       /* Mixed content */
-       .mixed-content {
            background-color: #e9d5ff;
            border-bottom-color: #a855f7;
-           background-image: repeating-linear-gradient(45deg, transparent, transparent 5px, rgba(168, 85, 247, 0.1) 5px, rgba(168, 85, 247, 0.1) 10px);
        }

-       .…
            border: 2px dashed #a855f7;
        }

-       /* …
-       .…
-           …
-           background: #f9fafb;
-           border-radius: 8px;
-           border: 1px solid #e5e7eb;
-       }
-
-       .highlight-summary h4 {
-           margin: 0 0 10px 0;
-           font-size: 14px;
-           font-weight: 600;
-           color: #374151;
-       }
-
-       .summary-stats {
-           display: grid;
-           grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
-           gap: 10px;
-       }
-
-       .stat-item {
-           display: flex;
-           justify-content: space-between;
-           align-items: center;
-           padding: 8px 12px;
-           background: white;
-           border-radius: 6px;
-           border: 1px solid #e5e7eb;
-       }
-
-       .stat-label {
-           font-size: 13px;
-           color: #6b7280;
-       }
-
-       .stat-value {
-           font-size: 13px;
-           font-weight: 600;
-           color: #374151;
        }
        </style>
        """


-   def _generate_metrics_summary(self, sentences: List[HighlightedSentence]) -> str:
-       """
-       Generate summary statistics for highlighted sentences
-       """
-       if not sentences:
-           return ""
-
-       # Calculate summary metrics
-       total_sentences = len(sentences)
-
-       # Count sentences by category
-       very_high_ai = len([s for s in sentences if s.color_class == "very-high-ai"])
-       high_ai = len([s for s in sentences if s.color_class == "high-ai"])
-       medium_ai = len([s for s in sentences if s.color_class == "medium-ai"])
-       uncertain = len([s for s in sentences if s.color_class == "uncertain"])
-       medium_human = len([s for s in sentences if s.color_class == "medium-human"])
-       high_human = len([s for s in sentences if s.color_class == "high-human"])
-       very_high_human = len([s for s in sentences if s.color_class == "very-high-human"])
-       mixed = len([s for s in sentences if s.color_class == "mixed-content"])
-
-       # Calculate overall risk score (weighted average)
-       weighted_risk = 0.0
-       for sent in sentences:
-           weight = self.RISK_WEIGHTS.get(sent.color_class, 0.4)
-           weighted_risk += sent.ai_probability * weight
-
-       overall_risk_score = weighted_risk / total_sentences if total_sentences else 0.0
-
-       # Calculate average probabilities
-       avg_ai_prob = sum(s.ai_probability for s in sentences) / total_sentences
-       avg_human_prob = sum(s.human_probability for s in sentences) / total_sentences
-
-       # Sentence counts
-       ai_sentences = very_high_ai + high_ai + medium_ai
-       human_sentences = very_high_human + high_human + medium_human
-
-       html = f"""
-       <div class="highlight-summary">
-           <h4>📊 Text Analysis Summary</h4>
-           <div class="summary-stats">
-               <div class="stat-item">
-                   <span class="stat-label">Overall Risk Score</span>
-                   <span class="stat-value">{overall_risk_score:.1%}</span>
-               </div>
-               <div class="stat-item">
-                   <span class="stat-label">Average AI Probability</span>
-                   <span class="stat-value">{avg_ai_prob:.1%}</span>
-               </div>
-               <div class="stat-item">
-                   <span class="stat-label">AI Sentences</span>
-                   <span class="stat-value">{ai_sentences} ({ai_sentences/total_sentences:.1%})</span>
-               </div>
-               <div class="stat-item">
-                   <span class="stat-label">Human Sentences</span>
-                   <span class="stat-value">{human_sentences} ({human_sentences/total_sentences:.1%})</span>
-               </div>
-               <div class="stat-item">
-                   <span class="stat-label">Uncertain Sentences</span>
-                   <span class="stat-value">{uncertain} ({uncertain/total_sentences:.1%})</span>
-               </div>
-               <div class="stat-item">
-                   <span class="stat-label">Mixed Sentences</span>
-                   <span class="stat-value">{mixed} ({mixed/total_sentences:.1%})</span>
-               </div>
-               <div class="stat-item">
-                   <span class="stat-label">Total Sentences</span>
-                   <span class="stat-value">{total_sentences}</span>
-               </div>
-               <div class="stat-item">
-                   <span class="stat-label">Domain</span>
-                   <span class="stat-value">{self.domain.value.replace('_', ' ').title()}</span>
-               </div>
-           </div>
-       </div>
-       """
-       return html
-
-
    def _generate_legend_html(self) -> str:
        """
-       Generate legend HTML - …
        """
        return """
        <div class="highlight-legend" style="margin-bottom: 20px; padding: 15px; background: #f8fafc; border-radius: 8px; border: 1px solid #e2e8f0;">
-           <h4 style="margin: 0 0 10px 0; font-size: 14px; font-weight: 600; color: #374151;">…
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 8px;">
                <div style="display: flex; align-items: center; gap: 8px;">
-                   <div style="width: 16px; height: 16px; background: #…
-                   <span style="font-size: 12px; color: #374151;">…
-               </div>
-               <div style="display: flex; align-items: center; gap: 8px;">
-                   <div style="width: 16px; height: 16px; background: #bbf7d0; border: 1px solid #4ade80; border-radius: 3px;"></div>
-                   <span style="font-size: 12px; color: #374151;">Likely Human (10-25%)</span>
-               </div>
-               <div style="display: flex; align-items: center; gap: 8px;">
-                   <div style="width: 16px; height: 16px; background: #86efac; border: 1px solid #16a34a; border-radius: 3px;"></div>
-                   <span style="font-size: 12px; color: #374151;">Possibly Human (25-40%)</span>
-               </div>
-               <div style="display: flex; align-items: center; gap: 8px;">
-                   <div style="width: 16px; height: 16px; background: #fef9c3; border: 1px solid #fbbf24; border-radius: 3px;"></div>
-                   <span style="font-size: 12px; color: #374151;">Uncertain (40-60%)</span>
-               </div>
-               <div style="display: flex; align-items: center; gap: 8px;">
-                   <div style="width: 16px; height: 16px; background: #fde68a; border: 1px solid #f59e0b; border-radius: 3px;"></div>
-                   <span style="font-size: 12px; color: #374151;">Possibly AI (60-75%)</span>
                </div>
                <div style="display: flex; align-items: center; gap: 8px;">
-                   <div style="width: 16px; height: 16px; background: #…
-                   <span style="font-size: 12px; color: #374151;">…
                </div>
                <div style="display: flex; align-items: center; gap: 8px;">
-                   <div style="width: 16px; height: 16px; background: #…
-                   <span style="font-size: 12px; color: #374151;">…
                </div>
                <div style="display: flex; align-items: center; gap: 8px;">
-                   <div style="width: 16px; height: 16px; background: #…
-                   <span style="font-size: 12px; color: #374151;">…
                </div>
            </div>
        </div>
        """


-   def calculate_metrics(self, highlighted_sentences: List[HighlightedSentence]) -> Dict[str, float]:
-       """
-       Calculate metrics for external use
-
-       Arguments:
-       ----------
-       highlighted_sentences { List[HighlightedSentence] } : Sentences with highlighting data
-
-       Returns:
-       --------
-       { Dict[str, float] } : Dictionary with metrics
-       """
-       if not highlighted_sentences:
-           return {}
-
-       total_sentences = len(highlighted_sentences)
-
-       # Calculate weighted risk score
-       weighted_risk = 0.0
-
-       for sent in highlighted_sentences:
-           weight = self.RISK_WEIGHTS.get(sent.color_class, 0.4)
-           weighted_risk += sent.ai_probability * weight
-
-       overall_risk_score = weighted_risk / total_sentences
-
-       # Count sentences by category
-       ai_sentences = len([s for s in highlighted_sentences if s.ai_probability >= 0.6])
-       human_sentences = len([s for s in highlighted_sentences if s.ai_probability <= 0.4])
-       uncertain_sentences = len([s for s in highlighted_sentences if 0.4 < s.ai_probability < 0.6])
-       mixed_sentences = len([s for s in highlighted_sentences if s.is_mixed_content])
-
-       # Average probabilities
-       avg_ai_prob = sum(s.ai_probability for s in highlighted_sentences) / total_sentences
-       avg_human_prob = sum(s.human_probability for s in highlighted_sentences) / total_sentences
-       avg_confidence = sum(s.confidence for s in highlighted_sentences) / total_sentences
-
-       return {'overall_risk_score'        : overall_risk_score,
-               'avg_ai_probability'        : avg_ai_prob,
-               'avg_human_probability'     : avg_human_prob,
-               'avg_confidence'            : avg_confidence,
-               'ai_sentence_count'         : ai_sentences,
-               'human_sentence_count'      : human_sentences,
-               'uncertain_sentence_count'  : uncertain_sentences,
-               'mixed_sentence_count'      : mixed_sentences,
-               'total_sentences'           : total_sentences,
-               'ai_sentence_percentage'    : ai_sentences / total_sentences,
-               'human_sentence_percentage' : human_sentences / total_sentences,
-               }
-
-
# Export
-__all__ = ["TextHighlighter",
-          "HighlightedSentence",
-          ]
| 5 |
from typing import Tuple
|
| 6 |
from loguru import logger
|
| 7 |
from typing import Optional
|
| 8 |
+
from config.enums import Domain
|
| 9 |
+
from config.schemas import MetricResult
|
| 10 |
+
from config.schemas import EnsembleResult
|
|
|
|
|
|
|
| 11 |
from processors.text_processor import TextProcessor
|
| 12 |
from config.threshold_config import ConfidenceLevel
|
| 13 |
+
from config.schemas import HighlightedSentenceResult
|
| 14 |
from config.threshold_config import MetricThresholds
|
| 15 |
from config.threshold_config import get_confidence_level
|
| 16 |
+
from services.ensemble_classifier import EnsembleClassifier
|
| 17 |
from config.threshold_config import get_threshold_for_domain
|
| 18 |
from config.threshold_config import get_active_metric_weights
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
class TextHighlighter:
|
| 22 |
"""
|
| 23 |
Generates sentence-level highlighting with ensemble results integration
|
|
|
|
| 25 |
FEATURES:
|
| 26 |
- Sentence-level highlighting with confidence scores
|
| 27 |
- Domain-aware calibration
|
| 28 |
+
- Ensemble-assisted probability aggregation
|
| 29 |
+
- Hybrid content detection
|
| 30 |
- Explainable tooltips
|
|
|
|
| 31 |
"""
|
| 32 |
+
# Color thresholds - 4 categories
|
| 33 |
+
COLOR_THRESHOLDS = [(0.00, 0.40, "authentic", "#d1fae5", "Likely authentically written"), # Authentic: Synthetic probability < 0.4
|
| 34 |
+
(0.40, 0.60, "uncertain", "#fef3c7", "Uncertain authorship"), # Uncertain: 0.4 ≤ Synthetic probability < 0.6
|
| 35 |
+
(0.60, 0.80, "hybrid", "#e9d5ff", "Mixed synthetic/authentic content"), # Hybrid: 0.6 ≤ Synthetic probability < 0.8 OR explicit hybrid detection
|
| 36 |
+
(0.80, 1.01, "synthetic", "#fee2e2", "Likely synthetically generated"), # Synthetic: Synthetic probability ≥ 0.8
|
| 37 |
+
]
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
# Hybrid detection thresholds
|
| 40 |
+
HYBRID_PROB_THRESHOLD = 0.25 # Minimum hybrid probability to classify as hybrid
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
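The four bands above replace the earlier seven-class scheme. As a quick check on the boundary semantics, the sketch below replays the `COLOR_THRESHOLDS` lookup as a standalone function; the table is copied from the diff, while `classify_band` itself is an illustrative name, not part of the committed class.

```python
# Standalone replay of the COLOR_THRESHOLDS lookup; the table is copied
# verbatim from the diff, classify_band() is an illustrative helper only.
COLOR_THRESHOLDS = [
    (0.00, 0.40, "authentic", "#d1fae5", "Likely authentically written"),
    (0.40, 0.60, "uncertain", "#fef3c7", "Uncertain authorship"),
    (0.60, 0.80, "hybrid",    "#e9d5ff", "Mixed synthetic/authentic content"),
    (0.80, 1.01, "synthetic", "#fee2e2", "Likely synthetically generated"),
]

def classify_band(synthetic_prob: float) -> str:
    # Half-open intervals: 0.40 falls in "uncertain", not "authentic";
    # the 1.01 upper bound keeps an exact 1.0 inside "synthetic".
    for lo, hi, name, _hex, _tooltip in COLOR_THRESHOLDS:
        if lo <= synthetic_prob < hi:
            return name
    return "uncertain"   # same fallback the class uses for edge cases

assert classify_band(0.39) == "authentic"
assert classify_band(0.40) == "uncertain"
assert classify_band(1.00) == "synthetic"
```

The half-open intervals mean a probability sitting exactly on a boundary falls into the higher band, and the 1.01 upper bound keeps an exact 1.0 classified as synthetic.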

    def __init__(self, domain: Domain = Domain.GENERAL, ensemble_classifier: Optional[EnsembleClassifier] = None):
        """
        …
            )
        except Exception as e:
            logger.warning(f"Failed to create default ensemble: {e}. Using fallback mode.")
+           return EnsembleClassifier(primary_method = "domain_weighted",
+                                     fallback_method = "simple_average",
+                                     )
|
| 73 |
def generate_highlights(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult] = None,
|
| 74 |
+
enabled_metrics: Optional[Dict[str, bool]] = None, use_sentence_level: bool = True) -> List[HighlightedSentenceResult]:
|
| 75 |
"""
|
| 76 |
Generate sentence-level highlights with ensemble integration
|
| 77 |
|
| 78 |
Arguments:
|
| 79 |
----------
|
| 80 |
text { str } : Original text
|
| 81 |
+
|
| 82 |
+
metric_results { dict } : Results from all metrics
|
| 83 |
|
| 84 |
ensemble_result { EnsembleResult } : Optional document-level ensemble result
|
| 85 |
|
|
|
|
| 89 |
|
| 90 |
Returns:
|
| 91 |
--------
|
| 92 |
+
{ list } : List of HighlightedSentenceResult objects
|
| 93 |
"""
|
| 94 |
try:
|
| 95 |
# Validate inputs
|
| 96 |
if not text or not text.strip():
|
| 97 |
+
return self._handle_empty_text(text = text,
|
| 98 |
+
metric_results = metric_results,
|
| 99 |
+
ensemble_result = ensemble_result,
|
| 100 |
+
)
|
| 101 |
|
| 102 |
# Get domain-appropriate weights for enabled metrics
|
| 103 |
if enabled_metrics is None:
|
|
|
|
| 106 |
weights = get_active_metric_weights(self.domain, enabled_metrics)
|
| 107 |
|
| 108 |
# Split text into sentences with error handling
|
| 109 |
+
sentences = self._split_sentences_with_fallback(text = text)
|
| 110 |
|
| 111 |
if not sentences:
|
| 112 |
return self._handle_no_sentences(text, metric_results, ensemble_result)
|
|
|
|
| 117 |
for idx, sentence in enumerate(sentences):
|
| 118 |
try:
|
| 119 |
if use_sentence_level:
|
| 120 |
+
# Use ensemble for sentence-level analysis
|
| 121 |
+
synthetic_prob, authentic_prob, hybrid_prob, confidence, breakdown = self._calculate_sentence_ensemble_probability(sentence = sentence,
|
| 122 |
+
metric_results = metric_results,
|
| 123 |
+
weights = weights,
|
| 124 |
+
ensemble_result = ensemble_result,
|
| 125 |
+
)
|
| 126 |
else:
|
| 127 |
# Use document-level ensemble probabilities
|
| 128 |
+
synthetic_prob, authentic_prob, hybrid_prob, confidence, breakdown = self._get_document_ensemble_probability(ensemble_result = ensemble_result,
|
| 129 |
+
metric_results = metric_results,
|
| 130 |
+
weights = weights,
|
| 131 |
+
)
|
| 132 |
|
| 133 |
# Apply domain-specific adjustments with limits
|
| 134 |
+
synthetic_prob = self._apply_domain_specific_adjustments(sentence = sentence,
|
| 135 |
+
synthetic_prob = synthetic_prob,
|
| 136 |
sentence_length = len(sentence.split()),
|
| 137 |
)
|
| 138 |
|
| 139 |
+
# Determine if this is hybrid content
|
| 140 |
+
is_hybrid_content = self._is_hybrid_content(synthetic_prob = synthetic_prob,
|
| 141 |
+
hybrid_prob = hybrid_prob,
|
| 142 |
+
confidence = confidence,
|
| 143 |
+
)
|
| 144 |
|
| 145 |
# Get confidence level
|
| 146 |
confidence_level = get_confidence_level(confidence)
|
| 147 |
|
| 148 |
+
# Get color class (consider hybrid content)
|
| 149 |
+
color_class, color_hex, tooltip_base = self._get_color_for_probability(synthetic_prob = synthetic_prob,
|
| 150 |
+
is_hybrid_content = is_hybrid_content,
|
| 151 |
+
hybrid_prob = hybrid_prob,
|
| 152 |
)
|
| 153 |
|
| 154 |
# Generate enhanced tooltip
|
| 155 |
+
tooltip = self._generate_ensemble_tooltip(sentence = sentence,
|
| 156 |
+
synthetic_prob = synthetic_prob,
|
| 157 |
+
authentic_prob = authentic_prob,
|
| 158 |
+
hybrid_prob = hybrid_prob,
|
| 159 |
+
confidence = confidence,
|
| 160 |
+
confidence_level = confidence_level,
|
| 161 |
+
tooltip_base = tooltip_base,
|
| 162 |
+
breakdown = breakdown,
|
| 163 |
+
is_hybrid_content = is_hybrid_content,
|
| 164 |
)
|
| 165 |
|
| 166 |
+
highlighted_sentences.append(HighlightedSentenceResult(text = sentence,
|
| 167 |
+
synthetic_probability = synthetic_prob,
|
| 168 |
+
authentic_probability = authentic_prob,
|
| 169 |
+
hybrid_probability = hybrid_prob,
|
| 170 |
+
confidence = confidence,
|
| 171 |
+
confidence_level = confidence_level,
|
| 172 |
+
color_class = color_class,
|
| 173 |
+
tooltip = tooltip,
|
| 174 |
+
index = idx,
|
| 175 |
+
is_hybrid_content = is_hybrid_content,
|
| 176 |
+
metric_breakdown = breakdown,
|
| 177 |
+
)
|
| 178 |
)
|
| 179 |
|
| 180 |
except Exception as e:
|
|
|
|
| 189 |
return self._create_error_fallback(text, metric_results)
|
| 190 |
|
| 191 |
|
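A minimal driver for the reworked entry point, assuming the repo's package layout is importable. The `MetricResult` keyword fields are copied from the `_create_sentence_metric_result` call later in this diff; whether the schema accepts exactly this keyword construction is an assumption, and the probability values are invented for illustration.

```python
# Hypothetical usage sketch, not part of the commit.
from config.enums import Domain
from config.schemas import MetricResult
from services.highlighter import TextHighlighter

# Two stand-in document-level metric results (values invented).
metric_results = {
    "perplexity": MetricResult(metric_name = "perplexity",
                               synthetic_probability = 0.72,
                               authentic_probability = 0.28,
                               hybrid_probability = 0.0,
                               confidence = 0.8,
                               details = {},
                               error = None),
    "entropy": MetricResult(metric_name = "entropy",
                            synthetic_probability = 0.55,
                            authentic_probability = 0.45,
                            hybrid_probability = 0.0,
                            confidence = 0.6,
                            details = {},
                            error = None),
}

highlighter = TextHighlighter(domain = Domain.GENERAL)
sentences = highlighter.generate_highlights(text = "First sentence. Second sentence.",
                                            metric_results = metric_results)
for s in sentences:
    print(s.index, s.color_class, f"{s.synthetic_probability:.2f}")
```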
+   def _handle_empty_text(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult]) -> List[HighlightedSentenceResult]:
        """
        Handle empty input text
        """
        if ensemble_result:
+           return [self._create_fallback_sentence(text = "No text content",
+                                                  index = 0,
+                                                  synthetic_prob = ensemble_result.synthetic_probability,
+                                                  authentic_prob = ensemble_result.authentic_probability,
                                                   )
                    ]

        return [self._create_fallback_sentence("No text content", 0)]


+   def _handle_no_sentences(self, text: str, metric_results: Dict[str, MetricResult], ensemble_result: Optional[EnsembleResult]) -> List[HighlightedSentenceResult]:
        """
        Handle case where no sentences could be extracted
        """
+       if text and text.strip():
            # Treat entire text as one sentence
            return [self._create_fallback_sentence(text.strip(), 0)]
+
        return [self._create_fallback_sentence("No processable content", 0)]


+   def _create_fallback_sentence(self, text: str, index: int, synthetic_prob: float = 0.5, authentic_prob: float = 0.5) -> HighlightedSentenceResult:
        """
        Create a fallback sentence when processing fails
        """
        confidence_level = get_confidence_level(0.3)
+       color_class, _, tooltip_base = self._get_color_for_probability(synthetic_prob = synthetic_prob,
+                                                                      is_hybrid_content = False,
+                                                                      hybrid_prob = 0.0,
                                                                       )

+       return HighlightedSentenceResult(text = text,
+                                        synthetic_probability = synthetic_prob,
+                                        authentic_probability = authentic_prob,
+                                        hybrid_probability = 0.0,
+                                        confidence = 0.3,
+                                        confidence_level = confidence_level,
+                                        color_class = color_class,
+                                        tooltip = f"Fallback: {tooltip_base}\nProcessing failed for this sentence",
+                                        index = index,
+                                        is_hybrid_content = False,
+                                        metric_breakdown = {"fallback": synthetic_prob},
+                                        )
+
+
+   def _create_error_fallback(self, text: str, metric_results: Dict[str, MetricResult]) -> List[HighlightedSentenceResult]:
        """
        Create fallback when entire processing fails
        """
+       return [HighlightedSentenceResult(text = text[:100] + "..." if len(text) > 100 else text,
+                                         synthetic_probability = 0.5,
+                                         authentic_probability = 0.5,
+                                         hybrid_probability = 0.0,
+                                         confidence = 0.1,
+                                         confidence_level = get_confidence_level(0.1),
+                                         color_class = "uncertain",
+                                         tooltip = "Error in text processing",
+                                         index = 0,
+                                         is_hybrid_content = False,
+                                         metric_breakdown = {"error": 0.5},
+                                         )
                ]

        …
        """
        try:
            sentences = self.text_processor.split_sentences(text)
+           filtered_sentences = [s.strip() for s in sentences if len(s.strip()) >= 3]

            if filtered_sentences:
                return filtered_sentences
            …
        return [text] if text.strip() else []


+   def _calculate_sentence_ensemble_probability(self, sentence: str, metric_results: Dict[str, MetricResult], weights: Dict[str, float],
+                                                ensemble_result: Optional[EnsembleResult] = None) -> Tuple[float, float, float, float, Dict[str, float]]:
        """
        Calculate sentence probabilities using ensemble methods with domain calibration
        """
        sentence_length = len(sentence.split())

+       # Handling very short sentences – do not force neutral, but reduce confidence
+       if (sentence_length < 3):
+           base_synthetic_prob = 0.5
+           base_confidence = 0.2
+           breakdown = {"short_sentence": base_synthetic_prob}

            for name, result in metric_results.items():
+               if (result.error is None and weights.get(name, 0.0) > 0):
+                   base_synthetic_prob = result.synthetic_probability
+                   breakdown[name] = base_synthetic_prob
                    break
+
+           return (base_synthetic_prob,
+                   1.0 - base_synthetic_prob,
+                   0.0,
+                   base_confidence,
+                   breakdown
+                   )
+
+       # Build sentence-level metric results
        sentence_metric_results = dict()
        breakdown = dict()
+
        for name, doc_result in metric_results.items():
+           if doc_result.error is not None:
+               continue
+
+           try:
+               sentence_prob = self._compute_sentence_metric(metric_name = name,
+                                                             sentence = sentence,
+                                                             result = doc_result,
+                                                             weight = weights.get(name, 0.0),
+                                                             )
+
+               sentence_metric_results[name] = self._create_sentence_metric_result(metric_name = name,
+                                                                                   synthetic_prob = sentence_prob,
+                                                                                   doc_result = doc_result,
+                                                                                   sentence_length = sentence_length,
+                                                                                   )
+
+               breakdown[name] = sentence_prob
+
+           except Exception as e:
+               logger.warning(f"Metric {name} failed for sentence: {e}")
+               breakdown[name] = doc_result.synthetic_probability
+
+       # Ensemble aggregation (PRIMARY PATH)
        if sentence_metric_results:
            try:
                ensemble_sentence_result = self.ensemble.predict(metric_results = sentence_metric_results,
                                                                 domain = self.domain,
                                                                 )
+
+               return (ensemble_sentence_result.synthetic_probability,
+                       ensemble_sentence_result.authentic_probability,
+                       ensemble_sentence_result.hybrid_probability,
+                       ensemble_sentence_result.overall_confidence,
+                       breakdown,
+                       )
+
            except Exception as e:
                logger.warning(f"Sentence ensemble failed: {e}")
+
+       # Fallback: weighted average aggregation
+       return self._fallback_weighted_probability(metric_results, weights, breakdown)

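The short-sentence branch above is worth a second look: under three words, the method no longer forces a neutral verdict; it keeps a 0.5 prior at confidence 0.2 unless the first error-free, positively weighted metric supplies a document-level probability to reuse. A standalone restatement, with a plain dict standing in for `MetricResult` objects:

```python
# Plain-dict restatement of the short-sentence branch: metric_probs maps
# metric name -> document-level synthetic probability (error-free metrics
# only), weights as in the diff. Returns the same 5-tuple shape.
def short_sentence_prior(metric_probs: dict, weights: dict) -> tuple:
    base_prob, base_conf = 0.5, 0.2
    breakdown = {"short_sentence": base_prob}
    for name, prob in metric_probs.items():
        if weights.get(name, 0.0) > 0:
            base_prob = prob            # first usable metric wins
            breakdown[name] = prob
            break
    return base_prob, 1.0 - base_prob, 0.0, base_conf, breakdown

print(short_sentence_prior({"perplexity": 0.75}, {"perplexity": 0.3}))
# (0.75, 0.25, 0.0, 0.2, {'short_sentence': 0.5, 'perplexity': 0.75})
```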
    def _compute_sentence_metric(self, metric_name: str, sentence: str, result: MetricResult, weight: float) -> float:
        """
        …
        # Get domain-specific threshold for this metric
        metric_thresholds = getattr(self.domain_thresholds, metric_name, None)
+
        if not metric_thresholds:
+           return result.synthetic_probability

        # Base probability from document-level result
+       base_prob = result.synthetic_probability

        # Apply domain-aware sentence-level adjustments
+       adjusted_prob = self._apply_metric_specific_adjustments(metric_name = metric_name,
+                                                               sentence = sentence,
+                                                               base_prob = base_prob,
+                                                               sentence_length = sentence_length,
                                                                thresholds = metric_thresholds,
                                                                )

        return adjusted_prob


+   def _create_sentence_metric_result(self, metric_name: str, synthetic_prob: float, doc_result: MetricResult, sentence_length: int) -> MetricResult:
        """
        Create sentence-level MetricResult from document-level result
        """
+       # Calculate confidence based on sentence characteristics
+       sentence_confidence = self._calculate_sentence_confidence(doc_confidence = doc_result.confidence,
+                                                                 sentence_length = sentence_length,
+                                                                 )
+
+       return MetricResult(metric_name = metric_name,
+                           synthetic_probability = synthetic_prob,
+                           authentic_probability = 1.0 - synthetic_prob,
+                           hybrid_probability = 0.0,
+                           confidence = sentence_confidence,
+                           details = doc_result.details,
+                           error = None,
                            )

    def _calculate_sentence_confidence(self, doc_confidence: float, sentence_length: int) -> float:
        """
+       Calculate confidence for sentence-level analysis with length consideration
        """
        base_reduction = 0.8
        # Scale confidence penalty with sentence length
        …
        return max(0.1, doc_confidence * base_reduction * length_penalty)
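The lines holding the `length_penalty` formula (415-416) are collapsed in this view, so the sketch below assumes a simple ramp capped at 1.0 purely to make the damping concrete; the 0.8 `base_reduction` and the 0.1 floor are taken from the code above, everything else is an assumption.

```python
# Sketch of the confidence damping, under an ASSUMED length_penalty:
# the diff collapses the real formula, so a linear ramp is used here.
def sentence_confidence(doc_confidence: float, sentence_length: int) -> float:
    base_reduction = 0.8                               # from the diff
    length_penalty = min(1.0, sentence_length / 10)    # assumed ramp, not from the diff
    return max(0.1, doc_confidence * base_reduction * length_penalty)

# A 5-word sentence under this assumption: 0.9 * 0.8 * 0.5 = 0.36
print(round(sentence_confidence(0.9, 5), 2))   # 0.36
```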
+   def _fallback_weighted_probability(self, metric_results: Dict[str, MetricResult], weights: Dict[str, float],
+                                      breakdown: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
        """
        Fallback weighted probability calculation
        """
+       weighted_synthetic_probs = list()
+       weighted_authentic_probs = list()
+       confidences = list()
+       total_weight = 0.0

        for name, result in metric_results.items():
+           if result.error is None:
                weight = weights.get(name, 0.0)

                if (weight > 0):
+                   weighted_synthetic_probs.append(result.synthetic_probability * weight)
+                   weighted_authentic_probs.append(result.authentic_probability * weight)
                    confidences.append(result.confidence)
                    total_weight += weight

+       if not weighted_synthetic_probs or total_weight == 0:
            return 0.5, 0.5, 0.0, 0.5, breakdown or {}

+       synthetic_prob = sum(weighted_synthetic_probs) / total_weight
+       authentic_prob = sum(weighted_authentic_probs) / total_weight
+       hybrid_prob = 0.0   # Fallback
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.5

+       return synthetic_prob, authentic_prob, hybrid_prob, avg_confidence, breakdown


+   def _get_document_ensemble_probability(self, ensemble_result: Optional[EnsembleResult], metric_results: Dict[str, MetricResult],
+                                          weights: Dict[str, float]) -> Tuple[float, float, float, float, Dict[str, float]]:
        """
        Get document-level ensemble probability
        """
        if ensemble_result:
            # Use existing ensemble result
+           breakdown = {name: result.synthetic_probability for name, result in metric_results.items()}
+           return (ensemble_result.synthetic_probability,
+                   ensemble_result.authentic_probability,
+                   ensemble_result.hybrid_probability,
+                   ensemble_result.overall_confidence,
+                   breakdown
+                   )
+
        else:
            # Calculate from metrics
+           return self._fallback_weighted_probability(metric_results, weights, {})
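Restated outside the class, the fallback is a weight-normalized average with a flat neutral result when nothing usable survives. Plain `(synthetic_prob, confidence)` tuples stand in for `MetricResult` objects here, and the authentic side is simplified to `1 - synthetic` rather than averaged separately as the committed code does.

```python
# Simplified standalone form of the fallback aggregation above.
def fallback_weighted(metrics: dict, weights: dict) -> tuple:
    weighted_probs, confidences, total_weight = [], [], 0.0
    for name, (synthetic_prob, confidence) in metrics.items():
        weight = weights.get(name, 0.0)
        if weight > 0:
            weighted_probs.append(synthetic_prob * weight)
            confidences.append(confidence)
            total_weight += weight
    if not weighted_probs or total_weight == 0:
        return 0.5, 0.5, 0.0, 0.5            # neutral result, as in the diff
    synthetic = sum(weighted_probs) / total_weight
    avg_conf = sum(confidences) / len(confidences)
    return synthetic, 1.0 - synthetic, 0.0, avg_conf

# Two metrics, weights 0.6 / 0.4: (0.8*0.6 + 0.5*0.4) / 1.0 = 0.68
print(fallback_weighted({"perplexity": (0.8, 0.9), "entropy": (0.5, 0.7)},
                        {"perplexity": 0.6, "entropy": 0.4}))
```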
|
| 469 |
+
def _apply_domain_specific_adjustments(self, sentence: str, synthetic_prob: float, sentence_length: int) -> float:
|
| 470 |
"""
|
| 471 |
+
Apply domain-specific adjustments to Synthetic probability with limits
|
| 472 |
"""
|
| 473 |
+
original_prob = synthetic_prob
|
| 474 |
adjustments = list()
|
| 475 |
sentence_lower = sentence.lower()
|
| 476 |
|
| 477 |
# Technical & AI/ML domains
|
| 478 |
+
if self.domain in [Domain.AI_ML, Domain.SOFTWARE_DEV, Domain.TECHNICAL_DOC, Domain.ENGINEERING, Domain.SCIENCE]:
|
| 479 |
if self._has_technical_terms(sentence_lower):
|
| 480 |
+
adjustments.append(1.1)
|
| 481 |
+
|
| 482 |
elif self._has_code_like_patterns(sentence):
|
| 483 |
adjustments.append(1.15)
|
| 484 |
|
|
|
|
| 486 |
adjustments.append(1.05)
|
| 487 |
|
| 488 |
# Creative & informal domains
|
| 489 |
+
elif self.domain in [Domain.CREATIVE, Domain.SOCIAL_MEDIA, Domain.BLOG_PERSONAL]:
|
| 490 |
if self._has_informal_language(sentence_lower):
|
| 491 |
+
adjustments.append(0.7)
|
| 492 |
+
|
| 493 |
elif self._has_emotional_language(sentence):
|
| 494 |
adjustments.append(0.8)
|
| 495 |
+
|
| 496 |
+
elif sentence_length < 10:
|
| 497 |
adjustments.append(0.8)
|
| 498 |
|
| 499 |
# Academic & formal domains
|
| 500 |
+
elif self.domain in [Domain.ACADEMIC, Domain.LEGAL, Domain.MEDICAL]:
|
| 501 |
if self._has_citation_patterns(sentence):
|
| 502 |
+
adjustments.append(0.8)
|
| 503 |
+
|
| 504 |
elif self._has_technical_terms(sentence_lower):
|
| 505 |
adjustments.append(1.1)
|
| 506 |
+
|
| 507 |
elif (sentence_length > 40):
|
| 508 |
adjustments.append(1.1)
|
| 509 |
|
| 510 |
# Business & professional domains
|
| 511 |
+
elif self.domain in [Domain.BUSINESS, Domain.MARKETING, Domain.JOURNALISM]:
|
| 512 |
if self._has_business_jargon(sentence_lower):
|
| 513 |
+
adjustments.append(1.05)
|
| 514 |
|
| 515 |
elif self._has_ambiguous_phrasing(sentence_lower):
|
| 516 |
+
adjustments.append(0.9)
|
| 517 |
|
| 518 |
elif (15 <= sentence_length <= 25):
|
| 519 |
adjustments.append(0.9)
|
|
|
|
| 521 |
# Tutorial & educational domains
|
| 522 |
elif (self.domain == Domain.TUTORIAL):
|
| 523 |
if self._has_instructional_language(sentence_lower):
|
| 524 |
+
adjustments.append(0.85)
|
| 525 |
+
|
| 526 |
elif self._has_step_by_step_pattern(sentence):
|
| 527 |
adjustments.append(0.8)
|
| 528 |
|
|
|
|
| 541 |
if adjustments:
|
| 542 |
# Sort by impact (farthest from 1.0)
|
| 543 |
adjustments.sort(key = lambda x: abs(x - 1.0), reverse = True)
|
| 544 |
+
|
| 545 |
# Limit to 2 strongest
|
| 546 |
+
strongest_adjustments = adjustments[:2]
|
| 547 |
|
| 548 |
for adjustment in strongest_adjustments:
|
| 549 |
+
synthetic_prob *= adjustment
|
| 550 |
|
| 551 |
+
# Ensure probability stays within bounds and doesn't change too drastically
|
| 552 |
+
max_change = 0.3 # Maximum 30% change from original
|
| 553 |
+
bounded_prob = max(original_prob - max_change, min(original_prob + max_change, synthetic_prob))
|
| 554 |
|
| 555 |
return max(0.0, min(1.0, bounded_prob))
|
| 556 |
|
|
|
|
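The bounding logic is easy to misread, so a worked number helps: with `original_prob = 0.5` and the two strongest technical-domain boosts 1.15 and 1.1, the raw product is 0.5 × 1.15 × 1.1 = 0.6325, which sits inside the ±0.3 window and passes through unchanged; only products straying more than 0.3 from the original get clamped. A standalone restatement:

```python
# Standalone restatement of the adjustment bounding above: apply at most the
# two strongest multipliers, then clamp to +/-0.3 of the original and to [0, 1].
def bounded_adjust(original_prob: float, adjustments: list) -> float:
    strongest = sorted(adjustments, key = lambda x: abs(x - 1.0), reverse = True)[:2]
    prob = original_prob
    for adjustment in strongest:
        prob *= adjustment
    max_change = 0.3
    bounded = max(original_prob - max_change, min(original_prob + max_change, prob))
    return max(0.0, min(1.0, bounded))

print(bounded_adjust(0.5, [1.1, 1.15, 1.05]))   # ~0.6325, within the window
print(bounded_adjust(0.9, [1.15, 1.1]))         # raw 1.1385, clamped to 1.0
```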
        if (metric_name == "perplexity"):
            if (sentence_length < 8):
                return min(1.0, base_prob * 1.2)
+
            elif (sentence_length > 25):
                return max(0.0, base_prob * 0.8)
        …
        return base_prob


+   def _is_hybrid_content(self, synthetic_prob: float, hybrid_prob: float, confidence: float) -> bool:
        """
+       Determine if content should be classified as hybrid
        """
+       # Case 1: Explicit high hybrid probability from ensemble
+       if (hybrid_prob > self.HYBRID_PROB_THRESHOLD):
+           return True
+
+       # Case 2: High uncertainty combined with ambiguous synthetic probability
+       if (confidence < 0.3 and 0.4 <= synthetic_prob <= 0.7):
+           return True

+       # Case 3: Synthetic probability in hybrid range (0.6-0.8)
+       if (0.6 <= synthetic_prob < 0.8):
+           return True

+       return False
+
+
+   def _get_color_for_probability(self, synthetic_prob: float, is_hybrid_content: bool = False, hybrid_prob: float = 0.0) -> Tuple[str, str, str]:
+       """
+       Get color class with simplified 4-category system
+       """
+       # Handle hybrid content first
+       if is_hybrid_content:
+           return "hybrid", "#e9d5ff", f"Mixed synthetic/authentic content ({hybrid_prob:.1%} hybrid)"
+
+       # Iterate through simplified thresholds
        for min_thresh, max_thresh, color_class, color_hex, tooltip in self.COLOR_THRESHOLDS:
+           if (min_thresh <= synthetic_prob < max_thresh):
                return color_class, color_hex, tooltip

+       # Fallback for edge cases
+       return "uncertain", "#fef3c7", "Uncertain authorship"
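The three hybrid triggers above, restated as a standalone predicate (same constant, `HYBRID_PROB_THRESHOLD = 0.25`; the function name is ours):

```python
# Standalone restatement of _is_hybrid_content's three triggers.
def is_hybrid(synthetic_prob: float, hybrid_prob: float, confidence: float,
              hybrid_threshold: float = 0.25) -> bool:
    if hybrid_prob > hybrid_threshold:                        # explicit ensemble signal
        return True
    if confidence < 0.3 and 0.4 <= synthetic_prob <= 0.7:     # low-confidence ambiguity
        return True
    return 0.6 <= synthetic_prob < 0.8                        # probability in the hybrid band

assert is_hybrid(0.5, 0.3, 0.9)        # trigger 1
assert is_hybrid(0.5, 0.0, 0.2)        # trigger 2
assert is_hybrid(0.65, 0.0, 0.9)       # trigger 3
assert not is_hybrid(0.9, 0.0, 0.9)    # plain synthetic
```

Note the third trigger makes the whole [0.6, 0.8) band hybrid regardless of `hybrid_prob`, which is what keeps this predicate consistent with the `COLOR_THRESHOLDS` table.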
+   def _generate_ensemble_tooltip(self, sentence: str, synthetic_prob: float, authentic_prob: float, hybrid_prob: float, confidence: float, confidence_level: ConfidenceLevel,
+                                  tooltip_base: str, breakdown: Optional[Dict[str, float]] = None, is_hybrid_content: bool = False) -> str:
        """
        Generate enhanced tooltip with ENSEMBLE information
        """
        tooltip = f"{tooltip_base}\n"

+       if is_hybrid_content:
+           tooltip += "🔀 HYBRID CONTENT DETECTED\n"

+       tooltip += f"Synthetic Probability: {synthetic_prob:.1%}\n"
+       tooltip += f"Authentic Probability: {authentic_prob:.1%}\n"
+       tooltip += f"Hybrid Probability: {hybrid_prob:.1%}\n"
        tooltip += f"Confidence: {confidence:.1%} ({confidence_level.value.replace('_', ' ').title()})\n"
        tooltip += f"Domain: {self.domain.value.replace('_', ' ').title()}\n"
        tooltip += f"Length: {len(sentence.split())} words"
        …
        if breakdown:
            tooltip += "\n\nMetric Breakdown:"
            # Show top 4 metrics
+           for metric, prob in list(breakdown.items())[:4]:
                tooltip += f"\n• {metric}: {prob:.1%}"

        tooltip += f"\n\nEnsemble Method: {getattr(self.ensemble, 'primary_method', 'fallback')}"
        … (the `_has_citation_patterns` through `_has_code_like_patterns` helper predicates are unchanged apart from the blank-line removals shown above) …

        elif (len(words) > 25):
            score += 0.5

+       indicator_count = sum(1 for indicator in complexity_indicators if indicator in sentence.lower())
+       score += min(0.5, indicator_count * 0.1)

+       clause_indicators = [',', ';', 'and', 'but', 'or', 'because', 'although']
+       clause_count = sum(1 for indicator in clause_indicators if indicator in sentence.lower())
+       score += min(0.2, clause_count * 0.05)

        return min(1.0, score)


    def _has_repetition(self, sentence: str) -> bool:
        """
+       Check if sentence has word repetition (common in synthetic text)
        """
        words = sentence.lower().split()
        if (len(words) < 6):
        …
                word_counts[word] = word_counts.get(word, 0) + 1

        repeated_words = [word for word, count in word_counts.items() if count > 2]
+       return (len(repeated_words) > 0)

+   def generate_html(self, highlighted_sentences: List[HighlightedSentenceResult], include_legend: bool = True) -> str:
        """
        Generate HTML with highlighted sentences

        Arguments:
        ----------
+       highlighted_sentences { List[HighlightedSentenceResult] } : Sentences with highlighting data
+
+       include_legend { bool } : Whether to include legend

        Returns:
        --------
+       { str } : HTML content
        """
        html_parts = list()

        # Add CSS
+       html_parts.append(self._generate_css())

+       # Include legend if requested
        if include_legend:
            html_parts.append(self._generate_legend_html())
        …
        html_parts.append('<div class="highlighted-text">')

        for sent in highlighted_sentences:
+           extra_class = " hybrid-highlight" if sent.is_hybrid_content else ""
            html_parts.append(f'<span class="highlight {sent.color_class}{extra_class}" '
+                             f'data-synthetic-prob="{sent.synthetic_probability:.4f}" '
+                             f'data-authentic-prob="{sent.authentic_probability:.4f}" '
+                             f'data-hybrid-prob="{sent.hybrid_probability:.4f}" '
                              f'data-confidence="{sent.confidence:.4f}" '
                              f'data-confidence-level="{sent.confidence_level.value}" '
                              f'data-domain="{self.domain.value}" '
                              f'data-sentence-idx="{sent.index}" '
+                             f'data-is-hybrid="{str(sent.is_hybrid_content).lower()}" '
                              f'title="{sent.tooltip}">'
                              f'{sent.text}'
+                             f'</span> ')

        html_parts.append('</div>')

        return '\n'.join(html_parts)
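Continuing the hypothetical driver from earlier, the rendered markup can be dropped into a standalone preview page (the file name is ours, not part of the commit). One thing worth flagging in review: `sent.text` and `sent.tooltip` are interpolated into the markup unescaped, so angle brackets or quotes in the input text would land in the HTML as-is.

```python
# Render the highlights from the earlier driver to a preview file.
html = highlighter.generate_html(highlighted_sentences = sentences,
                                 include_legend = True)
with open("highlight_preview.html", "w", encoding = "utf-8") as fh:
    fh.write(f"<!doctype html>\n<html><body>\n{html}\n</body></html>")
```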
+   def _generate_css(self) -> str:
        """
+       Generate CSS for highlighting for better readability with 4 color types
        """
        return """
        <style>
        …
            text-shadow: 0 1px 1px rgba(255,255,255,0.8);
        }

+       /* Authentic - Green tones */
+       .authentic {
+           background-color: #d1fae5;
+           border-bottom-color: #10b981;
        }

+       /* Uncertain - Yellow tones */
+       .uncertain {
            background-color: #fef3c7;
            border-bottom-color: #f59e0b;
        }

+       /* Hybrid - Purple tones */
+       .hybrid {
            background-color: #e9d5ff;
            border-bottom-color: #a855f7;
        }

+       .hybrid-highlight:hover {
            border: 2px dashed #a855f7;
        }

+       /* Synthetic - Red tones */
+       .synthetic {
+           background-color: #fee2e2;
+           border-bottom-color: #ef4444;
        }
        </style>
        """
    def _generate_legend_html(self) -> str:
        """
+       Generate legend HTML for 4-category system
        """
        return """
        <div class="highlight-legend" style="margin-bottom: 20px; padding: 15px; background: #f8fafc; border-radius: 8px; border: 1px solid #e2e8f0;">
+           <h4 style="margin: 0 0 10px 0; font-size: 14px; font-weight: 600; color: #374151;">Text Analysis Legend</h4>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 8px;">
                <div style="display: flex; align-items: center; gap: 8px;">
+                   <div style="width: 16px; height: 16px; background: #d1fae5; border: 1px solid #10b981; border-radius: 3px;"></div>
+                   <span style="font-size: 12px; color: #374151;">Authentic (0-40% synthetic)</span>
                </div>
                <div style="display: flex; align-items: center; gap: 8px;">
+                   <div style="width: 16px; height: 16px; background: #fef3c7; border: 1px solid #f59e0b; border-radius: 3px;"></div>
+                   <span style="font-size: 12px; color: #374151;">Uncertain (40-60% synthetic)</span>
                </div>
                <div style="display: flex; align-items: center; gap: 8px;">
+                   <div style="width: 16px; height: 16px; background: #e9d5ff; border: 1px solid #a855f7; border-radius: 3px;"></div>
+                   <span style="font-size: 12px; color: #374151;">Hybrid (60-80% synthetic)</span>
                </div>
                <div style="display: flex; align-items: center; gap: 8px;">
+                   <div style="width: 16px; height: 16px; background: #fee2e2; border: 1px solid #ef4444; border-radius: 3px;"></div>
+                   <span style="font-size: 12px; color: #374151;">Synthetic (80-100% synthetic)</span>
                </div>
            </div>
        </div>
        """
# Export
+__all__ = ["TextHighlighter"]
services/orchestrator.py
ADDED
@@ -0,0 +1,753 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# DEPENDENCIES
+import time
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Tuple
+from loguru import logger
+from typing import Optional
+from config.enums import Domain
+from config.settings import settings
+from concurrent.futures import Executor
+from config.schemas import MetricResult
+from config.schemas import EnsembleResult
+from metrics.entropy import EntropyMetric
+from config.schemas import DetectionResult
+from concurrent.futures import as_completed
+from metrics.perplexity import PerplexityMetric
+from metrics.linguistic import LinguisticMetric
+from metrics.structural import StructuralMetric
+from concurrent.futures import ThreadPoolExecutor
+from config.schemas import LanguageDetectionResult
+from processors.text_processor import TextProcessor
+from processors.text_processor import ProcessedText
+from processors.domain_classifier import DomainClassifier
+from processors.domain_classifier import DomainPrediction
+from processors.language_detector import LanguageDetector
+from services.ensemble_classifier import EnsembleClassifier
+from metrics.semantic_analysis import SemanticAnalysisMetric
+from metrics.multi_perturbation_stability import MultiPerturbationStabilityMetric
+
+
+class DetectionOrchestrator:
+    """
+    Coordinates the entire detection pipeline from text input to final results
+
+    Pipeline:
+    1. Text preprocessing
+    2. Language detection (optional)
+    3. Domain classification
+    4. Metric execution (parallel/sequential)
+    5. Ensemble aggregation
+    6. Result generation
+    """
+    def __init__(self, enable_language_detection: bool = False, skip_expensive_metrics: bool = False, parallel_executor: Optional[Executor] = None, parallel_execution: bool = True):
+        """
+        Initialize detection orchestrator
+
+        Arguments:
+        ----------
+        enable_language_detection { bool } : Enable language detection step
+        skip_expensive_metrics { bool }    : Skip computationally expensive metrics
+        parallel_executor { Executor }     : Thread/process executor for parallel processing
+        parallel_execution { bool }        : Enable parallel metric execution
+        """
+        self.enable_language_detection = enable_language_detection
+        self.skip_expensive_metrics = skip_expensive_metrics
+        self.parallel_executor = parallel_executor
+        self.parallel_execution = parallel_execution
+
+        # Initialize processors
+        self.text_processor = TextProcessor()
+        self.domain_classifier = DomainClassifier()
+        self.language_detector = LanguageDetector(use_model = True) if self.enable_language_detection else None
+
+        # Initialize metrics
+        self.metrics = self._initialize_metrics()
+
+        # Initialize ensemble
+        self.ensemble = EnsembleClassifier(primary_method = "confidence_calibrated",
+                                           fallback_method = "domain_weighted",
+                                           min_metrics_required = 3,
+                                           )
+
+        logger.info(f"DetectionOrchestrator initialized (language_detection={enable_language_detection}, "
+                    f"skip_expensive={skip_expensive_metrics}, parallel={parallel_execution})")
+
+
+    def _initialize_metrics(self) -> Dict[str, Any]:
+        """
+        Initialize all enabled metrics
+        """
+        metrics = dict()
+
+        # Define metric initialization order (simpler metrics first)
+        metric_classes = [("structural", StructuralMetric),
+                          ("entropy", EntropyMetric),
+                          ("perplexity", PerplexityMetric),
+                          ("semantic_analysis", SemanticAnalysisMetric),
+                          ("linguistic", LinguisticMetric),
+                          ("multi_perturbation_stability", MultiPerturbationStabilityMetric),
+                          ]
+
+        for name, metric_class in metric_classes:
+            try:
+                metrics[name] = metric_class()
+                logger.debug(f"{name} metric initialized")
+
+            except Exception as e:
+                logger.error(f"Failed to initialize {name} metric: {repr(e)}")
+
+        logger.info(f"Initialized {len(metrics)} metrics: {list(metrics.keys())}")
+        return metrics
+
+
+    def initialize(self) -> bool:
+        """
+        Initialize all components (load models, etc.)
+
+        Returns:
+        --------
+        { bool } : True if successful, False otherwise
+        """
+        try:
+            logger.info("Initializing detection pipeline...")
+
+            # Initialize processors
+            self._initialize_processors()
+
+            # Initialize metrics
+            successful_metrics = self._initialize_metrics_components()
+
+            # Need at least 3 metrics for reliable detection
+            pipeline_ready = (successful_metrics >= 3)
+
+            if pipeline_ready:
+                logger.success(f"Detection pipeline initialized: {successful_metrics}/{len(self.metrics)} metrics ready")
+
+            else:
+                logger.warning(f"Pipeline may be unreliable: only {successful_metrics} metrics initialized (need at least 3)")
+
+            return pipeline_ready
+
+        except Exception as e:
+            logger.error(f"Failed to initialize detection pipeline: {repr(e)}")
+            return False
+
+
+    def _initialize_processors(self) -> None:
+        """
+        Initialize processor components
+        """
+        # Initialize domain classifier
+        if not self.domain_classifier.initialize():
+            logger.warning("Domain classifier initialization failed")
+
+        # Initialize language detector
+        if self.language_detector and not self.language_detector.initialize():
+            logger.warning("Language detector initialization failed")
+
+
+    def _initialize_metrics_components(self) -> int:
+        """
+        Initialize metric components and return count of successful initializations
+        """
+        successful_metrics = 0
+
+        for name, metric in self.metrics.items():
+            try:
+                if metric.initialize():
+                    successful_metrics += 1
+                    logger.debug(f"✓ {name} metric ready")
+
+                else:
+                    logger.warning(f"✗ {name} metric initialization failed")
+
+            except Exception as e:
+                logger.error(f"Error initializing {name} metric: {repr(e)}")
+
+        return successful_metrics
+
+
+    def analyze(self, text: str, domain: Optional[Domain] = None, **kwargs) -> DetectionResult:
+        """
+        Analyze text and detect whether it is synthetically generated
+
+        Arguments:
+        ----------
+        text { str }      : Input text to analyze
+
+        domain { Domain } : Override automatic domain detection
+
+        **kwargs          : Additional options
+
+        Returns:
+        --------
+        { DetectionResult } : DetectionResult with complete analysis
+        """
+        start_time = time.time()
+        warnings = list()
+        errors = list()
+
+        try:
+            # Step 1: Preprocess text
+            processed_text = self._preprocess_text(text = text,
+                                                   warnings = warnings,
+                                                   )
+
+            # Step 2: Detect language
+            language_result = self._detect_language(processed_text = processed_text,
+                                                    warnings = warnings,
+                                                    )
+
+            # Step 3: Classify domain
+            domain_prediction, domain = self._classify_domain(processed_text = processed_text,
+                                                              user_domain = domain,
+                                                              warnings = warnings,
+                                                              )
+
+            # Step 4: Execute metrics (parallel or sequential)
+            metric_results, metrics_execution_time = self._execute_metrics_parallel(processed_text = processed_text,
+                                                                                    domain = domain,
+                                                                                    warnings = warnings,
+                                                                                    errors = errors,
+                                                                                    **kwargs
+                                                                                    )
+
+            # Step 5: Ensemble aggregation
+            ensemble_result = self._aggregate_results(metric_results = metric_results,
+                                                      domain = domain,
+                                                      errors = errors,
+                                                      )
+
+            # Step 6: Compile final result
+            processing_time = time.time() - start_time
+
+            return self._compile_result(ensemble_result = ensemble_result,
+                                        processed_text = processed_text,
+                                        domain_prediction = domain_prediction,
+                                        language_result = language_result,
+                                        metric_results = metric_results,
+                                        processing_time = processing_time,
+                                        metrics_execution_time = metrics_execution_time,
+                                        warnings = warnings,
+                                        errors = errors,
+                                        **kwargs,
+                                        )
+
+        except Exception as e:
+            logger.error(f"Fatal error in detection pipeline: {repr(e)}")
+            return self._create_error_result(text, str(e), start_time)
+
+
+    def _preprocess_text(self, text: str, warnings: List[str]) -> ProcessedText:
+        """
+        Preprocess text
+        """
+        logger.info("Step 1: Preprocessing text...")
+        processed_text = self.text_processor.process(text = text)
+
+        if not processed_text.is_valid:
+            logger.warning(f"Text validation failed: {processed_text.validation_errors}")
+            warnings.extend(processed_text.validation_errors)
+
+        return processed_text
+
+
+    def _detect_language(self, processed_text: ProcessedText, warnings: List[str]) -> Optional[LanguageDetectionResult]:
+        """
+        Detect language
+        """
+        if not self.language_detector:
+            return None
+
+        logger.info("Step 2: Detecting language...")
+
+        try:
+            language_result = self.language_detector.detect(processed_text.cleaned_text)
+
+            # Add relevant warnings
+            if (language_result.primary_language.value != "en"):
+                warnings.append(f"Non-English text detected ({language_result.primary_language.value}). Detection accuracy may be reduced.")
+
+            if language_result.is_multilingual:
+                warnings.append("Multilingual content detected")
+
+            if (language_result.evidence_strength < 0.7):
+                warnings.append(f"Low language detection evidence strength ({language_result.evidence_strength:.2f})")
+
+            return language_result
+
+        except Exception as e:
+            logger.warning(f"Language detection failed: {repr(e)}")
+            warnings.append("Language detection failed")
+            return None
+
+
+    def _classify_domain(self, processed_text: ProcessedText, user_domain: Optional[Domain], warnings: List[str]) -> Tuple[DomainPrediction, Domain]:
+        """
+        Classify domain
+        """
+        logger.info("Step 3: Classifying domain...")
+
+        if user_domain is not None:
+            # Use provided domain
+            domain_prediction = DomainPrediction(primary_domain = user_domain,
+                                                 secondary_domain = None,
+                                                 evidence_strength = 1.0,
+                                                 domain_scores = {user_domain.value: 1.0},
+                                                 )
+            domain = user_domain
+
+        else:
+            # Automatically classify domain
+            try:
+                domain_prediction = self.domain_classifier.classify(processed_text.cleaned_text)
+                domain = domain_prediction.primary_domain
+
+                if (domain_prediction.evidence_strength < 0.5):
+                    warnings.append(f"Low domain classification evidence strength ({domain_prediction.evidence_strength:.2f})")
+
+            except Exception as e:
+                logger.warning(f"Domain classification failed: {repr(e)}")
+                domain_prediction = DomainPrediction(primary_domain = Domain.GENERAL,
+                                                     secondary_domain = None,
+                                                     evidence_strength = 0.5,
+                                                     domain_scores = {},
+                                                     )
+                domain = Domain.GENERAL
+                warnings.append("Domain classification failed, using GENERAL")
+
+        logger.info(f"Detected domain: {domain.value} (evidence strength: {domain_prediction.evidence_strength:.2f})")
+        return domain_prediction, domain
+
+
+    def _execute_metrics_parallel(self, processed_text: ProcessedText, domain: Domain, warnings: List[str], errors: List[str], **kwargs) -> Tuple[Dict[str, MetricResult], Dict[str, float]]:
+        """
+        Execute metric calculations in parallel with fallback to sequential
+
+        Returns:
+        --------
+        Tuple[Dict[str, MetricResult], Dict[str, float]] : Metric results and execution times
+        """
+        logger.info("Step 4: Executing detection metric calculations...")
+
+        # Check if we should use parallel execution
+        use_parallel = self.parallel_execution and self.parallel_executor is not None
+
+        if use_parallel:
+            logger.info("Using parallel execution for metrics")
+            try:
+                return self._execute_metrics_parallel_impl(processed_text = processed_text,
+                                                           domain = domain,
+                                                           warnings = warnings,
+                                                           errors = errors,
+                                                           **kwargs
+                                                           )
+
+            except Exception as e:
+                logger.warning(f"Parallel execution failed, falling back to sequential: {repr(e)}")
+                warnings.append(f"Parallel execution failed: {str(e)[:100]}")
+
+                return self._execute_metrics_sequential(processed_text = processed_text,
+                                                        domain = domain,
+                                                        warnings = warnings,
+                                                        errors = errors,
+                                                        **kwargs
+                                                        )
+
+        else:
+            logger.info("Using sequential execution for metrics")
+            return self._execute_metrics_sequential(processed_text = processed_text,
+                                                    domain = domain,
+                                                    warnings = warnings,
+                                                    errors = errors,
+                                                    **kwargs
+                                                    )
+
+
+    def _execute_metrics_parallel_impl(self, processed_text: ProcessedText, domain: Domain, warnings: List[str], errors: List[str], **kwargs) -> Tuple[Dict[str, MetricResult], Dict[str, float]]:
+        """
+        Execute metrics in parallel using thread pool
+        """
+        metric_results = dict()
+        metrics_execution_time = dict()
+        futures = dict()
+
+        # Submit all metric computations to thread pool
+        for name, metric in self.metrics.items():
+            # Skip expensive metrics if configured
+            if (self.skip_expensive_metrics and (name == "multi_perturbation_stability")):
+                logger.info(f"Skipping expensive metric: {name}")
+                continue
+
+            # Submit task to thread pool
+            future = self.parallel_executor.submit(self._compute_metric_wrapper,
+                                                   name = name,
+                                                   metric = metric,
+                                                   text = processed_text.cleaned_text,
+                                                   domain = domain,
+                                                   skip_expensive = self.skip_expensive_metrics,
+                                                   warnings = warnings,
+                                                   errors = errors
+                                                   )
+            futures[future] = name
+
+        # Collect results as they complete
+        completed_count = 0
+        total_metrics = len(futures)
+
+        for future in as_completed(futures):
+            name = futures[future]
+            completed_count += 1
+
+            try:
+                result, execution_time, metric_warnings = future.result(timeout = 300)   # 5 minute timeout
+
+                if result:
+                    metric_results[name] = result
+                    metrics_execution_time[name] = execution_time
+
+                    if result.error:
+                        warnings.append(f"{name} metric error: {result.error}")
+
+                    if metric_warnings:
+                        warnings.extend(metric_warnings)
+
+                    logger.debug(f"Parallel metric completed: {name} ({execution_time:.2f}s) - {completed_count}/{total_metrics}")
+
+            except Exception as e:
+                logger.error(f"Error computing metric {name} in parallel: {repr(e)}")
+                errors.append(f"{name}: {repr(e)}")
+
+                # Create error result
+                metric_results[name] = MetricResult(metric_name = name,
+                                                    synthetic_probability = 0.5,
+                                                    authentic_probability = 0.5,
+                                                    hybrid_probability = 0.0,
+                                                    confidence = 0.0,
+                                                    error = repr(e),
+                                                    )
+
+                metrics_execution_time[name] = 0.0
+
+        logger.info(f"Parallel execution completed: {len(metric_results)}/{len(self.metrics)} metrics successful")
+        return metric_results, metrics_execution_time
+
+
+    def _compute_metric_wrapper(self, name: str, metric: Any, text: str, domain: Domain, skip_expensive: bool, warnings: List[str], errors: List[str]) -> Tuple[Optional[MetricResult], float, List[str]]:
+        """
+        Wrapper function for parallel metric computation
+        """
+        metric_start = time.time()
+        metric_warnings = list()
+
+        try:
+            logger.debug(f"Computing metric in parallel: {name}")
+
+            result = metric.compute(text = text,
+                                    domain = domain,
+                                    skip_expensive = skip_expensive,
+                                    )
+
+            execution_time = time.time() - metric_start
+
+            return result, execution_time, metric_warnings
+
+        except Exception as e:
+            logger.error(f"Error computing metric {name} in wrapper: {repr(e)}")
+            execution_time = time.time() - metric_start
+
+            # Create error result
+            error_result = MetricResult(metric_name = name,
+                                        synthetic_probability = 0.5,
+                                        authentic_probability = 0.5,
+                                        hybrid_probability = 0.0,
+                                        confidence = 0.0,
+                                        error = repr(e),
+                                        )
+
+            return error_result, execution_time, metric_warnings
+
+
+    def _execute_metrics_sequential(self, processed_text: ProcessedText, domain: Domain, warnings: List[str], errors: List[str], **kwargs) -> Tuple[Dict[str, MetricResult], Dict[str, float]]:
+        """
+        Execute metric calculations sequentially (fallback method)
+        """
+        metric_results = dict()
+        metrics_execution_time = dict()
+
+        for name, metric in self.metrics.items():
+            metric_start = time.time()
+
+            try:
+                # Skip expensive metrics if configured
+                if (self.skip_expensive_metrics and (name == "multi_perturbation_stability")):
+                    logger.info(f"Skipping expensive metric: {name}")
+                    continue
+
+                logger.debug(f"Computing metric sequentially: {name}")
+
+                result = metric.compute(text = processed_text.cleaned_text,
+                                        domain = domain,
+                                        skip_expensive = self.skip_expensive_metrics,
+                                        )
+
+                metric_results[name] = result
+
+                if result.error:
+                    warnings.append(f"{name} metric error: {result.error}")
+
+            except Exception as e:
+                logger.error(f"Error computing metric {name}: {repr(e)}")
+                errors.append(f"{name}: {repr(e)}")
+
+                # Create error result
+                metric_results[name] = MetricResult(metric_name = name,
+                                                    synthetic_probability = 0.5,
+                                                    authentic_probability = 0.5,
+                                                    hybrid_probability = 0.0,
+                                                    confidence = 0.0,
+                                                    error = repr(e),
+                                                    )
+
+            finally:
+                metrics_execution_time[name] = time.time() - metric_start
+
+        logger.info(f"Sequential execution completed: {len(metric_results)} metrics computed")
+        return metric_results, metrics_execution_time
+
+
+    def _aggregate_results(self, metric_results: Dict[str, MetricResult], domain: Domain, errors: List[str]) -> EnsembleResult:
+        """
+        Ensemble aggregation
+        """
+        logger.info("Step 5: Aggregating results with ensemble...")
+
+        try:
+            ensemble_result = self.ensemble.predict(metric_results = metric_results,
+                                                    domain = domain,
+                                                    )
+
+            logger.success(f"Ensemble result: {ensemble_result.final_verdict} (synthetic probability: {ensemble_result.synthetic_probability:.1%}, confidence: {ensemble_result.overall_confidence:.2f})")
+
+            return ensemble_result
+
+        except Exception as e:
+            logger.error(f"Ensemble prediction failed: {repr(e)}")
+            errors.append(f"Ensemble: {repr(e)}")
+
+            # Create fallback result
+            return EnsembleResult(final_verdict = "Uncertain",
+                                  synthetic_probability = 0.5,
+                                  authentic_probability = 0.5,
+                                  hybrid_probability = 0.0,
+                                  overall_confidence = 0.0,
+                                  domain = domain,
+                                  metric_results = metric_results,
+                                  metric_weights = {},
+                                  weighted_scores = {},
+                                  reasoning = ["Ensemble aggregation failed"],
+                                  uncertainty_score = 1.0,
+                                  consensus_level = 0.0,
+                                  )
+
+
+    def _compile_result(self, ensemble_result: EnsembleResult, processed_text: ProcessedText, domain_prediction: DomainPrediction, language_result: Optional[LanguageDetectionResult],
+                        metric_results: Dict[str, MetricResult], processing_time: float, metrics_execution_time: Dict[str, float], warnings: List[str], errors: List[str], **kwargs) -> DetectionResult:
+        """
+        Compile final detection result
+        """
+        logger.info("Step 6: Compiling final detection result...")
+
+        # Include file info if provided
+        file_info = kwargs.get('file_info')
+
+        # Add parallel execution info
+        execution_mode = "parallel" if (self.parallel_execution and self.parallel_executor) else "sequential"
+
+        return DetectionResult(ensemble_result = ensemble_result,
+                               processed_text = processed_text,
+                               domain_prediction = domain_prediction,
+                               language_result = language_result,
+                               metric_results = metric_results,
+                               processing_time = processing_time,
+                               metrics_execution_time = metrics_execution_time,
+                               warnings = warnings,
+                               errors = errors,
+                               file_info = file_info,
+                               execution_mode = execution_mode,
+                               )
+
+
+    def _create_error_result(self, text: str, error_message: str, start_time: float) -> DetectionResult:
+        """
+        Create error result when pipeline fails
+        """
+        processing_time = time.time() - start_time
+
+        return DetectionResult(ensemble_result = EnsembleResult(final_verdict = "Uncertain",
+                                                                synthetic_probability = 0.5,
+                                                                authentic_probability = 0.5,
+                                                                hybrid_probability = 0.0,
+                                                                overall_confidence = 0.0,
+                                                                domain = Domain.GENERAL,
+                                                                metric_results = {},
+                                                                metric_weights = {},
+                                                                weighted_scores = {},
+                                                                reasoning = [f"Fatal error: {error_message}"],
+                                                                uncertainty_score = 1.0,
+                                                                consensus_level = 0.0,
+                                                                ),
+                               processed_text = ProcessedText(original_text = text,
+                                                              cleaned_text = "",
+                                                              sentences = [],
+                                                              words = [],
+                                                              paragraphs = [],
+                                                              char_count = 0,
+                                                              word_count = 0,
+                                                              sentence_count = 0,
+                                                              paragraph_count = 0,
+                                                              avg_sentence_length = 0.0,
+                                                              avg_word_length = 0.0,
+                                                              is_valid = False,
+                                                              validation_errors = ["Processing failed"],
+                                                              metadata = {},
+                                                              ),
+                               domain_prediction = DomainPrediction(primary_domain = Domain.GENERAL,
+                                                                    secondary_domain = None,
+                                                                    evidence_strength = 0.0,
+                                                                    domain_scores = {},
+                                                                    ),
+                               language_result = None,
+                               metric_results = {},
+                               processing_time = processing_time,
+                               metrics_execution_time = {},
+                               warnings = [],
+                               errors = [f"Fatal error: {error_message}"],
+                               file_info = None,
+                               execution_mode = "error",
+                               )
+
+
+    def batch_analyze(self, texts: List[str], domain: Optional[Domain] = None) -> List[DetectionResult]:
+        """
+        Analyze multiple texts
+
+        Arguments:
+        ----------
+        texts { list }    : List of texts to analyze
+
+        domain { Domain } : Override automatic domain detection
+
+        Returns:
+        --------
+        { list } : List of DetectionResult objects
+        """
+        logger.info(f"Batch analyzing {len(texts)} texts...")
+
+        results = list()
+
+        for i, text in enumerate(texts):
+            logger.info(f"Analyzing text {i+1}/{len(texts)}...")
+            try:
+                result = self.analyze(text = text,
+                                      domain = domain,
+                                      )
+
+                results.append(result)
+
+            except Exception as e:
+                logger.error(f"Error analyzing text {i+1}: {repr(e)}")
+                # Create error result for this text
+                results.append(self._create_error_result(text, str(e), time.time()))
+
+        successful = sum(1 for r in results if r.ensemble_result.final_verdict != "Uncertain")
+        logger.info(f"Batch analysis complete: {successful}/{len(texts)} processed successfully")
+
+        return results
+
+
+    def cleanup(self):
+        """
+        Clean up resources
+        """
+        logger.info("Cleaning up detection orchestrator...")
+
+        # Clean up metrics
+        self._cleanup_metrics()
+
+        # Clean up processors
+        self._cleanup_processors()
+
+        # Clean up parallel executor if we own it
+        if hasattr(self, '_own_executor') and self._own_executor:
+            try:
+                self.parallel_executor.shutdown(wait=True)
+                logger.debug("Cleaned up parallel executor")
+            except Exception as e:
+                logger.warning(f"Error cleaning up parallel executor: {repr(e)}")
+
+        logger.info("Cleanup complete")
+
+
+    def _cleanup_metrics(self) -> None:
+        """
+        Clean up metric resources
+        """
+        for name, metric in self.metrics.items():
+            try:
+                metric.cleanup()
+                logger.debug(f"Cleaned up metric: {name}")
+
+            except Exception as e:
+                logger.warning(f"Error cleaning up metric {name}: {repr(e)}")
+
+
+    def _cleanup_processors(self) -> None:
+        """
+        Clean up processor resources
+        """
+        if self.domain_classifier:
+            try:
+                self.domain_classifier.cleanup()
+                logger.debug("Cleaned up domain classifier")
+
+            except Exception as e:
+                logger.warning(f"Error cleaning up domain classifier: {repr(e)}")
+
+        if self.language_detector:
+            try:
+                self.language_detector.cleanup()
+                logger.debug("Cleaned up language detector")
+
+            except Exception as e:
+                logger.warning(f"Error cleaning up language detector: {repr(e)}")
+
+
+    @classmethod
+    def create_with_executor(cls, max_workers: int = 4, **kwargs):
+        """
+        Factory method to create orchestrator with its own executor
+
+        Arguments:
+        ----------
+        max_workers { int } : Maximum number of parallel workers
+
+        **kwargs            : Additional arguments for DetectionOrchestrator
+
+        Returns:
+        --------
+        { DetectionOrchestrator } : Orchestrator with thread pool executor
+        """
+        executor = ThreadPoolExecutor(max_workers = max_workers)
+        orchestrator = cls(parallel_executor = executor, **kwargs)
+        orchestrator._own_executor = True
+
+        return orchestrator
+
+
+# Export
+__all__ = ["DetectionOrchestrator"]
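A minimal usage sketch for the orchestrator added above, using only the public surface defined in this file (`create_with_executor`, `initialize`, `analyze`, `cleanup`); the sample text and worker count are illustrative:

```python
# Sketch: drive the DetectionOrchestrator end to end.
# Assumes the project's config/, metrics/, processors/, and services/ packages
# are importable; the sample text and max_workers value are illustrative.
from services.orchestrator import DetectionOrchestrator

orchestrator = DetectionOrchestrator.create_with_executor(max_workers = 4,
                                                          skip_expensive_metrics = True)
try:
    if orchestrator.initialize():
        result = orchestrator.analyze("Paste the text to be checked here.")
        print(result.ensemble_result.final_verdict,
              f"synthetic={result.ensemble_result.synthetic_probability:.1%}",
              f"confidence={result.ensemble_result.overall_confidence:.2f}")
finally:
    orchestrator.cleanup()
```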
{reporter → services}/reasoning_generator.py
RENAMED
@@ -1,58 +1,17 @@
 # DEPENDENCIES
-import numpy as np
 from typing import Any
 from typing import Dict
 from typing import List
 from typing import Optional
-from
-from
-from config.
-from
-from detector.ensemble import EnsembleResult
-from detector.attribution import AttributionResult
-
-@dataclass
-class DetailedReasoning:
-    """
-    Comprehensive reasoning for detection result with ensemble integration
-    """
-    summary                : str
-    key_indicators         : List[str]
-    metric_explanations    : Dict[str, str]
-    supporting_evidence    : List[str]
-    contradicting_evidence : List[str]
-    confidence_explanation : str
-    domain_analysis        : str
-    ensemble_analysis      : str
-    attribution_reasoning  : Optional[str]
-    recommendations        : List[str]
-    uncertainty_analysis   : str
-
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Convert to dictionary
-        """
-        return {"summary"                : self.summary,
-                "key_indicators"         : self.key_indicators,
-                "metric_explanations"    : self.metric_explanations,
-                "supporting_evidence"    : self.supporting_evidence,
-                "contradicting_evidence" : self.contradicting_evidence,
-                "confidence_explanation" : self.confidence_explanation,
-                "domain_analysis"        : self.domain_analysis,
-                "ensemble_analysis"      : self.ensemble_analysis,
-                "attribution_reasoning"  : self.attribution_reasoning,
-                "recommendations"        : self.recommendations,
-                "uncertainty_analysis"   : self.uncertainty_analysis,
-                }
-

 class ReasoningGenerator:
     """
-    Generates detailed, human-readable reasoning for

     Features:
     - Ensemble method explanation
@@ -62,43 +21,42 @@ class ReasoningGenerator:
     - Actionable recommendations
     """
     # Metric descriptions
-    METRIC_DESCRIPTIONS

     # Ensemble method descriptions
-    ENSEMBLE_METHODS
-    }

-    #

-    #


     def __init__(self):
@@ -108,85 +66,97 @@ class ReasoningGenerator:
         pass


-    def generate(self, ensemble_result: EnsembleResult, metric_results: Dict[str, MetricResult], domain: Domain,
-                 text_length: int = 0, ensemble_method: str = "confidence_calibrated") -> DetailedReasoning:
         """
         Generate comprehensive reasoning for detection result with ensemble integration

         Arguments:
         ----------
-        ensemble_result

-        metric_results

-        domain : Detected text domain for context-aware analysis

-        text_length

-        ensemble_method

         Returns:
         --------
-
         """
         # Generate summary with ensemble context
-        summary = self._generate_ensemble_summary(ensemble_result

         # Identify key indicators with metric weights
-        key_indicators = self._identify_weighted_indicators(ensemble_result,

         # Generate metric explanations with confidence
-        metric_explanations = self._generate_metric_explanations(metric_results,

         # Compile evidence with ensemble consensus
-        supporting_evidence, contradicting_evidence = self._compile_ensemble_evidence(ensemble_result,

         # Explain confidence with uncertainty
-        confidence_explanation = self._explain_confidence_with_uncertainty(ensemble_result,

         # Domain-specific analysis
-        domain_analysis = self._generate_domain_analysis(domain

         # Ensemble methodology explanation
-        ensemble_analysis = self._explain_ensemble_methodology(ensemble_result,
-
-        attribution_reasoning = None
-
-        if attribution_result:
-            attribution_reasoning = self._generate_attribution_reasoning(attribution_result)

         # Uncertainty analysis
-        uncertainty_analysis

         # Generate recommendations
-        recommendations


     def _generate_ensemble_summary(self, ensemble_result: EnsembleResult, domain: Domain, text_length: int, ensemble_method: str) -> str:
         """
         Generate executive summary with ensemble context
         """
-        verdict

         # Confidence level description
         if (confidence >= 0.8):
@@ -211,34 +181,34 @@ class ReasoningGenerator:
         else:
             consensus_desc = "low consensus"

-        # Build summary based on verdict
         summary_parts = list()

-        if ("
         summary_parts.append(f"Ensemble analysis indicates with {conf_desc} ({confidence:.1%}) that this text is "
-                             f"**likely

-        elif ("Human-Written" in verdict):
-            human_prob = ensemble_result.human_probability
         summary_parts.append(f"Ensemble analysis indicates with {conf_desc} ({confidence:.1%}) that this text is "
-                             f"**likely

-        elif( "Mixed" in verdict):
-            mixed_prob = ensemble_result.mixed_probability
         summary_parts.append(f"Ensemble analysis indicates with {conf_desc} ({confidence:.1%}) that this text "
-                             f"**contains mixed
-                             )

-        else:
         summary_parts.append(f"Ensemble analysis is **inconclusive** (confidence: {confidence:.1%}).")

         # Add ensemble context
         summary_parts.append(f"Metrics show {consensus_desc} among detection methods. Uncertainty level: {uncertainty:.1%}.")

         # Add domain and length context
-

         return " ".join(summary_parts)

@@ -248,26 +218,28 @@ class ReasoningGenerator:
         Identify top indicators considering metric weights and contributions
         """
         indicators = list()
-

         # Use ensemble weights to prioritize indicators
-        weighted_metrics =
-
         for name, result in metric_results.items():
             if result.error:
                 continue
             weight = ensemble_result.metric_weights.get(name, 0.0)
             confidence = result.confidence
             # Combine weight and confidence for prioritization
             priority_score = weight * confidence
-
             weighted_metrics.append((name, result, priority_score))

         # Sort by priority score
         weighted_metrics.sort(key = lambda x: x[2], reverse = True)
-
-        for name, result, priority_score in weighted_metrics[:5]:
-

             if key_feature:
                 weight_pct = ensemble_result.metric_weights.get(name, 0.0) * 100
@@ -276,7 +248,7 @@ class ReasoningGenerator:
         return indicators


-    def _extract_ensemble_feature(self, metric_name: str, result: MetricResult,
         """
         Extract significant features considering ensemble context
         """
@@ -286,66 +258,58 @@ class ReasoningGenerator:
             burstiness = details.get("burstiness_score", 0.5)
             uniformity = details.get("length_uniformity", 0.5)

-            if (
-                return f"Low burstiness ({burstiness:.2f}) suggests uniform

-            elif (not
                 return f"High burstiness ({burstiness:.2f}) indicates natural variation"
-
-            elif (
-                return f"High structural uniformity ({uniformity:.2f}) typical of
-
         elif (metric_name == "perplexity"):
             perplexity = details.get("overall_perplexity", 50)

-            if (
                 return f"Low perplexity ({perplexity:.1f}) indicates high predictability"
-
-            elif (not
                 return f"High perplexity ({perplexity:.1f}) suggests human creativity"
-
         elif (metric_name == "entropy"):
-            token_diversity
-            sequence_entropy = details.get("sequence_entropy", 0.5)

-            if (
-                return f"Low token diversity ({token_diversity:.2f}) suggests
-
-            elif (not
                 return f"High token diversity ({token_diversity:.2f}) indicates human variety"
-
         elif (metric_name == "semantic_analysis"):
-            coherence
-            consistency = details.get("consistency_score", 0.5)

-            if (
-                return f"Unnaturally high coherence ({coherence:.2f}) typical of
-
-            elif (not
                 return f"Natural coherence variation ({coherence:.2f})"
-
         elif (metric_name == "linguistic"):
-            pos_diversity
-            syntactic_complexity = details.get("syntactic_complexity", 2.5)

-            if (
                 return f"Limited grammatical diversity ({pos_diversity:.2f})"
-
-            elif (not
                 return f"Rich grammatical variety ({pos_diversity:.2f})"

         elif (metric_name == "multi_perturbation_stability"):
             stability = details.get("stability_score", 0.5)
-            curvature = details.get("curvature_score", 0.5)

-            if (
-                return f"High perturbation
-
-            elif (not
                 return f"Text stability under perturbation ({stability:.2f})"

         return None
@@ -363,30 +327,27 @@ class ReasoningGenerator:
                 continue

             # Get metric description
-            desc

             # Get weight information
             weight = metric_weights.get(name, 0.0)
             weight_info = f" (ensemble weight: {weight:.1%})" if weight > 0 else " (low weight in ensemble)"

-            # Determine verdict
-            if (result.
-                verdict = "suggests
-                prob = result.

-            elif (result.
-                verdict = "indicates
-                prob = result.

             else:
                 verdict = "shows mixed signals"
-                prob = max(result.

             # Build explanation with confidence
-            explanation
-                           f"Result: {verdict} ({prob:.1%} probability) "
-                           f"with {result.confidence:.1%} confidence."
-                           )

             explanations[name] = explanation
@@ -397,32 +358,32 @@ class ReasoningGenerator:
         """
         Compile evidence considering ensemble consensus and weights
         """
-
-        consensus

-        supporting
-        contradicting

         for name, result in metric_results.items():
             if result.error:
                 continue

-            weight
-
             # Weight the evidence by metric importance
-            weight_indicator

-            if (
                 # Supporting evidence
-                indicator = self.

                 if indicator:
                     supporting.append(f"{weight_indicator} {indicator}")

             else:
                 # Contradicting evidence
-                indicator = self.

                 if indicator:
                     contradicting.append(f"{weight_indicator} {indicator}")
@@ -430,56 +391,56 @@ class ReasoningGenerator:
         # Add consensus context
         if (consensus > 0.7):
             supporting.insert(0, "✅ Strong metric consensus supports this conclusion")
-
         elif (consensus < 0.4):
             contradicting.insert(0, "⚠️ Low metric consensus indicates uncertainty")

         return supporting, contradicting


-    def
         """
-        Get
         """
         details = result.details

         if (metric_name == "structural"):
             if (details.get("burstiness_score", 1.0) < 0.4):
-                return self.
-
         elif (metric_name == "perplexity"):
             if (details.get("overall_perplexity", 100) < 35):
-                return self.
-
         elif (metric_name == "entropy"):
             if (details.get("token_diversity", 1.0) < 0.65):
-                return self.
-
         elif (metric_name == "semantic_analysis"):
             if (details.get("coherence_score", 0.5) > 0.75):
-                return self.
-
         return None


-    def
         """
-        Get
         """
         details = result.details

         if (metric_name == "structural"):
             if (details.get("burstiness_score", 0.0) > 0.6):
-                return self.
-
         elif (metric_name == "perplexity"):
             if (details.get("overall_perplexity", 0) > 55):
-                return self.
-
         elif (metric_name == "entropy"):
             if (details.get("token_diversity", 0.0) > 0.75):
-                return self.
-
         return None
@@ -499,10 +460,10 @@ class ReasoningGenerator:
         if (confidence >= 0.8):
             explanation += "High confidence due to: strong metric agreement, clear patterns, and reliable signal across multiple detection methods."
-
         elif (confidence >= 0.6):
             explanation += "Good confidence supported by: general metric agreement and consistent detection patterns."
-
         else:
             explanation += "Lower confidence reflects: metric disagreement, ambiguous patterns, or borderline characteristics."
@@ -528,9 +489,10 @@ class ReasoningGenerator:
         # Add domain-specific threshold context
         threshold_info = {Domain.ACADEMIC : "Higher detection thresholds applied for academic rigor",
-                          Domain.TECHNICAL_DOC : "Elevated thresholds for technical precision requirements",
                           Domain.CREATIVE : "Balanced thresholds accounting for creative expression",
                           Domain.SOCIAL_MEDIA : "Adapted thresholds for informal communication patterns",
                           }

         threshold_note = threshold_info.get(domain, "Standard detection thresholds applied")
@@ -546,8 +508,8 @@ class ReasoningGenerator:
         explanation = f"**Ensemble Methodology**: {method_desc}\n\n"

-        # Explain key top
-        top_metrics = sorted(ensemble_result.metric_weights.items(), key

         if top_metrics:
             explanation += "**Top contributing metrics**:\n"
@@ -556,11 +518,14 @@ class ReasoningGenerator:
         # Add reasoning snippets if available
         if hasattr(ensemble_result, 'reasoning') and ensemble_result.reasoning:
             key_reasons = [r for r in ensemble_result.reasoning if not r.startswith('##')][:2]
             if key_reasons:
                 explanation += "\n**Key ensemble factors**:\n"
                 for reason in key_reasons:
-

         return explanation
@@ -573,7 +538,7 @@ class ReasoningGenerator:
         if (uncertainty < 0.3):
             return "**Low Uncertainty**: Clear detection signals with strong metric agreement. Results are highly reliable."
-
         elif (uncertainty < 0.6):
             return "**Moderate Uncertainty**: Some metric disagreement or borderline characteristics. Consider additional context."
@@ -581,40 +546,6 @@ class ReasoningGenerator:
             return "**High Uncertainty**: Significant metric disagreement or ambiguous patterns. Results should be interpreted with caution and additional verification may be needed."


-    def _generate_attribution_reasoning(self, attribution_result: AttributionResult) -> str:
-        """
-        Generate reasoning for model attribution
-        """
-        model = attribution_result.predicted_model
-        confidence = attribution_result.confidence
-
-        if ((model == AIModel.UNKNOWN) or (confidence < 0.3)):
-            return "**Model Attribution**: Uncertain. Text patterns don't strongly match known AI model fingerprints."
-
-        model_name = model.value.replace("-", " ").replace("_", " ").title()
-
-        reasoning = f"**Attributed Model**: {model_name} (confidence: {confidence:.1%})\n\n"
-
-        # Model characteristics
-        model_chars = {AIModel.GPT_3_5: "Characteristic patterns: frequent transitions, consistent structure, balanced explanations.",
-                       AIModel.GPT_4: "Advanced patterns: sophisticated vocabulary, nuanced analysis, well-structured arguments.",
-                       AIModel.CLAUDE_3_OPUS: "Distinctive style: thoughtful analysis, balanced perspectives, explanatory depth.",
-                       AIModel.GEMINI_PRO: "Typical patterns: conversational tone, clear explanations, exploratory language.",
-                       AIModel.LLAMA_3: "Common traits: direct explanations, structured responses, consistent formatting.",
-                       }
-
-        reasoning += model_chars.get(model, "Shows characteristic AI writing patterns.")
-
-        # Add fingerprint matches if available
-        if attribution_result.fingerprint_matches:
-            reasoning += "\n\n**Top fingerprint matches**:"
-
-            for model_name, score in list(attribution_result.fingerprint_matches.items())[:3]:
-                reasoning += f"\n• {model_name}: {score}% match"
-
-        return reasoning
-
-
     def _generate_ensemble_recommendations(self, ensemble_result: EnsembleResult, metric_results: Dict[str, MetricResult], domain: Domain) -> List[str]:
         """
         Generate actionable recommendations based on ensemble results
@@ -625,26 +556,25 @@ class ReasoningGenerator:
         uncertainty = ensemble_result.uncertainty_score

         # Base recommendations by verdict and confidence
-        if ("
             if (confidence >= 0.8):
-
-            else:
-                rec = "**Likely AI involvement**: Recommend discussion about AI tool usage and verification of understanding."
-
-            recommendations.append(rec)

             if (confidence >= 0.8):
-

             else:
-

         # Uncertainty-based recommendations
         if (uncertainty > 0.6):
@@ -652,9 +582,10 @@ class ReasoningGenerator:
         # Domain-specific recommendations
         domain_recs = {Domain.ACADEMIC : "For academic work: verify subject mastery through targeted questions or practical application.",
-                       Domain.CREATIVE : "For creative work: assess originality, personal voice, and creative process documentation.",
                        Domain.TECHNICAL_DOC : "For technical content: verify practical expertise and problem-solving ability.",
-

         if domain in domain_recs:
             recommendations.append(domain_recs[domain])
@@ -670,6 +601,4 @@ class ReasoningGenerator:
 # Export
-__all__ = ["
-           "ReasoningGenerator",
-           ]
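The hunks above preserve the weight-times-confidence prioritization inside `_identify_weighted_indicators`. A standalone sketch of that pattern, with hypothetical metric names, weights, and confidences:

```python
# Sketch of the weight × confidence prioritization kept by this rename;
# the metric names, weights, and confidences below are hypothetical.
metric_weights = {"perplexity": 0.30, "entropy": 0.25, "structural": 0.20}
metric_confidence = {"perplexity": 0.90, "entropy": 0.60, "structural": 0.75}

ranked = sorted(((name, metric_weights.get(name, 0.0) * conf)
                 for name, conf in metric_confidence.items()),
                key = lambda item: item[1],
                reverse = True)

# Top indicators come from the highest-priority metrics
for name, priority_score in ranked[:5]:
    print(f"{name}: priority {priority_score:.2f}")
```

The full right-hand listing of the renamed file follows.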
   1     # DEPENDENCIES
   2     from typing import Any
   3     from typing import Dict
   4     from typing import List
   5     from typing import Optional
   6 +   from config.enums import Domain
   7 +   from config.schemas import MetricResult
   8 +   from config.schemas import EnsembleResult
   9 +   from config.schemas import DetailedReasoningResult
  10
  11
  12     class ReasoningGenerator:
  13         """
  14 +       Generates detailed, human-readable reasoning for Synthetic detection results with ensemble and domain-aware integration
  15
  16         Features:
  17         - Ensemble method explanation
  21         - Actionable recommendations
  22         """
  23         # Metric descriptions
  24 +       METRIC_DESCRIPTIONS = {"structural"                   : "analyzes sentence structure, length patterns, and statistical features",
  25 +                              "perplexity"                   : "measures text predictability using language model cross-entropy",
  26 +                              "entropy"                      : "evaluates token diversity and sequence unpredictability",
  27 +                              "semantic_analysis"            : "examines semantic coherence, topic consistency, and logical flow",
  28 +                              "linguistic"                   : "assesses grammatical patterns, syntactic complexity, and style markers",
  29 +                              "multi_perturbation_stability" : "tests text stability under perturbation using curvature analysis",
  30 +                              }
  31
  32         # Ensemble method descriptions
  33 +       ENSEMBLE_METHODS = {"confidence_calibrated" : "confidence-weighted aggregation with domain calibration",
  34 +                           "consensus_based"       : "rewarding metric agreement and consensus",
  35 +                           "domain_weighted"       : "domain-aware static weighting of metrics",
  36 +                           "simple_average"        : "equal weighting of all metrics",
  37 +                           }
  38
  39 +       # Synthetic indicators aligned with current metric outputs
  40 +       SYNTHETIC_INDICATORS = {"low_perplexity"          : "Text shows high predictability to language models",
  41 +                               "low_entropy"             : "Limited vocabulary diversity and repetitive patterns",
  42 +                               "structural_uniformity"   : "Consistent sentence lengths and structural patterns",
  43 +                               "semantic_perfection"     : "Unnaturally perfect coherence and logical flow",
  44 +                               "linguistic_consistency"  : "Overly consistent grammatical patterns and style",
  45 +                               "perturbation_instability": "Text changes significantly under minor modifications",
  46 +                               "low_burstiness"          : "Lacks natural variation in writing intensity",
  47 +                               "transition_overuse"      : "Excessive use of transitional phrases and connectors",
  48 +                               }
  49
  50 +       # Authentic indicators
  51 +       AUTHENTIC_INDICATORS = {"high_perplexity"       : "Creative, unpredictable word choices and phrasing",
  52 +                               "high_entropy"          : "Rich vocabulary diversity and varied expressions",
  53 +                               "structural_variation"  : "Natural variation in sentence lengths and structures",
  54 +                               "semantic_naturalness"  : "Authentic, occasionally imperfect logical flow",
  55 +                               "linguistic_diversity"  : "Varied grammatical constructions and personal style",
  56 +                               "perturbation_stability": "Text remains consistent under minor modifications",
  57 +                               "high_burstiness"       : "Natural variation in writing intensity and focus",
  58 +                               "personal_voice"        : "Distinctive personal expressions and idioms",
  59 +                               }
  60
  61
  62         def __init__(self):
  66             pass
  67
  68
  69 +       def generate(self, ensemble_result: EnsembleResult, metric_results: Dict[str, MetricResult], domain: Domain, text_length: int = 0, ensemble_method: str = "confidence_calibrated") -> DetailedReasoningResult:
  70             """
  71             Generate comprehensive reasoning for detection result with ensemble integration
  72
  73             Arguments:
  74             ----------
  75 +           ensemble_result { EnsembleResult } : Final ensemble prediction with weights and reasoning
  76
  77 +           metric_results  { dict }           : Individual metric results from all metrics
  78
  79 +           domain          { Domain }         : Detected text domain for context-aware analysis
  80
  81 +           text_length     { int }            : Length of analyzed text in words
  82
  83 +           ensemble_method { str }            : Method used for ensemble aggregation
  84
  85             Returns:
  86             --------
  87 +           { DetailedReasoningResult } : DetailedReasoningResult object with ensemble-aware analysis
  88             """
  89             # Generate summary with ensemble context
  90 +           summary = self._generate_ensemble_summary(ensemble_result = ensemble_result,
  91 +                                                     domain          = domain,
  92 +                                                     text_length     = text_length,
  93 +                                                     ensemble_method = ensemble_method,
  94 +                                                     )
  95
  96             # Identify key indicators with metric weights
  97 +           key_indicators = self._identify_weighted_indicators(ensemble_result = ensemble_result,
  98 +                                                               metric_results  = metric_results,
  99 +                                                               )
 100
 101             # Generate metric explanations with confidence
 102 +           metric_explanations = self._generate_metric_explanations(metric_results = metric_results,
 103 +                                                                    metric_weights = ensemble_result.metric_weights,
 104 +                                                                    )
 105
 106             # Compile evidence with ensemble consensus
 107 +           supporting_evidence, contradicting_evidence = self._compile_ensemble_evidence(ensemble_result = ensemble_result,
 108 +                                                                                          metric_results  = metric_results,
 109 +                                                                                          )
 110
 111             # Explain confidence with uncertainty
 112 +           confidence_explanation = self._explain_confidence_with_uncertainty(ensemble_result = ensemble_result,
 113 +                                                                              metric_results  = metric_results,
 114 +                                                                              )
 115
 116             # Domain-specific analysis
 117 +           domain_analysis = self._generate_domain_analysis(domain          = domain,
 118 +                                                            metric_results  = metric_results,
 119 +                                                            ensemble_result = ensemble_result,
 120 +                                                            )
 121
 122             # Ensemble methodology explanation
 123 +           ensemble_analysis = self._explain_ensemble_methodology(ensemble_result = ensemble_result,
 124 +                                                                  ensemble_method = ensemble_method,
 125 +                                                                  )
 126
 127             # Uncertainty analysis
 128 +           uncertainty_analysis = self._analyze_uncertainty(ensemble_result = ensemble_result)
 129
 130             # Generate recommendations
 131 +           recommendations = self._generate_ensemble_recommendations(ensemble_result = ensemble_result,
 132 +                                                                     metric_results  = metric_results,
 133 +                                                                     domain          = domain,
 134 +                                                                     )
 135 +
 136 +           return DetailedReasoningResult(summary                = summary,
 137 +                                          key_indicators         = key_indicators,
 138 +                                          metric_explanations    = metric_explanations,
 139 +                                          supporting_evidence    = supporting_evidence,
 140 +                                          contradicting_evidence = contradicting_evidence,
 141 +                                          confidence_explanation = confidence_explanation,
 142 +                                          domain_analysis        = domain_analysis,
 143 +                                          ensemble_analysis      = ensemble_analysis,
 144 +                                          recommendations        = recommendations,
 145 +                                          uncertainty_analysis   = uncertainty_analysis,
 146 +                                          )
 147
 148
 149         def _generate_ensemble_summary(self, ensemble_result: EnsembleResult, domain: Domain, text_length: int, ensemble_method: str) -> str:
 150             """
 151             Generate executive summary with ensemble context
 152             """
 153 +           verdict        = ensemble_result.final_verdict
 154 +           synthetic_prob = ensemble_result.synthetic_probability
 155 +           authentic_prob = ensemble_result.authentic_probability
 156 +           hybrid_prob    = ensemble_result.hybrid_probability
 157 +           confidence     = ensemble_result.overall_confidence
 158 +           uncertainty    = ensemble_result.uncertainty_score
 159 +           consensus      = ensemble_result.consensus_level
 160
 161             # Confidence level description
 162             if (confidence >= 0.8):
 181             else:
 182                 consensus_desc = "low consensus"
 183
 184 +           # Build summary based on verdict
 185             summary_parts = list()
 186
 187 +           if (verdict == "Synthetically-Generated"):
 188                 summary_parts.append(f"Ensemble analysis indicates with {conf_desc} ({confidence:.1%}) that this text is "
 189 +                                    f"**likely synthetically-generated** (synthetic probability: {synthetic_prob:.1%}).")
 190 +
 191 +           elif (verdict == "Authentically-Written"):
 192                 summary_parts.append(f"Ensemble analysis indicates with {conf_desc} ({confidence:.1%}) that this text is "
 193 +                                    f"**likely authentically-written** (authentic probability: {authentic_prob:.1%}).")
 194 +
 195 +           elif (verdict == "Hybrid"):
 196                 summary_parts.append(f"Ensemble analysis indicates with {conf_desc} ({confidence:.1%}) that this text "
 197 +                                    f"**contains mixed synthetic/authentic content** (hybrid probability: {hybrid_prob:.1%}).")
 198
 199 +           else:
 200 +               # Uncertain
 201                 summary_parts.append(f"Ensemble analysis is **inconclusive** (confidence: {confidence:.1%}).")
 202
 203             # Add ensemble context
 204             summary_parts.append(f"Metrics show {consensus_desc} among detection methods. Uncertainty level: {uncertainty:.1%}.")
 205
 206             # Add domain and length context
 207 +           if (text_length > 0):
 208 +               summary_parts.append(f"Analysis of {text_length:,} words in **{domain.value}** domain using {self.ENSEMBLE_METHODS.get(ensemble_method, ensemble_method)} ensemble method.")
 209 +
 210 +           else:
 211 +               summary_parts.append(f"Analysis in **{domain.value}** domain using {self.ENSEMBLE_METHODS.get(ensemble_method, ensemble_method)} ensemble method.")
 212
 213             return " ".join(summary_parts)
 214
 218             Identify top indicators considering metric weights and contributions
 219             """
 220             indicators = list()
 221 +           is_synthetic = (ensemble_result.final_verdict == "Synthetically-Generated")
 222
 223             # Use ensemble weights to prioritize indicators
 224 +           weighted_metrics = []
 225 +
 226             for name, result in metric_results.items():
 227                 if result.error:
 228                     continue
 229 +
 230                 weight     = ensemble_result.metric_weights.get(name, 0.0)
 231                 confidence = result.confidence
 232                 # Combine weight and confidence for prioritization
 233                 priority_score = weight * confidence
 234 +
 235                 weighted_metrics.append((name, result, priority_score))
 236
 237             # Sort by priority score
 238             weighted_metrics.sort(key = lambda x: x[2], reverse = True)
 239 +
 240 +           for name, result, priority_score in weighted_metrics[:5]:
 241 +               # Top 5 metrics
 242 +               key_feature = self._extract_ensemble_feature(name, result, is_synthetic, priority_score)
 243
 244                 if key_feature:
 245                     weight_pct = ensemble_result.metric_weights.get(name, 0.0) * 100
 248             return indicators
 249
 250
 251 +       def _extract_ensemble_feature(self, metric_name: str, result: MetricResult, is_synthetic: bool, priority_score: float) -> Optional[str]:
 252             """
 253             Extract significant features considering ensemble context
 254             """
 258                 burstiness = details.get("burstiness_score", 0.5)
 259                 uniformity = details.get("length_uniformity", 0.5)
 260
 261 +               if (is_synthetic and (burstiness < 0.4)):
 262 +                   return f"Low burstiness ({burstiness:.2f}) suggests uniform synthetic patterns"
 263
 264 +               elif (not is_synthetic and (burstiness > 0.6)):
 265                     return f"High burstiness ({burstiness:.2f}) indicates natural variation"
 266 +
 267 +               elif (is_synthetic and (uniformity > 0.7)):
 268 +                   return f"High structural uniformity ({uniformity:.2f}) typical of synthetic text"
 269 +
 270             elif (metric_name == "perplexity"):
 271                 perplexity = details.get("overall_perplexity", 50)
 272
 273 +               if (is_synthetic and perplexity < 35):
 274                     return f"Low perplexity ({perplexity:.1f}) indicates high predictability"
 275 +
 276 +               elif (not is_synthetic and (perplexity > 55)):
 277                     return f"High perplexity ({perplexity:.1f}) suggests human creativity"
 278 +
 279             elif (metric_name == "entropy"):
 280 +               token_diversity = details.get("token_diversity", 0.5)
 281
 282 +               if (is_synthetic and (token_diversity < 0.65)):
 283 +                   return f"Low token diversity ({token_diversity:.2f}) suggests synthetic patterns"
 284 +
 285 +               elif (not is_synthetic and (token_diversity > 0.75)):
 286                     return f"High token diversity ({token_diversity:.2f}) indicates human variety"
 287 +
 288             elif (metric_name == "semantic_analysis"):
 289 +               coherence = details.get("coherence_score", 0.5)
 290
 291 +               if (is_synthetic and (coherence > 0.8)):
 292 +                   return f"Unnaturally high coherence ({coherence:.2f}) typical of synthetic text"
 293 +
 294 +               elif (not is_synthetic and (0.4 <= coherence <= 0.7)):
 295                     return f"Natural coherence variation ({coherence:.2f})"
 296 +
 297             elif (metric_name == "linguistic"):
 298 +               pos_diversity = details.get("pos_diversity", 0.5)
 299
 300 +               if (is_synthetic and (pos_diversity < 0.4)):
 301                     return f"Limited grammatical diversity ({pos_diversity:.2f})"
 302 +
 303 +               elif (not is_synthetic and (pos_diversity > 0.55)):
 304                     return f"Rich grammatical variety ({pos_diversity:.2f})"
 305
 306             elif (metric_name == "multi_perturbation_stability"):
 307                 stability = details.get("stability_score", 0.5)
 308
 309 +               if (is_synthetic and (stability > 0.6)):
 310 +                   return f"High perturbation sensitivity ({stability:.2f})"
 311 +
 312 +               elif (not is_synthetic and (stability < 0.4)):
 313                     return f"Text stability under perturbation ({stability:.2f})"
 314
 315             return None
 327                     continue
 328
 329                 # Get metric description
 330 +               desc = self.METRIC_DESCRIPTIONS.get(name, "analyzes text characteristics")
 331
 332                 # Get weight information
 333                 weight      = metric_weights.get(name, 0.0)
 334                 weight_info = f" (ensemble weight: {weight:.1%})" if weight > 0 else " (low weight in ensemble)"
 335
 336 +               # Determine verdict based on probabilities
 337 +               if (result.synthetic_probability > 0.6):
 338 +                   verdict = "suggests synthetic generation"
 339 +                   prob    = result.synthetic_probability
 340 +
 341 +               elif (result.authentic_probability > 0.6):
 342 +                   verdict = "indicates authentic writing"
 343 +                   prob    = result.authentic_probability
 344 +
 345                 else:
 346                     verdict = "shows mixed signals"
 347 +                   prob    = max(result.synthetic_probability, result.authentic_probability)
 348
 349                 # Build explanation with confidence
 350 +               explanation = (f"This metric {desc}.{weight_info} Result: {verdict} ({prob:.1%} probability) with {result.confidence:.1%} confidence.")
 351
 352                 explanations[name] = explanation
 358             """
 359             Compile evidence considering ensemble consensus and weights
 360             """
 361 +           is_synthetic_verdict = (ensemble_result.final_verdict == "Synthetically-Generated")
 362 +           consensus            = ensemble_result.consensus_level
 363
 364 +           supporting    = list()
 365 +           contradicting = list()
 366
 367             for name, result in metric_results.items():
 368                 if result.error:
 369                     continue
 370
 371 +               weight                    = ensemble_result.metric_weights.get(name, 0.0)
 372 +               metric_suggests_synthetic = (result.synthetic_probability > result.authentic_probability)
 373
 374                 # Weight the evidence by metric importance
 375 +               weight_indicator = "🟢" if (weight > 0.15) else "🟡" if (weight > 0.08) else "⚪"
 376
 377 +               if (metric_suggests_synthetic == is_synthetic_verdict):
 378                     # Supporting evidence
 379 +                   indicator = self._get_synthetic_indicator_from_metric(name, result) if is_synthetic_verdict else self._get_authentic_indicator_from_metric(name, result)
 380
 381                     if indicator:
 382                         supporting.append(f"{weight_indicator} {indicator}")
 383
 384                 else:
 385                     # Contradicting evidence
 386 +                   indicator = self._get_authentic_indicator_from_metric(name, result) if is_synthetic_verdict else self._get_synthetic_indicator_from_metric(name, result)
 387
 388                     if indicator:
 389                         contradicting.append(f"{weight_indicator} {indicator}")
 391             # Add consensus context
 392             if (consensus > 0.7):
 393                 supporting.insert(0, "✅ Strong metric consensus supports this conclusion")
 394 +
 395             elif (consensus < 0.4):
 396                 contradicting.insert(0, "⚠️ Low metric consensus indicates uncertainty")
 397
 398             return supporting, contradicting
 399
 400
 401 +       def _get_synthetic_indicator_from_metric(self, metric_name: str, result: MetricResult) -> Optional[str]:
 402             """
 403 +           Get synthetic indicator from metric result
 404             """
 405             details = result.details
 406
 407             if (metric_name == "structural"):
 408                 if (details.get("burstiness_score", 1.0) < 0.4):
 409 +                   return self.SYNTHETIC_INDICATORS["low_burstiness"]
 410 +
 411             elif (metric_name == "perplexity"):
 412                 if (details.get("overall_perplexity", 100) < 35):
 413 +                   return self.SYNTHETIC_INDICATORS["low_perplexity"]
 414 +
 415             elif (metric_name == "entropy"):
 416                 if (details.get("token_diversity", 1.0) < 0.65):
 417 +                   return self.SYNTHETIC_INDICATORS["low_entropy"]
 418 +
 419             elif (metric_name == "semantic_analysis"):
 420                 if (details.get("coherence_score", 0.5) > 0.75):
 421 +                   return self.SYNTHETIC_INDICATORS["semantic_perfection"]
 422 +
 423             return None
 424
 425
 426 +       def _get_authentic_indicator_from_metric(self, metric_name: str, result: MetricResult) -> Optional[str]:
 427             """
 428 +           Get authentic indicator from metric result
 429             """
 430             details = result.details
 431
 432             if (metric_name == "structural"):
 433                 if (details.get("burstiness_score", 0.0) > 0.6):
 434 +                   return self.AUTHENTIC_INDICATORS["high_burstiness"]
 435 +
 436             elif (metric_name == "perplexity"):
 437                 if (details.get("overall_perplexity", 0) > 55):
 438 +                   return self.AUTHENTIC_INDICATORS["high_perplexity"]
 439 +
 440             elif (metric_name == "entropy"):
 441                 if (details.get("token_diversity", 0.0) > 0.75):
 442 +                   return self.AUTHENTIC_INDICATORS["high_entropy"]
 443 +
 444             return None
 445
 446
 460
 461             if (confidence >= 0.8):
 462                 explanation += "High confidence due to: strong metric agreement, clear patterns, and reliable signal across multiple detection methods."
 463 +
 464             elif (confidence >= 0.6):
 465                 explanation += "Good confidence supported by: general metric agreement and consistent detection patterns."
 466 +
 467             else:
 468                 explanation += "Lower confidence reflects: metric disagreement, ambiguous patterns, or borderline characteristics."
 469
 489
 490             # Add domain-specific threshold context
 491             threshold_info = {Domain.ACADEMIC      : "Higher detection thresholds applied for academic rigor",
 492 +                             Domain.TECHNICAL_DOC : "Elevated thresholds for technical precision requirements",
 493                               Domain.CREATIVE      : "Balanced thresholds accounting for creative expression",
 494                               Domain.SOCIAL_MEDIA  : "Adapted thresholds for informal communication patterns",
 495 +                             Domain.GENERAL       : "Standard detection thresholds applied",
 496                               }
 497
 498             threshold_note = threshold_info.get(domain, "Standard detection thresholds applied")
 508
 509             explanation = f"**Ensemble Methodology**: {method_desc}\n\n"
 510
 511 +           # Explain the top contributing metrics
 512 +           top_metrics = sorted(ensemble_result.metric_weights.items(), key = lambda x: x[1], reverse = True)[:3]
 513
 514             if top_metrics:
 515                 explanation += "**Top contributing metrics**:\n"
 518
 519             # Add reasoning snippets if available
 520             if hasattr(ensemble_result, 'reasoning') and ensemble_result.reasoning:
 521 +               # Filter out section headers and take the first 2 key reasons
 522                 key_reasons = [r for r in ensemble_result.reasoning if not r.startswith('##')][:2]
 523                 if key_reasons:
 524                     explanation += "\n**Key ensemble factors**:\n"
 525                     for reason in key_reasons:
 526 +                       # Clean up the reason text
 527 +                       clean_reason = reason.replace('**', '').replace('✓', '').replace('⚠', '').strip()
 528 +                       explanation += f"• {clean_reason}\n"
 529
 530             return explanation
 531
 538
 539             if (uncertainty < 0.3):
 540                 return "**Low Uncertainty**: Clear detection signals with strong metric agreement. Results are highly reliable."
 541 +
 542             elif (uncertainty < 0.6):
 543                 return "**Moderate Uncertainty**: Some metric disagreement or borderline characteristics. Consider additional context."
 544
 546             return "**High Uncertainty**: Significant metric disagreement or ambiguous patterns. Results should be interpreted with caution and additional verification may be needed."
 547
 548
 549         def _generate_ensemble_recommendations(self, ensemble_result: EnsembleResult, metric_results: Dict[str, MetricResult], domain: Domain) -> List[str]:
 550             """
 551             Generate actionable recommendations based on ensemble results
 556             uncertainty = ensemble_result.uncertainty_score
 557
 558             # Base recommendations by verdict and confidence
 559 +           if (verdict == "Synthetically-Generated"):
 560                 if (confidence >= 0.8):
 561 +                   recommendations.append("**High-confidence synthetic detection**: Consider verified original drafts or alternative assessment methods.")
 562
 563 +               else:
 564 +                   recommendations.append("**Likely synthetic involvement**: Recommend discussion about AI tool usage and verification of understanding.")
 565 +
 566 +           elif (verdict == "Authentically-Written"):
 567                 if (confidence >= 0.8):
 568 +                   recommendations.append("**High-confidence authentic authorship**: No additional verification typically needed.")
 569
 570                 else:
 571 +                   recommendations.append("**Likely authentically-written**: Consider context and writing history for complete assessment.")
 572 +
 573 +           elif (verdict == "Hybrid"):
 574 +               recommendations.append("**Mixed synthetic/authentic content**: Common in collaborative writing. Discuss appropriate AI use guidelines.")
 575 +
 576 +           elif (verdict == "Uncertain"):
 577 +               recommendations.append("**Inconclusive result**: The analysis could not reach a clear determination. Additional context or verification may be needed.")
 578
 579             # Uncertainty-based recommendations
 580             if (uncertainty > 0.6):
 582
 583             # Domain-specific recommendations
 584             domain_recs = {Domain.ACADEMIC      : "For academic work: verify subject mastery through targeted questions or practical application.",
 585 +                          Domain.CREATIVE      : "For creative work: assess originality, personal voice, and creative process documentation.",
 586                            Domain.TECHNICAL_DOC : "For technical content: verify practical expertise and problem-solving ability.",
 587 +                          Domain.SOFTWARE_DEV  : "For code documentation: verify understanding through code review or implementation questions.",
 588 +                          }
 589
 590             if domain in domain_recs:
 591                 recommendations.append(domain_recs[domain])
 601
 602
 603     # Export
 604 +   __all__ = ["ReasoningGenerator"]
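A minimal usage sketch of the new ReasoningGenerator (illustrative only: the ensemble_result and metric_results inputs would come from services.ensemble_classifier and the metric modules, not be constructed by hand, and Domain.ACADEMIC / the word count are placeholder values):

    from config.enums import Domain
    from services.reasoning_generator import ReasoningGenerator

    generator = ReasoningGenerator()
    reasoning = generator.generate(ensemble_result = ensemble_result,   # EnsembleResult from the ensemble classifier
                                   metric_results  = metric_results,    # Dict[str, MetricResult] from the metrics layer
                                   domain          = Domain.ACADEMIC,
                                   text_length     = 842,
                                   ensemble_method = "confidence_calibrated",
                                   )
    # DetailedReasoningResult exposes the fields assembled above
    print(reasoning.summary)
    for rec in reasoning.recommendations:
        print("-", rec)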
setup.sh
DELETED

@@ -1,22 +0,0 @@
-#!/bin/bash
-
-# Post-installation setup script for Hugging Face Spaces
-echo "Starting setup for Text-Authentication Platform ..."
-
-# Download SpaCy model
-echo "Downloading SpaCy English model ..."
-python -m spacy download en_core_web_sm
-
-# Download NLTK data
-echo "Downloading NLTK data ..."
-python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('averaged_perceptron_tagger')"
-
-# Create necessary directories
-echo "Creating directories ..."
-mkdir -p data/reports data/uploads
-
-# Verify installation
-echo "Verifying installations ..."
-python -c "import transformers; import torch; import spacy; print('All core libraries imported successfully.')"
-
-echo "Setup complete!"
test_integration.py
ADDED

@@ -0,0 +1,331 @@
+# test_integration.py
+import os
+import sys
+import json
+from pathlib import Path
+from io import StringIO
+import contextlib
+
+# Add the project root to Python path for imports
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+# Create a string buffer to capture output
+output_buffer = StringIO()
+
+with contextlib.redirect_stdout(output_buffer):
+    # Now import modules
+    from config.enums import ModelType, Domain, Language
+    from config.schemas import ModelConfig, ExtractedDocument, ProcessedText
+    from config.constants import document_extraction_params
+    from config.model_config import MODEL_REGISTRY, get_model_config
+    from config.settings import settings
+    from config.threshold_config import get_threshold_for_domain
+
+    print("=" * 70)
+    print("CONFIG MODULE INTEGRATION TEST")
+    print("=" * 70)
+
+    # Test 1: Enum usage
+    print(f"\n✓ Model Types: {[m.value for m in ModelType][:5]}...")
+
+    # Test 2: Schema instantiation
+    config = ModelConfig(
+        model_id="test",
+        model_type=ModelType.TRANSFORMER,
+        description="Test",
+        size_mb=100
+    )
+    print(f"✓ Schema instantiation: {config.model_id}")
+
+    # Test 3: Constants usage
+    print(f"✓ Max file size: {document_extraction_params.MAX_FILE_SIZE / 1024 / 1024:.1f} MB")
+
+    # Test 4: Model registry
+    print(f"✓ Available models: {list(MODEL_REGISTRY.keys())}")
+
+    # Test 5: Settings
+    print(f"✓ App name: {settings.APP_NAME}")
+    print(f"✓ Environment: {settings.ENVIRONMENT}")
+    print(f"✓ Log dir: {settings.LOG_DIR}")
+    print(f"✓ Model cache dir: {settings.MODEL_CACHE_DIR}")
+
+    # Test 6: Thresholds
+    thresholds = get_threshold_for_domain(Domain.ACADEMIC)
+    print(f"✓ Academic thresholds: {thresholds.ensemble_threshold}")
+
+    print("\n" + "=" * 70)
+    print("PROCESSORS MODULE INTEGRATION TEST")
+    print("=" * 70)
+
+    # Test 7: Document Extractor
+    try:
+        from processors.document_extractor import DocumentExtractor
+
+        # Create a test text file
+        test_text = "This is a test document for integration testing.\n" * 10
+        test_file = Path("test_document.txt")
+
+        # Write test file
+        test_file.write_text(test_text)
+
+        # Test extractor
+        extractor = DocumentExtractor(extract_metadata=True)
+        result = extractor.extract(str(test_file))
+
+        print(f"\n✓ Document Extractor Test:")
+        print(f"  - Success: {result.is_success}")
+        print(f"  - Text length: {len(result.text)} chars")
+        print(f"  - File type: {result.file_type}")
+        print(f"  - Method: {result.extraction_method}")
+
+        # Clean up test file
+        test_file.unlink()
+
+    except Exception as e:
+        print(f"\n✗ Document Extractor failed: {e}")
+
+    # Test 8: Text Processor
+    try:
+        # First check if we have the needed constants
+        from config.constants import text_processing_params
+        print(f"\n✓ Text processing params available")
+
+        from processors.text_processor import TextProcessor
+
+        test_text = "This is a sample text for processing. It contains multiple sentences! " \
+                    "Here is another sentence. And one more for testing."
+
+        processor = TextProcessor()
+        processed = processor.process(test_text)
+
+        print(f"\n✓ Text Processor Test:")
+        print(f"  - Is valid: {processed.is_valid}")
+        print(f"  - Words: {processed.word_count}")
+        print(f"  - Sentences: {processed.sentence_count}")
+        print(f"  - Avg sentence length: {processed.avg_sentence_length:.1f}")
+        print(f"  - Avg word length: {processed.avg_word_length:.1f}")
+
+    except Exception as e:
+        print(f"\n✗ Text Processor failed: {e}")
+        print("  Note: You need to add TextProcessingParams to constants.py")
+
+    # Test 9: Domain Classifier (without model)
+    try:
+        from processors.domain_classifier import DomainClassifier, get_domain_name, is_technical_domain
+
+        test_text = "This is a scientific paper about machine learning and artificial intelligence."
+
+        classifier = DomainClassifier()
+        print(f"\n✓ Domain Classifier initialized")
+
+        # Note: This will fail if models aren't loaded, but we can test the class structure
+        print(f"  - Class structure verified")
+        print(f"  - Domain enum available")
+
+        # Test helper functions
+        ai_ml_domain = Domain.AI_ML
+        print(f"  - AI/ML domain name: {get_domain_name(ai_ml_domain)}")
+        print(f"  - Is technical domain: {is_technical_domain(ai_ml_domain)}")
+
+    except Exception as e:
+        print(f"\n✗ Domain Classifier setup failed: {e}")
+
+    # Test 10: Language Detector (heuristic mode)
+    try:
+        from processors.language_detector import LanguageDetector
+
+        # Test in English
+        english_text = "This is an English text for language detection testing."
+
+        # Use heuristic mode (no model dependency)
+        detector = LanguageDetector(use_model=False)
+        result = detector.detect(english_text)
+
+        print(f"\n✓ Language Detector Test (heuristic):")
+        print(f"  - Primary language: {result.primary_language.value}")
+        print(f"  - Evidence strength: {result.evidence_strength:.2f}")
+        print(f"  - Method: {result.detection_method}")
+        print(f"  - Script: {result.script.value}")
+
+        # Test language check
+        is_english = detector.is_language(english_text, Language.ENGLISH, threshold=0.5)
+        print(f"  - Is English check: {is_english}")
+
+    except Exception as e:
+        print(f"\n✗ Language Detector failed: {e}")
+
+    print("\n" + "=" * 70)
+    print("MODELS MODULE INTEGRATION TEST")
+    print("=" * 70)
+
+    # Test 11: Model Registry
+    try:
+        from models.model_registry import ModelRegistry, get_model_registry
+
+        registry = get_model_registry()
+
+        print(f"\n✓ Model Registry Test:")
+        print(f"  - Singleton pattern working")
+        print(f"  - Registry initialized")
+
+        # Test usage tracking
+        registry.record_model_usage("test_model", 1.5)
+        stats = registry.get_usage_stats("test_model")
+        print(f"  - Usage tracking: {stats.usage_count if stats else 'N/A'}")
+
+        # Test dependency tracking
+        registry.add_dependency("model_b", ["model_a"])
+        deps = registry.get_dependencies("model_b")
+        print(f"  - Dependency tracking: {deps}")
+
+        # Generate report
+        report = registry.generate_usage_report()
+        print(f"  - Report generation: {len(report)} items")
+
+        # Test reset
+        registry.reset_usage_stats("test_model")
+        print(f"  - Reset functionality working")
+
+    except Exception as e:
+        print(f"\n✗ Model Registry failed: {e}")
+
+    # Test 12: Model Manager (without actual downloads)
+    try:
+        from models.model_manager import ModelManager, get_model_manager
+
+        manager = get_model_manager()
+
+        print(f"\n✓ Model Manager Test:")
+        print(f"  - Singleton pattern working")
+        print(f"  - Device: {manager.device}")
+        print(f"  - Cache directory: {manager.cache_dir}")
+
+        # Test metadata
+        metadata = manager.metadata
+        print(f"  - Metadata loaded: {len(metadata)} entries")
+
+        # Test cache
+        cache_size = manager.cache.size()
+        print(f"  - Cache initialized: size {cache_size}")
+
+        # Test model info check
+        model_name = list(MODEL_REGISTRY.keys())[0] if MODEL_REGISTRY else "perplexity_reference_lm"
+        is_downloaded = manager.is_model_downloaded(model_name)
+        print(f"  - Model check: {model_name} downloaded={is_downloaded}")
+
+        # Test memory usage
+        memory_info = manager.get_memory_usage()
+        print(f"  - Memory monitoring: {len(memory_info)} metrics")
+
+        # Test model configuration access
+        model_config = get_model_config(model_name)
+        if model_config:
+            print(f"  - Model config access: {model_config.model_id}")
+
+    except Exception as e:
+        print(f"\n✗ Model Manager failed: {e}")
+
+    # Test 13: Integration between models and config
+    try:
+        print(f"\n✓ Config-Models Integration Test:")
+
+        # Check model config from registry
+        for model_name, config in MODEL_REGISTRY.items():
+            if config.required:
+                print(f"  - {model_name}: {config.model_type.value}")
+                break
+
+        # Check settings integration
+        print(f"  - Max cached models from settings: {settings.MAX_CACHED_MODELS}")
+        print(f"  - Use quantization from settings: {settings.USE_QUANTIZATION}")
+
+    except Exception as e:
+        print(f"\n✗ Config-Models integration failed: {e}")
+
+    # Test 14: End-to-End System Integration
+    try:
+        print(f"\n" + "=" * 70)
+        print("FULL SYSTEM INTEGRATION TEST")
+        print("=" * 70)
+
+        # Create a test scenario
+        sample_text = """
+        Machine learning is a subset of artificial intelligence.
+        It involves algorithms that learn patterns from data.
+        Deep learning uses neural networks with multiple layers.
+        """
+
+        # 1. Process text
+        from processors.text_processor import TextProcessor
+        processor = TextProcessor()
+        processed = processor.process(sample_text)
+
+        print(f"✓ 1. Text Processing Complete:")
+        print(f"  - Cleaned text: {len(processed.cleaned_text)} chars")
+        print(f"  - Valid: {processed.is_valid}")
+
+        # 2. Detect language
+        from processors.language_detector import LanguageDetector
+        detector = LanguageDetector(use_model=False)
+        lang_result = detector.detect(processed.cleaned_text)
+
+        print(f"\n✓ 2. Language Detection Complete:")
+        print(f"  - Language: {lang_result.primary_language.value}")
+        print(f"  - Script: {lang_result.script.value}")
+
+        # 3. Domain classification structure
+        from processors.domain_classifier import get_domain_name, is_technical_domain
+        ai_ml_domain = Domain.AI_ML
+
+        print(f"\n✓ 3. Domain System Ready:")
+        print(f"  - Domain enum: {ai_ml_domain.value}")
+        print(f"  - Human name: {get_domain_name(ai_ml_domain)}")
+        print(f"  - Is technical: {is_technical_domain(ai_ml_domain)}")
+
+        # 4. Model management
+        from models.model_manager import get_model_manager
+        from models.model_registry import get_model_registry
+
+        model_manager = get_model_manager()
+        model_registry = get_model_registry()
+
+        print(f"\n✓ 4. Model Management Ready:")
+        print(f"  - Manager: {type(model_manager).__name__}")
+        print(f"  - Registry: {type(model_registry).__name__}")
+        print(f"  - Cache dir exists: {model_manager.cache_dir.exists()}")
+
+        # 5. Settings integration
+        print(f"\n✓ 5. Settings Integration:")
+        print(f"  - App: {settings.APP_NAME} v{settings.APP_VERSION}")
+        print(f"  - Environment: {settings.ENVIRONMENT}")
+        print(f"  - Debug: {settings.DEBUG}")
+
+        print(f"\n🎯 FULL SYSTEM INTEGRATION SUCCESSFUL!")
+
+    except Exception as e:
+        print(f"\n✗ Full system integration failed: {e}")
+        import traceback
+        print(traceback.format_exc())
+
+    print("\n" + "=" * 70)
+    print("TEST COMPLETED")
+    print("=" * 70)
+
+# Get the captured output
+output_text = output_buffer.getvalue()
+
+# Print the output
+print(output_text)
+
+# Count successes and failures
+success_count = sum(1 for line in output_text.split('\n') if '✓' in line)
+failure_count = sum(1 for line in output_text.split('\n') if '✗' in line)
+
+print(f"Successes: {success_count}")
+print(f"Failures: {failure_count}")
+
+if failure_count == 0:
+    print("\n🎉 ALL TESTS PASSED! Complete system is properly integrated.")
+else:
+    print(f"\n⚠️ {failure_count} tests failed. Check the issues above.")
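The script above counts ✓/✗ marks in captured stdout rather than asserting; a hedged sketch of how one of its checks could be expressed as a plain pytest test instead (hypothetical rewrite, assuming the same config package is importable):

    # Pytest equivalent of Test 2 (schema instantiation)
    from config.enums import ModelType
    from config.schemas import ModelConfig

    def test_model_config_schema():
        config = ModelConfig(model_id    = "test",
                             model_type  = ModelType.TRANSFORMER,
                             description = "Test",
                             size_mb     = 100,
                             )
        assert config.model_id == "test"
        assert config.model_type is ModelType.TRANSFORMER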
text_auth_app.py
CHANGED

@@ -3,6 +3,7 @@ import os
 import time
 import json
 import uvicorn
+import numpy as np
 from typing import Any
 from typing import List

@@ -19,31 +20,26 @@ from fastapi import Request
 from datetime import datetime
 from fastapi import UploadFile
 from pydantic import BaseModel
 from fastapi import HTTPException
 from fastapi import BackgroundTasks
 from config.settings import settings
 from utils.logger import central_logger
 from utils.logger import log_api_request
-from detector.attribution import AIModel
-from config.threshold_config import Domain
 from fastapi.responses import JSONResponse
 from fastapi.responses import HTMLResponse
 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
-from utils.logger import
-from
-from
-from
-from detector.orchestrator import DetectionResult
-from detector.attribution import AttributionResult
 from fastapi.middleware.cors import CORSMiddleware
-from processors.text_processor import TextProcessor
 from reporter.report_generator import ReportGenerator
-from
-from processors.domain_classifier import DomainClassifier
-from processors.language_detector import LanguageDetector
 from processors.document_extractor import DocumentExtractor
-from

@@ -202,7 +198,6 @@ class TextAnalysisRequest(SerializableBaseModel):
     """
     text                   : str = Field(..., min_length = 50, max_length = 50000, description = "Text to analyze")
     domain                 : Optional[str] = Field(None, description = "Override automatic domain detection")
-    enable_attribution     : bool = Field(True, description = "Enable AI model attribution")
     enable_highlighting    : bool = Field(True, description = "Generate sentence highlighting")
     skip_expensive_metrics : bool = Field(False, description = "Skip computationally expensive metrics")
     use_sentence_level     : bool = Field(True, description = "Use sentence-level analysis for highlighting")

@@ -217,7 +212,6 @@ class TextAnalysisResponse(SerializableBaseModel):
     status           : str
     analysis_id      : str
     detection_result : Dict[str, Any]
-    attribution      : Optional[Dict[str, Any]] = None
     highlighted_html : Optional[str] = None
     reasoning        : Optional[Dict[str, Any]] = None
     report_files     : Optional[Dict[str, str]] = None

@@ -231,7 +225,6 @@ class BatchAnalysisRequest(SerializableBaseModel):
     """
     texts                  : List[str] = Field(..., min_items = 1, max_items = 100)
     domain                 : Optional[str] = None
-    enable_attribution     : bool = False
     skip_expensive_metrics : bool = True
     generate_reports       : bool = False

@@ -243,7 +236,6 @@ class BatchAnalysisResult(SerializableBaseModel):
     index        : int
     status       : str
     detection    : Optional[Dict[str, Any]] = None
-    attribution  : Optional[Dict[str, Any]] = None
     reasoning    : Optional[Dict[str, Any]] = None
     report_files : Optional[Dict[str, str]] = None
     error        : Optional[str] = None

@@ -271,7 +263,6 @@ class FileAnalysisResponse(SerializableBaseModel):
     analysis_id      : str
     file_info        : Dict[str, Any]
     detection_result : Dict[str, Any]
-    attribution      : Optional[Dict[str, Any]] = None
     highlighted_html : Optional[str] = None
     reasoning        : Optional[Dict[str, Any]] = None
     report_files     : Optional[Dict[str, str]] = None

@@ -327,6 +318,7 @@ class AnalysisCache:
         self.ttl_seconds = ttl_seconds
         logger.info(f"AnalysisCache initialized (max_size={max_size}, ttl={ttl_seconds}s)")

     def set(self, analysis_id: str, data: Dict[str, Any]) -> None:
         """
         Store analysis result in cache

@@ -335,18 +327,20 @@ class AnalysisCache:
         self._cleanup_expired()

         # If cache is full, remove oldest entry
-        if len(self.cache) >= self.max_size:
-            oldest_key = min(self.cache.keys(), key=lambda k: self.cache[k]['timestamp'])
             del self.cache[oldest_key]
             logger.debug(f"Cache full, removed oldest entry: {oldest_key}")

         # Store new entry
-        self.cache[analysis_id] = {
-
-
-        }
         logger.debug(f"Cached analysis: {analysis_id} (cache size: {len(self.cache)})")

     def get(self, analysis_id: str) -> Optional[Dict[str, Any]]:
         """
         Retrieve analysis result from cache

@@ -358,7 +352,7 @@ class AnalysisCache:
         entry = self.cache[analysis_id]

         # Check if expired
-        if time.time() - entry['timestamp'] > self.ttl_seconds:
             del self.cache[analysis_id]
             logger.debug(f"Cache expired: {analysis_id}")
             return None

@@ -366,15 +360,13 @@ class AnalysisCache:
         logger.debug(f"Cache hit: {analysis_id}")
         return entry['data']

     def _cleanup_expired(self) -> None:
         """
         Remove expired entries from cache
         """
         current_time = time.time()
-        expired_keys = [
-            key for key, entry in self.cache.items()
-            if current_time - entry['timestamp'] > self.ttl_seconds
-        ]

         for key in expired_keys:
             del self.cache[key]

@@ -382,6 +374,7 @@ class AnalysisCache:
         if expired_keys:
             logger.debug(f"Cleaned up {len(expired_keys)} expired cache entries")

     def clear(self) -> None:
         """
         Clear all cached entries

@@ -390,6 +383,7 @@ class AnalysisCache:
         self.cache.clear()
         logger.info(f"Cache cleared ({count} entries removed)")

     def size(self) -> int:
         """
         Get current cache size
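The AnalysisCache hunks above keep a per-entry timestamp, evict the oldest entry when the cache is full, and drop entries older than ttl_seconds on access or cleanup; a self-contained sketch of that scheme (illustrative, not the app's exact class — logging omitted):

    import time
    from typing import Any, Dict, Optional

    class TTLCache:
        """Minimal timestamped cache mirroring the eviction logic above."""

        def __init__(self, max_size: int = 100, ttl_seconds: int = 3600):
            self.cache: Dict[str, Dict[str, Any]] = {}
            self.max_size = max_size
            self.ttl_seconds = ttl_seconds

        def set(self, key: str, data: Any) -> None:
            self._cleanup_expired()
            if len(self.cache) >= self.max_size:
                # Evict the entry with the oldest timestamp
                oldest = min(self.cache, key=lambda k: self.cache[k]["timestamp"])
                del self.cache[oldest]
            self.cache[key] = {"data": data, "timestamp": time.time()}

        def get(self, key: str) -> Optional[Any]:
            entry = self.cache.get(key)
            if entry is None:
                return None
            if time.time() - entry["timestamp"] > self.ttl_seconds:
                # Expired: remove and treat as a miss
                del self.cache[key]
                return None
            return entry["data"]

        def _cleanup_expired(self) -> None:
            now = time.time()
            for k in [k for k, e in self.cache.items() if now - e["timestamp"] > self.ttl_seconds]:
                del self.cache[k]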
@@ -398,8 +392,8 @@ class AnalysisCache:

 # ==================== FASTAPI APPLICATION ====================
-app = FastAPI(title = "
-              description = "
               version = "1.0.0",
               docs_url = "/api/docs",
               redoc_url = "/api/redoc",

@@ -423,25 +417,26 @@ if ui_static_path.exists():

 # Global instances
 orchestrator       : Optional[DetectionOrchestrator] = None
-attributor         : Optional[ModelAttributor] = None
 highlighter        : Optional[TextHighlighter] = None
 reporter           : Optional[ReportGenerator] = None
 reasoning_generator: Optional[ReasoningGenerator] = None
 document_extractor : Optional[DocumentExtractor] = None
 analysis_cache     : Optional[AnalysisCache] = None

 # App state
 app_start_time = time.time()

 initialization_status = {"orchestrator"        : False,
-                         "attributor"          : False,
                          "highlighter"         : False,
                          "reporter"            : False,
                          "reasoning_generator" : False,
                          "document_extractor"  : False,
-                         "analysis_cache"      : False,
-

 # ==================== APPLICATION LIFECYCLE ====================

@@ -451,12 +446,12 @@ async def startup_event():
     Initialize all components on startup
     """
     global orchestrator
-    global attributor
     global highlighter
     global reporter
     global reasoning_generator
     global document_extractor
     global analysis_cache
     global initialization_status

     # Initialize centralized logging first

@@ -464,36 +459,36 @@ async def startup_event():
         raise RuntimeError("Failed to initialize logging system")

     logger.info("=" * 80)
-    logger.info("TEXT-AUTH API Starting Up...")
     logger.info("=" * 80)

     try:
-        # Initialize
         logger.info("Initializing Detection Orchestrator...")
-
-
-
-

         if orchestrator.initialize():
             initialization_status["orchestrator"] = True
-            logger.success("✓ Detection Orchestrator initialized")

         else:
             logger.warning("⚠ Detection Orchestrator initialization incomplete")

-        # Initialize Model Attributor
-        logger.info("Initializing Model Attributor...")
-
-        attributor = ModelAttributor()
-
-        if attributor.initialize():
-            initialization_status["attributor"] = True
-            logger.success("✓ Model Attributor initialized")
-
-        else:
-            logger.warning("⚠ Model Attributor initialization incomplete")
-
         # Initialize Text Highlighter
         logger.info("Initializing Text Highlighter...")

@@ -542,10 +537,11 @@ async def startup_event():
         logger.success("✓ Analysis Cache initialized")

         logger.info("=" * 80)
-        logger.success("TEXT-AUTH API Ready!")
         logger.info(f"Server: {settings.HOST}:{settings.PORT}")
         logger.info(f"Environment: {settings.ENVIRONMENT}")
         logger.info(f"Device: {settings.DEVICE}")
         logger.info("=" * 80)

     except Exception as e:

@@ -559,6 +555,12 @@ async def shutdown_event():
     """
     Cleanup on shutdown
     """
     if analysis_cache:
         analysis_cache.clear()

@@ -567,13 +569,12 @@ async def shutdown_event():
     logger.info("Shutdown complete")

-
 # ==================== UTILITY FUNCTIONS ====================
 def _get_domain_description(domain: Domain) -> str:
     """
     Get description for a domain
     """
-    descriptions = {Domain.GENERAL : "General
                     Domain.ACADEMIC : "Academic papers, essays, research",
                     Domain.CREATIVE : "Creative writing, fiction, poetry",
                     Domain.AI_ML    : "AI/ML research papers, technical content",

@@ -693,7 +694,7 @@ def _parse_domain(domain_str: Optional[str]) -> Optional[Domain]:

     # Try to match with underscores/spaces variations
     normalized_with_underscores = normalized_domain.replace(' ', '_')
-    if normalized_with_underscores in domain_mapping:
         return domain_mapping[normalized_with_underscores]

     # Try partial matching for more flexibility

@@ -724,19 +725,18 @@ def _validate_file_extension(filename: str) -> str:
     return file_extension

-def _generate_reasoning(detection_result: DetectionResult
     """
-    Generate detailed reasoning
     """
     if not reasoning_generator:
         return {}

     try:
-        reasoning = reasoning_generator.generate(ensemble_result
-                                                 metric_results
-                                                 domain
-
-                                                 text_length = detection_result.processed_text.word_count,
                                                  )

         return safe_serialize_response(reasoning.to_dict())

@@ -746,8 +746,7 @@ def _generate_reasoning(detection_result: DetectionResult, attribution_result: O
         return {}

-def _generate_reports(detection_result: DetectionResult,
-                      analysis_id: str = None) -> Dict[str, str]:
     """
     Generate reports for detection results
     """

@@ -756,7 +755,6 @@ def _generate_reports(detection_result: DetectionResult, attribution_result: Opt

     try:
         report_files = reporter.generate_complete_report(detection_result      = detection_result,
-                                                         attribution_result    = attribution_result,
                                                          highlighted_sentences = highlighted_sentences,
                                                          formats               = ["json", "pdf"],
                                                          filename_prefix       = analysis_id or f"report_{int(time.time() * 1000)}",

@@ -768,6 +766,55 @@ def _generate_reports(detection_result: DetectionResult, attribution_result: Opt
         return {}

 # ==================== ROOT & HEALTH ENDPOINTS ====================
 @app.get("/", response_class = HTMLResponse)
 async def root():

@@ -783,10 +830,10 @@ async def root():

     # Fallback to static directory if exists
     ui_static_path = Path(__file__).parent / "ui" / "static"
-    index_path

     if index_path.exists():
-        with open(index_path, 'r', encoding='utf-8') as f:
             return HTMLResponse(content=f.read())

     return HTMLResponse(content = """

@@ -794,7 +841,7 @@ async def root():
     <head><title>TEXT-AUTH API</title></head>
     <body style="font-family: sans-serif; padding: 50px; text-align: center;">
         <h1>🔍 TEXT-AUTH API</h1>
-        <p>
         <p><a href="/api/docs">API Documentation</a></p>
         <p><a href="/health">Health Check</a></p>
     </body>

@@ -809,20 +856,22 @@ async def health_check():
     Health check endpoint
     """
     return HealthCheckResponse(status        = "healthy" if all(initialization_status.values()) else "degraded",
-                               version       = "
                                uptime        = time.time() - app_start_time,
                                models_loaded = initialization_status,
-

 # ==================== ANALYSIS ENDPOINTS ====================
 @app.post("/api/analyze", response_model = TextAnalysisResponse)
 async def analyze_text(request: TextAnalysisRequest):
     """
-    Analyze text for
     """
     if not orchestrator:
-        raise HTTPException(status_code=503,

     start_time  = time.time()
     analysis_id = f"analysis_{int(time.time() * 1000)}"

@@ -836,42 +885,68 @@ async def analyze_text(request: TextAnalysisRequest):
                             detail = f"Invalid domain. Valid options: {[d.value for d in Domain]}",
                             )

-    # Run detection analysis
-    logger.info(f"[{analysis_id}] Analyzing text ({len(request.text)} chars)")

-    detection_result
-
-
-

     # Convert detection result to ensure serializability
-    detection_dict

-    #
-
-

-
     try:
-        logger.info(f"[{analysis_id}]
-
-
-
-
-
-
-
     except Exception as e:
-        logger.warning(f"
-
-
-
-
-
     try:
-        logger.info(f"[{analysis_id}] Generating highlights...")
         highlighted_sentences = highlighter.generate_highlights(text            = request.text,
                                                                 metric_results  = detection_result.metric_results,
                                                                 ensemble_result = detection_result.ensemble_result,

@@ -880,29 +955,25 @@ async def analyze_text(request: TextAnalysisRequest):

         highlighted_html = highlighter.generate_html(highlighted_sentences = highlighted_sentences,
                                                      include_legend        = False,
-                                                     include_metrics       = request.include_metrics_summary,
                                                      )
     except Exception as e:
         logger.warning(f"Highlighting failed: {e}")
-

-
-
-
-    )

     # Generate reports (if requested)
-    report_files

     if request.generate_report:
         try:
             logger.info(f"[{analysis_id}] Generating reports...")
-            report_files = _generate_reports
-
-
-
-
-
         except Exception as e:
             logger.warning(f"Report generation failed: {e}")

@@ -911,14 +982,12 @@ async def analyze_text(request: TextAnalysisRequest):
     # Cache the full analysis result
     if analysis_cache:
         cache_data = {'detection_result'      : detection_result,
-                      'attribution_result'    : attribution_result,
                       'highlighted_sentences' : highlighted_sentences,
                       'original_text'         : request.text,
                       'processing_time'       : processing_time,
                       }

         analysis_cache.set(analysis_id, cache_data)
-
         logger.debug(f"Cached analysis: {analysis_id}")

     # Log the detection event

@@ -928,14 +997,12 @@ async def analyze_text(request: TextAnalysisRequest):
                         confidence          = detection_result.ensemble_result.overall_confidence,
                         domain              = detection_result.domain_prediction.primary_domain.value,
                         processing_time     = processing_time,
-                        enable_attribution  = request.enable_attribution,
                         enable_highlighting = request.enable_highlighting,
                         )

     return TextAnalysisResponse(status           = "success",
                                 analysis_id      = analysis_id,
                                 detection_result = detection_dict,
-                                attribution      = attribution_dict,
                                 highlighted_html = highlighted_html,
                                 reasoning        = reasoning_dict,
                                 report_files     = report_files,

@@ -943,13 +1010,12 @@ async def analyze_text(request: TextAnalysisRequest):
                                 timestamp = datetime.now().isoformat(),
                                 )

-    except HTTPException:
         central_logger.log_error("TextAnalysisError",
                                  f"Analysis failed for request",
                                  {"text_length": len(request.text)},
                                  e,
                                  )
-
         raise

     except Exception as e:

@@ -960,10 +1026,9 @@ async def analyze_text(request: TextAnalysisRequest):

 @app.post("/api/analyze/file", response_model = FileAnalysisResponse)
-async def analyze_file(file: UploadFile = File(...), domain: Optional[str] = Form(None),
-                       use_sentence_level: bool = Form(True), include_metrics_summary: bool = Form(True), generate_report: bool = Form(False)):
     """
-    Analyze uploaded document
     """
     if not document_extractor or not orchestrator:
         raise HTTPException(status_code = 503,

@@ -993,13 +1058,13 @@ async def analyze_file(file: UploadFile = File(...), domain: Optional[str] = For

     logger.info(f"[{analysis_id}] Extracted {len(extracted_doc.text)} characters")

-    # Parse domain and analyze
-    domain_enum

-    detection_result
-
-
-

     # Set file_info on detection_result
     detection_result.file_info = {"filename" : file.filename,

@@ -1010,60 +1075,62 @@ async def analyze_file(file: UploadFile = File(...), domain: Optional[str] = For
                                   }

     # Convert to serializable dict
-    detection_dict

-    # Attribution
-    attribution_result = None
-    attribution_dict   = None

-
-    try:
-        attribution_result = attributor.attribute(text           = extracted_doc.text,
-                                                  processed_text = detection_result.processed_text,
-                                                  metric_results = detection_result.metric_results,
-                                                  domain         = detection_result.domain_prediction.primary_domain,
-                                                  )
-
-        attribution_dict = safe_serialize_response(attribution_result.to_dict())
-
-    except Exception as e:
-        logger.warning(f"Attribution failed: {e}")
-
-    # Highlighting
     highlighted_sentences = None
     highlighted_html      = None

     try:
-
-
-
-
-
-
-
-
     except Exception as e:
-        logger.warning(f"
-
-
-
-

     # Generate reports (if requested)
|
| 1057 |
-
report_files
|
| 1058 |
|
| 1059 |
if generate_report:
|
| 1060 |
try:
|
| 1061 |
logger.info(f"[{analysis_id}] Generating reports...")
|
| 1062 |
-
report_files = _generate_reports
|
| 1063 |
-
|
| 1064 |
-
|
| 1065 |
-
|
| 1066 |
-
|
|
|
|
| 1067 |
except Exception as e:
|
| 1068 |
logger.warning(f"Report generation failed: {e}")
|
| 1069 |
|
|
@@ -1072,7 +1139,6 @@ async def analyze_file(file: UploadFile = File(...), domain: Optional[str] = For
|
|
| 1072 |
# Cache the full analysis result including Original Text
|
| 1073 |
if analysis_cache:
|
| 1074 |
cache_data = {'detection_result' : detection_result,
|
| 1075 |
-
'attribution_result' : attribution_result,
|
| 1076 |
'highlighted_sentences' : highlighted_sentences,
|
| 1077 |
'original_text' : extracted_doc.text,
|
| 1078 |
'processing_time' : processing_time,
|
|
@@ -1090,7 +1156,6 @@ async def analyze_file(file: UploadFile = File(...), domain: Optional[str] = For
|
|
| 1090 |
"highlighted_html" : highlighted_html is not None,
|
| 1091 |
},
|
| 1092 |
detection_result = detection_dict,
|
| 1093 |
-
attribution = attribution_dict,
|
| 1094 |
highlighted_html = highlighted_html,
|
| 1095 |
reasoning = reasoning_dict,
|
| 1096 |
report_files = report_files,
|
|
@@ -1111,7 +1176,7 @@ async def analyze_file(file: UploadFile = File(...), domain: Optional[str] = For
|
|
| 1111 |
@app.post("/api/analyze/batch", response_model = BatchAnalysisResponse)
|
| 1112 |
async def batch_analyze(request: BatchAnalysisRequest):
|
| 1113 |
"""
|
| 1114 |
-
Analyze multiple texts in batch
|
| 1115 |
- Limits : 1-100 texts per request
|
| 1116 |
"""
|
| 1117 |
if not orchestrator:
|
|
@@ -1124,78 +1189,76 @@ async def batch_analyze(request: BatchAnalysisRequest):
|
|
| 1124 |
detail = "Maximum 100 texts per batch",
|
| 1125 |
)
|
| 1126 |
|
| 1127 |
-
|
| 1128 |
start_time = time.time()
|
| 1129 |
batch_id = f"batch_{int(time.time() * 1000)}"
|
| 1130 |
|
| 1131 |
try:
|
| 1132 |
# Parse domain
|
| 1133 |
-
domain
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1134 |
|
| 1135 |
-
|
| 1136 |
|
| 1137 |
-
results
|
|
|
|
| 1138 |
|
| 1139 |
-
for i,
|
| 1140 |
-
|
| 1141 |
-
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
| 1151 |
-
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
-
processed_text = detection_result.processed_text,
|
| 1157 |
-
metric_results = detection_result.metric_results,
|
| 1158 |
-
domain = detection_result.domain_prediction.primary_domain,
|
| 1159 |
-
)
|
| 1160 |
|
| 1161 |
-
|
| 1162 |
|
| 1163 |
-
|
| 1164 |
-
|
| 1165 |
-
|
| 1166 |
-
|
| 1167 |
-
|
| 1168 |
-
|
| 1169 |
-
|
| 1170 |
-
|
| 1171 |
-
|
| 1172 |
-
|
| 1173 |
-
|
| 1174 |
-
|
| 1175 |
-
try:
|
| 1176 |
-
report_files = _generate_reports(detection_result = detection_result,
|
| 1177 |
-
attribution_result = attribution_result,
|
| 1178 |
-
analysis_id = f"{batch_id}_{i}"
|
| 1179 |
-
)
|
| 1180 |
-
except Exception:
|
| 1181 |
-
pass
|
| 1182 |
-
|
| 1183 |
results.append(BatchAnalysisResult(index = i,
|
| 1184 |
status = "success",
|
| 1185 |
detection = detection_dict,
|
| 1186 |
-
attribution = attribution_dict,
|
| 1187 |
reasoning = reasoning_dict,
|
| 1188 |
-
report_files =
|
| 1189 |
-
)
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
except Exception as e:
|
| 1193 |
-
logger.error(f"[{batch_id}]
|
| 1194 |
-
results.append(BatchAnalysisResult(index
|
| 1195 |
-
status
|
| 1196 |
-
|
| 1197 |
-
|
| 1198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1199 |
|
| 1200 |
processing_time = time.time() - start_time
|
| 1201 |
success_count = sum(1 for r in results if r.status == "success")
|
|
@@ -1243,7 +1306,6 @@ async def generate_report(background_tasks: BackgroundTasks, analysis_id: str =
|
|
| 1243 |
|
| 1244 |
# Extract cached data
|
| 1245 |
detection_result = cached_data['detection_result']
|
| 1246 |
-
attribution_result = cached_data.get('attribution_result')
|
| 1247 |
highlighted_sentences = cached_data.get('highlighted_sentences')
|
| 1248 |
|
| 1249 |
# Parse formats
|
|
@@ -1259,12 +1321,12 @@ async def generate_report(background_tasks: BackgroundTasks, analysis_id: str =
|
|
| 1259 |
# Generate reports using cached data
|
| 1260 |
logger.info(f"Generating {', '.join(requested_formats)} report(s) for {analysis_id}")
|
| 1261 |
|
| 1262 |
-
report_files = reporter.generate_complete_report
|
| 1263 |
-
|
| 1264 |
-
|
| 1265 |
-
|
| 1266 |
-
|
| 1267 |
-
|
| 1268 |
|
| 1269 |
# Extract only the filename from the full path for the response
|
| 1270 |
report_filenames = dict()
|
|
@@ -1289,6 +1351,7 @@ async def generate_report(background_tasks: BackgroundTasks, analysis_id: str =
|
|
| 1289 |
detail = str(e),
|
| 1290 |
)
|
| 1291 |
|
|
|
|
| 1292 |
@app.get("/api/report/download/{filename}")
|
| 1293 |
async def download_report(filename: str):
|
| 1294 |
"""
|
|
@@ -1328,19 +1391,6 @@ async def list_domains():
|
|
| 1328 |
return {"domains": domains_list}
|
| 1329 |
|
| 1330 |
|
| 1331 |
-
@app.get("/api/models")
|
| 1332 |
-
async def list_ai_models():
|
| 1333 |
-
"""
|
| 1334 |
-
List all AI models that can be attributed
|
| 1335 |
-
"""
|
| 1336 |
-
return {"models" : [{"value" : model.value,
|
| 1337 |
-
"name" : model.value.replace('-', ' ').replace('_', ' ').title(),
|
| 1338 |
-
}
|
| 1339 |
-
for model in AIModel if model not in [AIModel.HUMAN, AIModel.UNKNOWN]
|
| 1340 |
-
]
|
| 1341 |
-
}
|
| 1342 |
-
|
| 1343 |
-
|
| 1344 |
@app.get("/api/cache/stats")
|
| 1345 |
async def get_cache_stats():
|
| 1346 |
"""
|
|
@@ -1418,6 +1468,7 @@ async def log_requests(request: Request, call_next):
|
|
| 1418 |
|
| 1419 |
|
| 1420 |
|
|
|
|
| 1421 |
# ==================== MAIN ====================
|
| 1422 |
if __name__ == "__main__":
|
| 1423 |
# Configure logging
|
|
@@ -1430,4 +1481,4 @@ if __name__ == "__main__":
|
|
| 1430 |
reload = settings.DEBUG,
|
| 1431 |
log_level = log_level,
|
| 1432 |
workers = 1 if settings.DEBUG else settings.WORKERS,
|
| 1433 |
-
)
|
|
|
|
   3        import time
   4        import json
   5        import uvicorn
   6 +      import asyncio
   7        import numpy as np
   8        from typing import Any
   9        from typing import List

  20        from datetime import datetime
  21        from fastapi import UploadFile
  22        from pydantic import BaseModel
  23 +      from config.enums import Domain
  24        from fastapi import HTTPException
  25        from fastapi import BackgroundTasks
  26        from config.settings import settings
  27        from utils.logger import central_logger
  28        from utils.logger import log_api_request
  29        from fastapi.responses import JSONResponse
  30        from fastapi.responses import HTMLResponse
  31        from fastapi.responses import FileResponse
  32 +      from config.schemas import DetectionResult
  33        from fastapi.staticfiles import StaticFiles
  34 +      from utils.logger import log_analysis_event
  35 +      from services.highlighter import TextHighlighter
  36 +      from concurrent.futures import ThreadPoolExecutor
  37 +      from concurrent.futures import ProcessPoolExecutor
  38        from fastapi.middleware.cors import CORSMiddleware
  39        from reporter.report_generator import ReportGenerator
  40 +      from services.orchestrator import DetectionOrchestrator
  41        from processors.document_extractor import DocumentExtractor
  42 +      from services.reasoning_generator import ReasoningGenerator
  43
  44
  45
 198        """
 199        text : str = Field(..., min_length = 50, max_length = 50000, description = "Text to analyze")
 200        domain : Optional[str] = Field(None, description = "Override automatic domain detection")
 201        enable_highlighting : bool = Field(True, description = "Generate sentence highlighting")
 202        skip_expensive_metrics : bool = Field(False, description = "Skip computationally expensive metrics")
 203        use_sentence_level : bool = Field(True, description = "Use sentence-level analysis for highlighting")

 212        status : str
 213        analysis_id : str
 214        detection_result : Dict[str, Any]
 215        highlighted_html : Optional[str] = None
 216        reasoning : Optional[Dict[str, Any]] = None
 217        report_files : Optional[Dict[str, str]] = None

 225        """
 226        texts : List[str] = Field(..., min_items = 1, max_items = 100)
 227        domain : Optional[str] = None
 228        skip_expensive_metrics : bool = True
 229        generate_reports : bool = False
 230

 236        index : int
 237        status : str
 238        detection : Optional[Dict[str, Any]] = None
 239        reasoning : Optional[Dict[str, Any]] = None
 240        report_files : Optional[Dict[str, str]] = None
 241        error : Optional[str] = None

 263        analysis_id : str
 264        file_info : Dict[str, Any]
 265        detection_result : Dict[str, Any]
 266        highlighted_html : Optional[str] = None
 267        reasoning : Optional[Dict[str, Any]] = None
 268        report_files : Optional[Dict[str, str]] = None
 318        self.ttl_seconds = ttl_seconds
 319        logger.info(f"AnalysisCache initialized (max_size={max_size}, ttl={ttl_seconds}s)")
 320
 321 +
 322    def set(self, analysis_id: str, data: Dict[str, Any]) -> None:
 323        """
 324        Store analysis result in cache

 327        self._cleanup_expired()
 328
 329        # If cache is full, remove oldest entry
 330 +      if (len(self.cache) >= self.max_size):
 331 +          oldest_key = min(self.cache.keys(), key = lambda k: self.cache[k]['timestamp'])
 332 +
 333            del self.cache[oldest_key]
 334 +
 335            logger.debug(f"Cache full, removed oldest entry: {oldest_key}")
 336
 337        # Store new entry
 338 +      self.cache[analysis_id] = {'data'      : data,
 339 +                                 'timestamp' : time.time()
 340 +                                }
 341        logger.debug(f"Cached analysis: {analysis_id} (cache size: {len(self.cache)})")
 342
 343 +
 344    def get(self, analysis_id: str) -> Optional[Dict[str, Any]]:
 345        """
 346        Retrieve analysis result from cache

 352        entry = self.cache[analysis_id]
 353
 354        # Check if expired
 355 +      if ((time.time() - entry['timestamp']) > self.ttl_seconds):
 356            del self.cache[analysis_id]
 357            logger.debug(f"Cache expired: {analysis_id}")
 358            return None

 360        logger.debug(f"Cache hit: {analysis_id}")
 361        return entry['data']
 362
 363 +
 364    def _cleanup_expired(self) -> None:
 365        """
 366        Remove expired entries from cache
 367        """
 368        current_time = time.time()
 369 +      expired_keys = [key for key, entry in self.cache.items() if ((current_time - entry['timestamp']) > self.ttl_seconds)]
 370
 371        for key in expired_keys:
 372            del self.cache[key]

 374        if expired_keys:
 375            logger.debug(f"Cleaned up {len(expired_keys)} expired cache entries")
 376
 377 +
 378    def clear(self) -> None:
 379        """
 380        Clear all cached entries

 383        self.cache.clear()
 384        logger.info(f"Cache cleared ({count} entries removed)")
 385
 386 +
 387    def size(self) -> int:
 388        """
 389        Get current cache size
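The `AnalysisCache` changes above pair lazy TTL expiry (checked on both read and write) with oldest-first eviction once `max_size` is reached. A minimal, self-contained sketch of the same policy follows; the names here are illustrative stand-ins, not the app's actual class:

```python
import time
from typing import Any, Dict, Optional

class TTLCache:
    """Oldest-entry eviction plus per-entry TTL, mirroring the pattern above."""

    def __init__(self, max_size: int = 100, ttl_seconds: int = 3600):
        self.cache: Dict[str, Dict[str, Any]] = {}
        self.max_size = max_size
        self.ttl_seconds = ttl_seconds

    def set(self, key: str, data: Any) -> None:
        if len(self.cache) >= self.max_size:
            # Evict the entry with the smallest (oldest) timestamp
            oldest = min(self.cache, key=lambda k: self.cache[k]['timestamp'])
            del self.cache[oldest]
        self.cache[key] = {'data': data, 'timestamp': time.time()}

    def get(self, key: str) -> Optional[Any]:
        entry = self.cache.get(key)
        if entry is None:
            return None
        if (time.time() - entry['timestamp']) > self.ttl_seconds:
            del self.cache[key]  # expire lazily on read
            return None
        return entry['data']
```

As in the app's cache, nothing expires in the background: stale entries are dropped only when touched, which keeps the class cheap and simple for a single-process deployment.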
 392
 393
 394    # ==================== FASTAPI APPLICATION ====================
 395 +  app = FastAPI(title       = "Text Forensics API",
 396 +                description = "Evidence-based statistical and linguistic text analysis API",
 397                  version     = "1.0.0",
 398                  docs_url    = "/api/docs",
 399                  redoc_url   = "/api/redoc",

 417
 418    # Global instances
 419    orchestrator       : Optional[DetectionOrchestrator] = None
 420    highlighter        : Optional[TextHighlighter] = None
 421    reporter           : Optional[ReportGenerator] = None
 422    reasoning_generator: Optional[ReasoningGenerator] = None
 423    document_extractor : Optional[DocumentExtractor] = None
 424    analysis_cache     : Optional[AnalysisCache] = None
 425
 426 +  # Thread pool executor for parallel processing
 427 +  parallel_executor  : Optional[ThreadPoolExecutor] = None
 428
 429    # App state
 430    app_start_time = time.time()
 431
 432    initialization_status = {"orchestrator"        : False,
 433                             "highlighter"         : False,
 434                             "reporter"            : False,
 435                             "reasoning_generator" : False,
 436                             "document_extractor"  : False,
 437 +                           "analysis_cache"      : False,
 438 +                           "parallel_executor"   : False,
 439 +                          }
 440
 441
 442    # ==================== APPLICATION LIFECYCLE ====================
 446        Initialize all components on startup
 447        """
 448        global orchestrator
 449        global highlighter
 450        global reporter
 451        global reasoning_generator
 452        global document_extractor
 453        global analysis_cache
 454 +      global parallel_executor
 455        global initialization_status
 456
 457        # Initialize centralized logging first

 459            raise RuntimeError("Failed to initialize logging system")
 460
 461        logger.info("=" * 80)
 462 +      logger.info("TEXT-AUTH Forensic Analysis API Starting Up...")
 463        logger.info("=" * 80)
 464
 465        try:
 466 +          # Initialize ThreadPoolExecutor for parallel metric calculation
 467 +          logger.info("Initializing Parallel Executor...")
 468 +          parallel_executor = ThreadPoolExecutor(
 469 +              max_workers = getattr(settings, 'PARALLEL_WORKERS', 4)
 470 +          )
 471 +          initialization_status["parallel_executor"] = True
 472 +          logger.success(f"✓ Parallel Executor initialized with {parallel_executor._max_workers} workers")
 473 +
 474 +          # Initialize Detection Orchestrator with parallel execution enabled
 475            logger.info("Initializing Detection Orchestrator...")
 476 +
 477 +          # Use the factory method to create orchestrator with executor
 478 +          orchestrator = DetectionOrchestrator.create_with_executor(
 479 +              max_workers               = getattr(settings, 'PARALLEL_WORKERS', 4),
 480 +              enable_language_detection = True,
 481 +              parallel_execution        = True,  # Enable parallel execution
 482 +              skip_expensive_metrics    = False,
 483 +          )
 484
 485            if orchestrator.initialize():
 486                initialization_status["orchestrator"] = True
 487 +              logger.success("✓ Detection Orchestrator initialized with parallel execution")
 488
 489            else:
 490                logger.warning("⚠ Detection Orchestrator initialization incomplete")
 491
 492        # Initialize Text Highlighter
 493        logger.info("Initializing Text Highlighter...")

 537        logger.success("✓ Analysis Cache initialized")
 538
 539        logger.info("=" * 80)
 540 +      logger.success("TEXT-AUTH Forensic Analysis API Ready!")
 541        logger.info(f"Server: {settings.HOST}:{settings.PORT}")
 542        logger.info(f"Environment: {settings.ENVIRONMENT}")
 543        logger.info(f"Device: {settings.DEVICE}")
 544 +      logger.info(f"Parallel Execution: Enabled")
 545        logger.info("=" * 80)
 546
 547    except Exception as e:

 555        """
 556        Cleanup on shutdown
 557        """
 558 +      # Clean up orchestrator first (it will handle executor cleanup)
 559 +      if orchestrator:
 560 +          orchestrator.cleanup()
 561 +          logger.info("Orchestrator cleanup complete")
 562 +
 563 +      # Additional cleanup
 564        if analysis_cache:
 565            analysis_cache.clear()
 566

 569        logger.info("Shutdown complete")
 570
 571

 572    # ==================== UTILITY FUNCTIONS ====================
 573    def _get_domain_description(domain: Domain) -> str:
 574        """
 575        Get description for a domain
 576        """
 577 +      descriptions = {Domain.GENERAL  : "General-purpose text without domain-specific structure",
 578                        Domain.ACADEMIC : "Academic papers, essays, research",
 579                        Domain.CREATIVE : "Creative writing, fiction, poetry",
 580                        Domain.AI_ML    : "AI/ML research papers, technical content",

 694
 695        # Try to match with underscores/spaces variations
 696        normalized_with_underscores = normalized_domain.replace(' ', '_')
 697 +      if (normalized_with_underscores in domain_mapping):
 698            return domain_mapping[normalized_with_underscores]
 699
 700        # Try partial matching for more flexibility

 725        return file_extension
 726
 727
 728 +  def _generate_reasoning(detection_result: DetectionResult) -> Dict[str, Any]:
 729        """
 730 +      Generate detailed forensic reasoning explaining metric-level evidence
 731        """
 732        if not reasoning_generator:
 733            return {}
 734
 735        try:
 736 +          reasoning = reasoning_generator.generate(ensemble_result = detection_result.ensemble_result,
 737 +                                                   metric_results  = detection_result.metric_results,
 738 +                                                   domain          = detection_result.domain_prediction.primary_domain,
 739 +                                                   text_length     = detection_result.processed_text.word_count,
 740                                                    )
 741
 742            return safe_serialize_response(reasoning.to_dict())

 746        return {}
 747
 748
 749 +  def _generate_reports(detection_result: DetectionResult, highlighted_sentences: Optional[List] = None, analysis_id: str = None) -> Dict[str, str]:
 750        """
 751        Generate reports for detection results
 752        """

 755
 756        try:
 757            report_files = reporter.generate_complete_report(detection_result      = detection_result,
 758                                                             highlighted_sentences = highlighted_sentences,
 759                                                             formats               = ["json", "pdf"],
 760                                                             filename_prefix       = analysis_id or f"report_{int(time.time() * 1000)}",

 766        return {}
 767
 768
 769 +  # ==================== ASYNC HELPER FUNCTIONS ====================
 770 +  async def _run_detection_parallel(text: str, domain: Optional[Domain], skip_expensive: bool) -> DetectionResult:
 771 +      """
 772 +      Run forensic analysis in parallel mode
 773 +      """
 774 +      if not orchestrator:
 775 +          raise HTTPException(status_code = 503, detail = "Service not initialized")
 776 +
 777 +      # Use orchestrator's analyze method which now handles parallel execution internally
 778 +      return orchestrator.analyze(text           = text,
 779 +                                  domain         = domain,
 780 +                                  skip_expensive = skip_expensive,
 781 +                                 )
 782 +
 783 +
 784 +  async def _run_batch_analysis_parallel(texts: List[str], domain: Optional[Domain], skip_expensive: bool) -> List[DetectionResult]:
 785 +      """
 786 +      Run batch analysis with parallel execution
 787 +      """
 788 +      if not orchestrator:
 789 +          raise HTTPException(status_code = 503, detail = "Service not initialized")
 790 +
 791 +      # Create tasks for parallel execution
 792 +      tasks = list()
 793 +
 794 +      for text in texts:
 795 +          task = asyncio.create_task(asyncio.to_thread(orchestrator.analyze,
 796 +                                                        text           = text,
 797 +                                                        domain         = domain,
 798 +                                                        skip_expensive = skip_expensive,
 799 +                                                       )
 800 +                                    )
 801 +          tasks.append(task)
 802 +
 803 +      # Wait for all tasks to complete
 804 +      results = await asyncio.gather(*tasks, return_exceptions = True)
 805 +
 806 +      # Keep exceptions in the result list so the caller can report failures
 807 +      # per item instead of aborting the whole batch
 808 +      return list(results)
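The helpers above are the core of the parallel path: `asyncio.to_thread` pushes each blocking `orchestrator.analyze` call onto a worker thread, and `asyncio.gather(..., return_exceptions=True)` collects results and failures together so one bad text cannot cancel the batch. A minimal sketch of that fan-out pattern in isolation; `slow_analyze` is a stand-in for the real orchestrator call:

```python
import asyncio
import time

def slow_analyze(text: str) -> int:
    # Stand-in for the blocking orchestrator.analyze call
    time.sleep(0.1)
    if not text:
        raise ValueError("empty text")
    return len(text)

async def fan_out(texts):
    # Each blocking call runs on the default thread pool
    tasks = [asyncio.create_task(asyncio.to_thread(slow_analyze, t)) for t in texts]
    # return_exceptions=True: failed items come back as exception objects
    return await asyncio.gather(*tasks, return_exceptions=True)

if __name__ == "__main__":
    print(asyncio.run(fan_out(["alpha", "", "gamma"])))
    # -> [5, ValueError('empty text'), 5]
```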
 818    # ==================== ROOT & HEALTH ENDPOINTS ====================
 819    @app.get("/", response_class = HTMLResponse)
 820    async def root():

 830
 831        # Fallback to static directory if exists
 832        ui_static_path = Path(__file__).parent / "ui" / "static"
 833 +      index_path     = ui_static_path / "index.html"
 834
 835        if index_path.exists():
 836 +          with open(index_path, 'r', encoding = 'utf-8') as f:
 837                return HTMLResponse(content=f.read())
 838
 839        return HTMLResponse(content = """

 841        <head><title>TEXT-AUTH API</title></head>
 842        <body style="font-family: sans-serif; padding: 50px; text-align: center;">
 843            <h1>🔍 TEXT-AUTH API</h1>
 844 +          <p>Evidence-First Text Forensics Platform v1.0</p>
 845            <p><a href="/api/docs">API Documentation</a></p>
 846            <p><a href="/health">Health Check</a></p>
 847        </body>

 856        Health check endpoint
 857        """
 858        return HealthCheckResponse(status = "healthy" if all(initialization_status.values()) else "degraded",
 859 +                                 version = "1.0.0",
 860                                   uptime = time.time() - app_start_time,
 861                                   models_loaded = initialization_status,
 862 +                                )
 863
 864
 865    # ==================== ANALYSIS ENDPOINTS ====================
 866    @app.post("/api/analyze", response_model = TextAnalysisResponse)
 867    async def analyze_text(request: TextAnalysisRequest):
 868        """
 869 +      Analyze text for statistical consistency with language-model generation patterns using parallel metric calculation
 870        """
 871        if not orchestrator:
 872 +          raise HTTPException(status_code = 503,
 873 +                              detail      = "Service not initialized",
 874 +                             )
 875
 876        start_time = time.time()
 877        analysis_id = f"analysis_{int(time.time() * 1000)}"

 885                detail = f"Invalid domain. Valid options: {[d.value for d in Domain]}",
 886            )
 887
 888 +      # Run detection analysis with parallel execution (handled internally by orchestrator)
 889 +      logger.info(f"[{analysis_id}] Analyzing text ({len(request.text)} chars) with parallel metrics")
 890
 891 +      detection_result = await _run_detection_parallel(text           = request.text,
 892 +                                                       domain         = domain,
 893 +                                                       skip_expensive = request.skip_expensive_metrics
 894 +                                                      )
 895
 896        # Convert detection result to ensure serializability
 897 +      detection_dict = safe_serialize_response(detection_result.to_dict())
 898
 899 +      # Highlighting (if enabled) - run in parallel with reasoning generation
 900 +      highlighted_sentences = None
 901 +      highlighted_html      = None
 902 +      reasoning_dict        = dict()
 903
 904 +      # Run highlighting and reasoning generation in parallel if both are needed
 905 +      if (request.enable_highlighting and highlighter and reasoning_generator):
 906            try:
 907 +              logger.info(f"[{analysis_id}] Generating highlights and reasoning in parallel...")
 908 +
 909 +              # Create parallel tasks for highlighting and reasoning
 910 +              highlight_task = asyncio.create_task(asyncio.to_thread(highlighter.generate_highlights,
 911 +                                                                     text               = request.text,
 912 +                                                                     metric_results     = detection_result.metric_results,
 913 +                                                                     ensemble_result    = detection_result.ensemble_result,
 914 +                                                                     use_sentence_level = request.use_sentence_level,
 915 +                                                                    )
 916 +                                                  )
 917 +
 918 +              reasoning_task = asyncio.create_task(asyncio.to_thread(_generate_reasoning,
 919 +                                                                     detection_result = detection_result
 920 +                                                                    )
 921 +                                                  )
 922 +
 923 +              # Wait for both tasks to complete
 924 +              highlighted_sentences, reasoning_dict = await asyncio.gather(highlight_task, reasoning_task)
 925 +
 926 +              # Generate HTML from highlighted sentences
 927 +              highlighted_html = highlighter.generate_html(highlighted_sentences = highlighted_sentences,
 928 +                                                           include_legend        = False,
 929 +                                                          )
 930 +
 931            except Exception as e:
 932 +              logger.warning(f"Parallel highlighting/reasoning failed: {e}")
 933 +              # Fallback to sequential if parallel fails
 934 +              try:
 935 +                  highlighted_sentences = highlighter.generate_highlights(text               = request.text,
 936 +                                                                          metric_results     = detection_result.metric_results,
 937 +                                                                          ensemble_result    = detection_result.ensemble_result,
 938 +                                                                          use_sentence_level = request.use_sentence_level,
 939 +                                                                         )
 940 +
 941 +                  highlighted_html = highlighter.generate_html(highlighted_sentences = highlighted_sentences,
 942 +                                                               include_legend        = False,
 943 +                                                              )
 944 +              except Exception as e2:
 945 +                  logger.warning(f"Highlighting fallback also failed: {e2}")
 946 +
 947 +      elif request.enable_highlighting and highlighter:
 948 +          # Only highlighting requested
 949            try:
 950                highlighted_sentences = highlighter.generate_highlights(text = request.text,
 951                                                                        metric_results = detection_result.metric_results,
 952                                                                        ensemble_result = detection_result.ensemble_result,

 955
 956                highlighted_html = highlighter.generate_html(highlighted_sentences = highlighted_sentences,
 957                                                             include_legend = False,
 958                                                            )
 959            except Exception as e:
 960                logger.warning(f"Highlighting failed: {e}")
 961
 962 +      elif reasoning_generator:
 963 +          # Only reasoning requested
 964 +          reasoning_dict = _generate_reasoning(detection_result = detection_result)
 965
 966        # Generate reports (if requested)
 967 +      report_files = dict()
 968
 969        if request.generate_report:
 970            try:
 971                logger.info(f"[{analysis_id}] Generating reports...")
 972 +              report_files = await asyncio.to_thread(_generate_reports,
 973 +                                                     detection_result      = detection_result,
 974 +                                                     highlighted_sentences = highlighted_sentences,
 975 +                                                     analysis_id           = analysis_id,
 976 +                                                    )
 977            except Exception as e:
 978                logger.warning(f"Report generation failed: {e}")
 979

 982        # Cache the full analysis result
 983        if analysis_cache:
 984            cache_data = {'detection_result'      : detection_result,
 985                          'highlighted_sentences' : highlighted_sentences,
 986                          'original_text'         : request.text,
 987                          'processing_time'       : processing_time,
 988                         }
 989
 990            analysis_cache.set(analysis_id, cache_data)
 991            logger.debug(f"Cached analysis: {analysis_id}")
 992
 993        # Log the detection event

 997                           confidence = detection_result.ensemble_result.overall_confidence,
 998                           domain = detection_result.domain_prediction.primary_domain.value,
 999                           processing_time = processing_time,
1000                           enable_highlighting = request.enable_highlighting,
1001                          )
1002
1003        return TextAnalysisResponse(status = "success",
1004                                    analysis_id = analysis_id,
1005                                    detection_result = detection_dict,
1006                                    highlighted_html = highlighted_html,
1007                                    reasoning = reasoning_dict,
1008                                    report_files = report_files,

1010                                    timestamp = datetime.now().isoformat(),
1011                                   )
1012
1013 +      except HTTPException as e:
1014            central_logger.log_error("TextAnalysisError",
1015                                     f"Analysis failed for request",
1016                                     {"text_length": len(request.text)},
1017                                     e,
1018                                    )
1019            raise
1020
1021        except Exception as e:
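For reference, the endpoint above can be exercised with a small client such as the sketch below. The host and port are assumptions (adjust to your `settings.HOST`/`settings.PORT`), and the text must clear the 50-character minimum enforced by the request schema:

```python
import requests

payload = {
    "text": "Paste at least fifty characters of prose here so the request passes validation.",
    "domain": None,                    # or e.g. "academic" to override detection
    "enable_highlighting": True,
    "use_sentence_level": True,
    "include_metrics_summary": True,
    "skip_expensive_metrics": False,
    "generate_report": False,
}

resp = requests.post("http://localhost:7860/api/analyze", json=payload, timeout=120)
resp.raise_for_status()
result = resp.json()
print(result["analysis_id"], result["detection_result"].get("ensemble_result"))
```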
1028    @app.post("/api/analyze/file", response_model = FileAnalysisResponse)
1029 +  async def analyze_file(file: UploadFile = File(...), domain: Optional[str] = Form(None), skip_expensive_metrics: bool = Form(False), use_sentence_level: bool = Form(True), include_metrics_summary: bool = Form(True), generate_report: bool = Form(False)):
1030        """
1031 +      Analyze uploaded document for linguistic and statistical consistency patterns using parallel processing
1032        """
1033        if not document_extractor or not orchestrator:
1034            raise HTTPException(status_code = 503,

1058
1059        logger.info(f"[{analysis_id}] Extracted {len(extracted_doc.text)} characters")
1060
1061 +      # Parse domain and analyze with parallel execution
1062 +      domain_enum = _parse_domain(domain)
1063
1064 +      detection_result = await _run_detection_parallel(text           = extracted_doc.text,
1065 +                                                       domain         = domain_enum,
1066 +                                                       skip_expensive = skip_expensive_metrics,
1067 +                                                      )
1068
1069        # Set file_info on detection_result
1070        detection_result.file_info = {"filename" : file.filename,

1075                                     }
1076
1077        # Convert to serializable dict
1078 +      detection_dict = safe_serialize_response(detection_result.to_dict())
1079
1080 +      # Parallel highlighting and reasoning generation
1081        highlighted_sentences = None
1082        highlighted_html      = None
1083 +      reasoning_dict        = {}
1084 +
1085 +      if highlighter and reasoning_generator:
1086            try:
1087 +              # Run highlighting and reasoning in parallel
1088 +              highlight_task = asyncio.create_task(asyncio.to_thread(highlighter.generate_highlights,
1089 +                                                                     text               = extracted_doc.text,
1090 +                                                                     metric_results     = detection_result.metric_results,
1091 +                                                                     ensemble_result    = detection_result.ensemble_result,
1092 +                                                                     use_sentence_level = use_sentence_level,
1093 +                                                                    )
1094 +                                                  )
1095 +
1096 +              reasoning_task = asyncio.create_task(asyncio.to_thread(_generate_reasoning,
1097 +                                                                     detection_result = detection_result
1098 +                                                                    )
1099 +                                                  )
1100 +
1101 +              highlighted_sentences, reasoning_dict = await asyncio.gather(highlight_task, reasoning_task)
1102 +
1103 +              highlighted_html = highlighter.generate_html(highlighted_sentences = highlighted_sentences,
1104 +                                                           include_legend        = False,
1105 +                                                          )
1106 +
1107            except Exception as e:
1108 +              logger.warning(f"Parallel highlighting/reasoning failed: {e}")
1109 +              # Fallback
1110 +              try:
1111 +                  highlighted_sentences = highlighter.generate_highlights(text               = extracted_doc.text,
1112 +                                                                          metric_results     = detection_result.metric_results,
1113 +                                                                          ensemble_result    = detection_result.ensemble_result,
1114 +                                                                          use_sentence_level = use_sentence_level,
1115 +                                                                         )
1116 +                  highlighted_html = highlighter.generate_html(highlighted_sentences = highlighted_sentences,
1117 +                                                               include_legend        = False,
1118 +                                                              )
1119 +              except Exception as e2:
1120 +                  logger.warning(f"Highlighting fallback also failed: {e2}")
1121
1122        # Generate reports (if requested)
1123 +      report_files = dict()
1124
1125        if generate_report:
1126            try:
1127                logger.info(f"[{analysis_id}] Generating reports...")
1128 +              report_files = await asyncio.to_thread(_generate_reports,
1129 +                                                     detection_result      = detection_result,
1130 +                                                     highlighted_sentences = highlighted_sentences,
1131 +                                                     analysis_id           = analysis_id,
1132 +                                                    )
1133 +
1134            except Exception as e:
1135                logger.warning(f"Report generation failed: {e}")
1136

1139        # Cache the full analysis result including Original Text
1140        if analysis_cache:
1141            cache_data = {'detection_result'      : detection_result,
1142                          'highlighted_sentences' : highlighted_sentences,
1143                          'original_text'         : extracted_doc.text,
1144                          'processing_time'       : processing_time,

1156                              "highlighted_html" : highlighted_html is not None,
1157                             },
1158                detection_result = detection_dict,
1159                highlighted_html = highlighted_html,
1160                reasoning        = reasoning_dict,
1161                report_files     = report_files,
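A matching sketch for the file endpoint, posting multipart form data. The field names follow the `Form(...)` signature above, while the file name and server address are placeholders:

```python
import requests

with open("sample.pdf", "rb") as fh:
    resp = requests.post(
        "http://localhost:7860/api/analyze/file",
        files={"file": ("sample.pdf", fh, "application/pdf")},
        data={
            "domain": "academic",
            "skip_expensive_metrics": "false",
            "use_sentence_level": "true",
            "include_metrics_summary": "true",
            "generate_report": "false",
        },
        timeout=300,
    )

resp.raise_for_status()
print(resp.json()["file_info"])
```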
1176    @app.post("/api/analyze/batch", response_model = BatchAnalysisResponse)
1177    async def batch_analyze(request: BatchAnalysisRequest):
1178        """
1179 +      Analyze multiple texts in batch for forensic consistency signals using parallel processing
1180        - Limits : 1-100 texts per request
1181        """
1182        if not orchestrator:

1189            detail = "Maximum 100 texts per batch",
1190        )
1191
1192        start_time = time.time()
1193        batch_id   = f"batch_{int(time.time() * 1000)}"
1194
1195        try:
1196            # Parse domain
1197 +          domain = _parse_domain(request.domain)
1198 +
1199 +          logger.info(f"[{batch_id}] Processing {len(request.texts)} texts with parallel execution")
1200 +
1201 +          # Use parallel batch analysis
1202 +          detection_results = await _run_batch_analysis_parallel(texts          = request.texts,
1203 +                                                                 domain         = domain,
1204 +                                                                 skip_expensive = request.skip_expensive_metrics,
1205 +                                                                )
1206
1207 +          results = list()
1208
1209 +          # Process results with parallel reasoning generation
1210 +          reasoning_tasks = list()
1211
1212 +          for i, detection_result in enumerate(detection_results):
1213 +              if isinstance(detection_result, Exception):
1214 +                  results.append(BatchAnalysisResult(index  = i,
1215 +                                                     status = "error",
1216 +                                                     error  = str(detection_result),
1217 +                                                    ))
1218 +                  continue
1219 +
1220 +              # Convert to serializable dict
1221 +              detection_dict = safe_serialize_response(detection_result.to_dict())
1222 +
1223 +              # Start reasoning generation task
1224 +              if reasoning_generator:
1225 +                  task = asyncio.create_task(asyncio.to_thread(_generate_reasoning,
1226 +                                                                detection_result = detection_result
1227 +                                                               )
1228 +                                            )
1229
1230 +                  reasoning_tasks.append((i, task, detection_dict))
1231
1232 +              else:
1233 +                  results.append(BatchAnalysisResult(index        = i,
1234 +                                                     status       = "success",
1235 +                                                     detection    = detection_dict,
1236 +                                                     reasoning    = {},
1237 +                                                     report_files = None,
1238 +                                                    ))
1239 +
1240 +          # Wait for all reasoning tasks to complete
1241 +          for i, task, detection_dict in reasoning_tasks:
1242 +              try:
1243 +                  reasoning_dict = await task
1244                    results.append(BatchAnalysisResult(index        = i,
1245                                                       status       = "success",
1246                                                       detection    = detection_dict,
1247                                                       reasoning    = reasoning_dict,
1248 +                                                     report_files = None,
1249 +                                                    ))
1250 +
1251                except Exception as e:
1252 +                  logger.error(f"[{batch_id}] Reasoning generation failed for text {i}: {e}")
1253 +                  results.append(BatchAnalysisResult(index        = i,
1254 +                                                     status       = "success",
1255 +                                                     detection    = detection_dict,
1256 +                                                     reasoning    = {},
1257 +                                                     report_files = None,
1258 +                                                    ))
1259 +
1260 +          # Sort results by index
1261 +          results.sort(key = lambda x: x.index)
1262
1263        processing_time = time.time() - start_time
1264        success_count   = sum(1 for r in results if r.status == "success")
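The batch endpoint accepts 1-100 texts per request and reports per-item status, so a client can retry only the failed indices. A sketch follows; the server address is assumed, and the `results` key is inferred from `BatchAnalysisResult`, so verify it against your `BatchAnalysisResponse` schema:

```python
import requests

payload = {
    "texts": [
        "First passage to analyze, comfortably past the minimum length for a single text.",
        "Second passage to analyze, also long enough that the analyzer has signal to work with.",
    ],
    "domain": None,
    "skip_expensive_metrics": True,
    "generate_reports": False,
}

resp = requests.post("http://localhost:7860/api/analyze/batch", json=payload, timeout=600)
resp.raise_for_status()
for item in resp.json()["results"]:
    print(item["index"], item["status"], item.get("error"))
```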
1306
1307        # Extract cached data
1308        detection_result      = cached_data['detection_result']
1309        highlighted_sentences = cached_data.get('highlighted_sentences')
1310
1311        # Parse formats

1321        # Generate reports using cached data
1322        logger.info(f"Generating {', '.join(requested_formats)} report(s) for {analysis_id}")
1323
1324 +      report_files = await asyncio.to_thread(reporter.generate_complete_report,
1325 +                                             detection_result      = detection_result,
1326 +                                             highlighted_sentences = highlighted_sentences if include_highlights else None,
1327 +                                             formats               = requested_formats,
1328 +                                             filename_prefix       = analysis_id,
1329 +                                            )
1330
1331        # Extract only the filename from the full path for the response
1332        report_filenames = dict()

1351            detail = str(e),
1352        )
1353
1354 +
1355    @app.get("/api/report/download/{filename}")
1356    async def download_report(filename: str):
1357        """

1391        return {"domains": domains_list}
1392
1393

1394    @app.get("/api/cache/stats")
1395    async def get_cache_stats():
1396        """

1468
1469
1470
1471 +
1472    # ==================== MAIN ====================
1473    if __name__ == "__main__":
1474        # Configure logging

1481        reload    = settings.DEBUG,
1482        log_level = log_level,
1483        workers   = 1 if settings.DEBUG else settings.WORKERS,
1484 +     )
ui/static/index.html
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
-
<title>
|
| 7 |
<style>
|
| 8 |
* {
|
| 9 |
margin: 0;
|
|
@@ -731,12 +731,12 @@ input[type="checkbox"] {
|
|
| 731 |
min-width: 60px;
|
| 732 |
text-align: center;
|
| 733 |
}
|
| 734 |
-
.
|
| 735 |
background: rgba(239, 68, 68, 0.2);
|
| 736 |
color: var(--danger);
|
| 737 |
border: 1px solid rgba(239, 68, 68, 0.3);
|
| 738 |
}
|
| 739 |
-
.
|
| 740 |
background: rgba(16, 185, 129, 0.2);
|
| 741 |
color: var(--success);
|
| 742 |
border: 1px solid rgba(16, 185, 129, 0.3);
|
|
@@ -763,54 +763,6 @@ input[type="checkbox"] {
|
|
| 763 |
font-size: 0.9rem;
|
| 764 |
font-weight: 600;
|
| 765 |
}
|
| 766 |
-
/* Attribution Section */
|
| 767 |
-
.attribution-section {
|
| 768 |
-
margin-top: 2rem;
|
| 769 |
-
padding: 1.5rem;
|
| 770 |
-
background: rgba(51, 65, 85, 0.3);
|
| 771 |
-
border-radius: 10px;
|
| 772 |
-
border: 1px solid var(--border);
|
| 773 |
-
}
|
| 774 |
-
.attribution-title {
|
| 775 |
-
font-size: 1.1rem;
|
| 776 |
-
font-weight: 700;
|
| 777 |
-
margin-bottom: 1rem;
|
| 778 |
-
color: #fff;
|
| 779 |
-
}
|
| 780 |
-
.model-match {
|
| 781 |
-
display: flex;
|
| 782 |
-
align-items: center;
|
| 783 |
-
justify-content: space-between;
|
| 784 |
-
padding: 0.75rem;
|
| 785 |
-
background: rgba(6, 182, 212, 0.1);
|
| 786 |
-
border-radius: 6px;
|
| 787 |
-
margin-bottom: 0.5rem;
|
| 788 |
-
}
|
| 789 |
-
.model-name {
|
| 790 |
-
font-weight: 600;
|
| 791 |
-
color: var(--text-primary);
|
| 792 |
-
}
|
| 793 |
-
.model-confidence {
|
| 794 |
-
font-weight: 700;
|
| 795 |
-
color: var(--primary);
|
| 796 |
-
}
|
| 797 |
-
.attribution-confidence {
|
| 798 |
-
margin-top: 0.75rem;
|
| 799 |
-
font-size: 0.85rem;
|
| 800 |
-
color: var(--text-secondary);
|
| 801 |
-
}
|
| 802 |
-
.attribution-uncertain {
|
| 803 |
-
color: var(--text-muted);
|
| 804 |
-
font-style: italic;
|
| 805 |
-
margin-top: 0.5rem;
|
| 806 |
-
font-size: 0.9rem;
|
| 807 |
-
}
|
| 808 |
-
.attribution-reasoning {
|
| 809 |
-
color: var(--text-secondary);
|
| 810 |
-
margin-top: 1rem;
|
| 811 |
-
font-size: 0.9rem;
|
| 812 |
-
line-height: 1.4;
|
| 813 |
-
}
|
| 814 |
/* Download Actions */
|
| 815 |
.download-actions {
|
| 816 |
display: flex;
|
|
@@ -908,11 +860,11 @@ input[type="checkbox"] {
|
|
| 908 |
text-transform: uppercase;
|
| 909 |
margin-top: 0.5rem;
|
| 910 |
}
|
| 911 |
-
.verdict-
|
| 912 |
background: rgba(239, 68, 68, 0.2);
|
| 913 |
color: var(--danger);
|
| 914 |
}
|
| 915 |
-
.verdict-
|
| 916 |
background: rgba(16, 185, 129, 0.2);
|
| 917 |
color: var(--success);
|
| 918 |
}
|
|
@@ -1030,7 +982,7 @@ input[type="checkbox"] {
|
|
| 1030 |
font-size: 1.1rem !important;
|
| 1031 |
}
|
| 1032 |
|
| 1033 |
-
.verdict-
|
| 1034 |
background: rgba(168, 85, 247, 0.2);
|
| 1035 |
color: #a855f7;
|
| 1036 |
border: 1px solid rgba(168, 85, 247, 0.3);
|
|
@@ -1109,11 +1061,11 @@ html {
|
|
| 1109 |
<div class="header">
|
| 1110 |
<a href="#" class="logo" onclick="showLanding(); return false;">
|
| 1111 |
<div class="logo-icon">🔍</div>
|
| 1112 |
-
<span>
|
| 1113 |
</a>
|
| 1114 |
<div class="nav-links">
|
| 1115 |
<a href="#features" class="nav-link">Features</a>
|
| 1116 |
-
<a href="#metrics" class="nav-link">
|
| 1117 |
<a href="#" class="nav-link" onclick="showAnalysis(); return false;">Try It Now</a>
|
| 1118 |
</div>
|
| 1119 |
</div>
|
|
@@ -1121,26 +1073,27 @@ html {
|
|
| 1121 |
<div class="landing-page" id="landing-page">
|
| 1122 |
<!-- Hero Section -->
|
| 1123 |
<section class="hero">
|
| 1124 |
-
<h1 class="hero-title">
|
| 1125 |
-
<p class="hero-subtitle">
|
| 1126 |
<p class="hero-description">
|
| 1127 |
-
|
| 1128 |
-
|
|
|
|
| 1129 |
</p>
|
| 1130 |
<button class="try-btn" onclick="showAnalysis()"> Try It Now → </button>
|
| 1131 |
</section>
|
| 1132 |
<!-- Stats -->
|
| 1133 |
<div class="stats-grid">
|
| 1134 |
<div class="stat-card">
|
| 1135 |
-
<div class="stat-value">
|
| 1136 |
-
<div class="stat-label">False
|
| 1137 |
</div>
|
| 1138 |
<div class="stat-card">
|
| 1139 |
<div class="stat-value">6</div>
|
| 1140 |
-
<div class="stat-label">Total
|
| 1141 |
</div>
|
| 1142 |
<div class="stat-card">
|
| 1143 |
-
<div class="stat-value">
|
| 1144 |
<div class="stat-label">Average Processing Time</div>
|
| 1145 |
</div>
|
| 1146 |
</div>
|
|
@@ -1153,23 +1106,23 @@ html {
|
|
| 1153 |
<div class="features-grid">
|
| 1154 |
<div class="feature-card">
|
| 1155 |
<div class="feature-icon">🎯</div>
|
| 1156 |
-
<h3 class="feature-title">Domain-Aware
|
| 1157 |
<p class="feature-description">
|
| 1158 |
-
Calibrated thresholds for Academic, Technical, Creative, and Casual content types with specialized
|
| 1159 |
</p>
|
| 1160 |
</div>
|
| 1161 |
<div class="feature-card">
|
| 1162 |
<div class="feature-icon">🔬</div>
|
| 1163 |
-
<h3 class="feature-title">6-
|
| 1164 |
<p class="feature-description">
|
| 1165 |
-
Combines
|
| 1166 |
</p>
|
| 1167 |
</div>
|
| 1168 |
<div class="feature-card">
|
| 1169 |
<div class="feature-icon">💡</div>
|
| 1170 |
<h3 class="feature-title">Explainable Results</h3>
|
| 1171 |
<p class="feature-description">
|
| 1172 |
-
Sentence-level highlighting with confidence scores and detailed reasoning for
|
| 1173 |
</p>
|
| 1174 |
</div>
|
| 1175 |
<div class="feature-card">
|
|
@@ -1179,13 +1132,6 @@ html {
|
|
| 1179 |
Analyze short texts in 1.2 seconds, medium documents in 3.5 seconds with parallel metric computation.
|
| 1180 |
</p>
|
| 1181 |
</div>
|
| 1182 |
-
<div class="feature-card">
|
| 1183 |
-
<div class="feature-icon">🤖</div>
|
| 1184 |
-
<h3 class="feature-title">Model Attribution</h3>
|
| 1185 |
-
<p class="feature-description">
|
| 1186 |
-
Identifies which AI model likely generated the text - GPT-4, Claude, Gemini, LLaMA, and more.
|
| 1187 |
-
</p>
|
| 1188 |
-
</div>
|
| 1189 |
<div class="feature-card">
|
| 1190 |
<div class="feature-icon">📄</div>
|
| 1191 |
<h3 class="feature-title">Multi-Format Support</h3>
|
|
@@ -1197,29 +1143,29 @@ html {
|
|
| 1197 |
</section>
|
| 1198 |
<!-- Metrics Section -->
|
| 1199 |
<section class="metrics-info" id="metrics">
|
| 1200 |
-
<h2 class="section-title">
|
| 1201 |
<p class="section-subtitle">
|
| 1202 |
-
Understanding the science behind the
|
| 1203 |
</p>
|
| 1204 |
<div class="metric-card">
|
| 1205 |
<div class="metric-icon-box">📊</div>
|
| 1206 |
<div class="metric-content">
|
| 1207 |
<h3>Perplexity <span class="metric-weight">Weight: 25%</span></h3>
|
| 1208 |
-
<p>Measures how predictable the text is using
|
| 1209 |
</div>
|
| 1210 |
</div>
|
| 1211 |
<div class="metric-card">
|
| 1212 |
<div class="metric-icon-box">🎲</div>
|
| 1213 |
<div class="metric-content">
|
| 1214 |
<h3>Entropy <span class="metric-weight">Weight: 20%</span></h3>
|
| 1215 |
-
<p>Calculates token-level diversity and unpredictability in text sequences. Human writing shows higher entropy with more varied word choices, while
|
| 1216 |
</div>
|
| 1217 |
</div>
|
| 1218 |
<div class="metric-card">
|
| 1219 |
<div class="metric-icon-box">📈</div>
|
| 1220 |
<div class="metric-content">
|
| 1221 |
<h3>Structural Analysis <span class="metric-weight">Weight: 15%</span></h3>
|
| 1222 |
-
<p>Analyzes sentence length variance, punctuation patterns, and lexical burstiness. Human writing exhibits more variation in sentence structure and rhythm compared to
|
| 1223 |
</div>
|
| 1224 |
</div>
|
| 1225 |
<div class="metric-card">
|
|
@@ -1233,21 +1179,21 @@ html {
|
|
| 1233 |
<div class="metric-icon-box">🧠</div>
|
| 1234 |
<div class="metric-content">
|
| 1235 |
<h3>Semantic Analysis <span class="metric-weight">Weight: 15%</span></h3>
|
| 1236 |
-
<p>Assesses semantic coherence, repetition patterns, and contextual consistency.
|
| 1237 |
</div>
|
| 1238 |
</div>
|
| 1239 |
<div class="metric-card">
|
| 1240 |
<div class="metric-icon-box">🔍</div>
|
| 1241 |
<div class="metric-content">
|
| 1242 |
<h3>Multi-Perturbation Stability <span class="metric-weight">Weight: 10%</span></h3>
|
| 1243 |
-
<p>Tests text stability under random perturbations.
|
| 1244 |
</div>
|
| 1245 |
</div>
|
| 1246 |
</section>
|
| 1247 |
<!-- Footer -->
|
| 1248 |
<footer class="footer">
|
| 1249 |
-
<p>© 2025
|
| 1250 |
-
<p style="margin-top: 1rem;">
|
| 1251 |
</footer>
|
| 1252 |
</div>
|
| 1253 |
<!-- Analysis Interface -->
|
|
@@ -1270,7 +1216,7 @@ html {
|
|
| 1270 |
id="text-input"
|
| 1271 |
class="text-input"
|
| 1272 |
placeholder="Paste your text here for analysis...
|
| 1273 |
-
The more text you provide (minimum 50 characters), the more
|
| 1274 |
></textarea>
|
| 1275 |
</div>
|
| 1276 |
<div id="upload-tab" class="tab-content">
|
|
@@ -1300,13 +1246,6 @@ The more text you provide (minimum 50 characters), the more accurate the detecti
|
|
| 1300 |
<option value="social_media">Social Media</option>
|
| 1301 |
</select>
|
| 1302 |
</div>
|
| 1303 |
-
<div class="option-row">
|
| 1304 |
-
<label class="option-label">Enable AI Model Attribution:</label>
|
| 1305 |
-
<div class="checkbox-wrapper">
|
| 1306 |
-
<input type="checkbox" id="enable-attribution" checked>
|
| 1307 |
-
<span style="font-size: 0.85rem; color: var(--text-muted);">Identify which AI model generated the text</span>
|
| 1308 |
-
</div>
|
| 1309 |
-
</div>
|
| 1310 |
<div class="option-row">
|
| 1311 |
<label class="option-label">Enable Sentence Highlighting:</label>
|
| 1312 |
<div class="checkbox-wrapper">
|
|
@@ -1366,8 +1305,8 @@ The more text you provide (minimum 50 characters), the more accurate the detecti
|
|
| 1366 |
<div class="empty-icon">✓</div>
|
| 1367 |
<h3 class="empty-title">Ready for Analysis</h3>
|
| 1368 |
<p class="empty-description">
|
| 1369 |
-
Paste text or upload a document to begin
|
| 1370 |
-
Our
|
| 1371 |
</p>
|
| 1372 |
</div>
|
| 1373 |
</div>
|
|
@@ -1429,7 +1368,6 @@ function resetAnalysisInterface() {
|
|
| 1429 |
document.getElementById('paste-tab').classList.add('active');
|
| 1430 |
// Reset options to defaults
|
| 1431 |
document.getElementById('domain-select').value = '';
|
| 1432 |
-
document.getElementById('enable-attribution').checked = true;
|
| 1433 |
document.getElementById('enable-highlighting').checked = true;
|
| 1434 |
document.getElementById('use-sentence-level').checked = true;
|
| 1435 |
document.getElementById('include-metrics-summary').checked = true;
|
|
@@ -1444,8 +1382,8 @@ function resetAnalysisInterface() {
|
|
| 1444 |
<div class="empty-icon">✓</div>
|
| 1445 |
<h3 class="empty-title">Ready for Analysis</h3>
|
| 1446 |
<p class="empty-description">
|
| 1447 |
-
Paste text or upload a document to begin
|
| 1448 |
-
Our
|
| 1449 |
</p>
|
| 1450 |
</div>
|
| 1451 |
`;
|
|
@@ -1613,7 +1551,6 @@ async function performAnalysis(mode, text, file) {
|
|
| 1613 |
|
| 1614 |
async function analyzeText(text) {
|
| 1615 |
const domain = document.getElementById('domain-select').value || null;
|
| 1616 |
-
const enableAttribution = document.getElementById('enable-attribution').checked;
|
| 1617 |
const enableHighlighting = document.getElementById('enable-highlighting').checked;
|
| 1618 |
const useSentenceLevel = document.getElementById('use-sentence-level').checked;
|
| 1619 |
const includeMetricsSummary = document.getElementById('include-metrics-summary').checked;
|
|
@@ -1624,7 +1561,6 @@ async function analyzeText(text) {
|
|
| 1624 |
body: JSON.stringify({
|
| 1625 |
text: text,
|
| 1626 |
domain: domain,
|
| 1627 |
-
enable_attribution: enableAttribution,
|
| 1628 |
enable_highlighting: enableHighlighting,
|
| 1629 |
use_sentence_level: useSentenceLevel,
|
| 1630 |
include_metrics_summary: includeMetricsSummary,
|
|
@@ -1641,14 +1577,12 @@ async function analyzeText(text) {
|
|
| 1641 |
|
| 1642 |
async function analyzeFile(file) {
|
| 1643 |
const domain = document.getElementById('domain-select').value || null;
|
| 1644 |
-
const enableAttribution = document.getElementById('enable-attribution').checked;
|
| 1645 |
const useSentenceLevel = document.getElementById('use-sentence-level').checked;
|
| 1646 |
const includeMetricsSummary = document.getElementById('include-metrics-summary').checked;
|
| 1647 |
|
| 1648 |
const formData = new FormData();
|
| 1649 |
formData.append('file', file);
|
| 1650 |
if (domain) formData.append('domain', domain);
|
| 1651 |
-
formData.append('enable_attribution', enableAttribution.toString());
|
| 1652 |
formData.append('use_sentence_level', useSentenceLevel.toString());
|
| 1653 |
formData.append('include_metrics_summary', includeMetricsSummary.toString());
|
| 1654 |
formData.append('skip_expensive_metrics', 'false');
|
|
@@ -1669,7 +1603,7 @@ function showLoading() {
|
|
| 1669 |
document.getElementById('summary-report').innerHTML = `
|
| 1670 |
<div class="loading">
|
| 1671 |
<div class="spinner"></div>
|
| 1672 |
-
<p style="color: var(--text-secondary);">Analyzing content
|
| 1673 |
<p style="color: var(--text-muted); font-size: 0.9rem; margin-top: 0.5rem;">
|
| 1674 |
This may take a few seconds
|
| 1675 |
</p>
|
|
@@ -1704,7 +1638,7 @@ function displayResults(data) {
|
|
| 1704 |
const analysis = detection.analysis || {};
|
| 1705 |
|
| 1706 |
// Display Summary with enhanced reasoning
|
| 1707 |
-
displaySummary(ensemble, prediction, analysis, data.
|
| 1708 |
|
| 1709 |
// Display Highlighted Text with enhanced features
|
| 1710 |
if (data.highlighted_html) {
|
|
@@ -1729,31 +1663,27 @@ function displayResults(data) {
|
|
| 1729 |
}
|
| 1730 |
}
|
| 1731 |
|
| 1732 |
-
function displaySummary(ensemble, prediction, analysis,
|
| 1733 |
// Extract and validate data with fallbacks
|
| 1734 |
const {
|
| 1735 |
-
|
| 1736 |
-
|
| 1737 |
-
|
| 1738 |
verdict,
|
| 1739 |
confidence,
|
| 1740 |
domain,
|
| 1741 |
-
|
| 1742 |
gaugeColor,
|
| 1743 |
gaugeDegree,
|
| 1744 |
confidenceLevel,
|
| 1745 |
confidenceClass
|
| 1746 |
} = extractSummaryData(ensemble, analysis);
|
| 1747 |
|
| 1748 |
-
// Generate attribution HTML with proper filtering
|
| 1749 |
-
const attributionHTML = generateAttributionHTML(attribution);
|
| 1750 |
-
|
| 1751 |
document.getElementById('summary-report').innerHTML = `
|
| 1752 |
<div class="result-summary">
|
| 1753 |
-
${createGaugeSection(
|
| 1754 |
-
${createInfoGrid(verdict, confidence, confidenceClass, domain,
|
| 1755 |
${createEnhancedReasoningHTML(ensemble, analysis, reasoning)}
|
| 1756 |
-
${attributionHTML}
|
| 1757 |
${createDownloadActions()}
|
| 1758 |
</div>
|
| 1759 |
`;
|
|
@@ -1761,34 +1691,45 @@ function displaySummary(ensemble, prediction, analysis, attribution, reasoning)
|
|
| 1761 |
|
| 1762 |
// Helper function to extract and validate summary data
|
| 1763 |
function extractSummaryData(ensemble, analysis) {
|
| 1764 |
-
const
|
| 1765 |
-
(ensemble.
|
| 1766 |
-
|
| 1767 |
-
|
| 1768 |
-
|
| 1769 |
-
|
| 1770 |
-
|
| 1771 |
-
|
| 1772 |
-
|
|
|
|
|
|
|
|
|
|
| 1773 |
const verdict = ensemble.final_verdict || 'Unknown';
|
| 1774 |
-
|
| 1775 |
-
|
|
|
|
|
|
|
|
|
|
| 1776 |
const domain = analysis.domain || 'general';
|
| 1777 |
-
|
| 1778 |
-
const
|
| 1779 |
-
|
| 1780 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1781 |
const confidenceLevel = getConfidenceLevel(parseFloat(confidence));
|
| 1782 |
const confidenceClass = getConfidenceClass(confidenceLevel);
|
| 1783 |
|
| 1784 |
return {
|
| 1785 |
-
|
| 1786 |
-
|
| 1787 |
-
|
| 1788 |
verdict,
|
| 1789 |
confidence,
|
| 1790 |
domain,
|
| 1791 |
-
|
| 1792 |
gaugeColor,
|
| 1793 |
gaugeDegree,
|
| 1794 |
confidenceLevel,
|
|
@@ -1813,104 +1754,33 @@ function getConfidenceClass(confidenceLevel) {
|
|
| 1813 |
return classMap[confidenceLevel] || 'confidence-low';
|
| 1814 |
}
|
| 1815 |
|
| 1816 |
-
// Helper function to generate attribution HTML with filtering
|
| 1817 |
-
function generateAttributionHTML(attribution) {
|
| 1818 |
-
if (!attribution || !attribution.predicted_model) {
|
| 1819 |
-
return '';
|
| 1820 |
-
}
|
| 1821 |
-
|
| 1822 |
-
const modelName = formatModelName(attribution.predicted_model);
|
| 1823 |
-
const modelConf = attribution.confidence ?
|
| 1824 |
-
(attribution.confidence * 100).toFixed(1) : 'N/A';
|
| 1825 |
-
|
| 1826 |
-
const topModelsHTML = generateTopModelsHTML(attribution.model_probabilities);
|
| 1827 |
-
const reasoningHTML = generateAttributionReasoningHTML(attribution.reasoning);
|
| 1828 |
-
|
| 1829 |
-
// Only show attribution if confidence is meaningful (>30%)
|
| 1830 |
-
if (attribution.confidence > 0.3) {
|
| 1831 |
-
return `
|
| 1832 |
-
<div class="attribution-section">
|
| 1833 |
-
<div class="attribution-title">🤖 AI Model Attribution</div>
|
| 1834 |
-
${topModelsHTML}
|
| 1835 |
-
<div class="attribution-confidence">
|
| 1836 |
-
Attribution Confidence: <strong>${modelConf}%</strong>
|
| 1837 |
-
</div>
|
| 1838 |
-
${reasoningHTML}
|
| 1839 |
-
</div>
|
| 1840 |
-
`;
|
| 1841 |
-
}
|
| 1842 |
-
|
| 1843 |
-
return '';
|
| 1844 |
-
}
|
| 1845 |
-
|
| 1846 |
-
// Helper function to generate top models HTML with filtering
|
| 1847 |
-
function generateTopModelsHTML(modelProbabilities) {
|
| 1848 |
-
if (!modelProbabilities) {
|
| 1849 |
-
return '<div class="attribution-uncertain">Model probabilities not available</div>';
|
| 1850 |
-
}
|
| 1851 |
-
|
| 1852 |
-
// Filter and sort models
|
| 1853 |
-
const meaningfulModels = Object.entries(modelProbabilities)
|
| 1854 |
-
.sort((a, b) => b[1] - a[1])
|
| 1855 |
-
.filter(([model, prob]) => prob > 0.15) // Only show models with >15% probability
|
| 1856 |
-
.slice(0, 3); // Show top 3
|
| 1857 |
-
|
| 1858 |
-
if (meaningfulModels.length === 0) {
|
| 1859 |
-
return `
|
| 1860 |
-
<div class="attribution-uncertain">
|
| 1861 |
-
Model attribution uncertain - text patterns don't strongly match any specific AI model
|
| 1862 |
-
</div>
|
| 1863 |
-
`;
|
| 1864 |
-
}
|
| 1865 |
-
|
| 1866 |
-
return meaningfulModels.map(([model, prob]) =>
|
| 1867 |
-
`<div class="model-match">
|
| 1868 |
-
<span class="model-name">${formatModelName(model)}</span>
|
| 1869 |
-
<span class="model-confidence">${(prob * 100).toFixed(1)}%</span>
|
| 1870 |
-
</div>`
|
| 1871 |
-
).join('');
|
| 1872 |
-
}
|
| 1873 |
-
|
| 1874 |
// Helper function to format model names
|
| 1875 |
function formatModelName(modelName) {
|
| 1876 |
return modelName.replace(/_/g, ' ').replace(/-/g, ' ').toUpperCase();
|
| 1877 |
}
|
| 1878 |
|
| 1879 |
-
// Helper function to generate attribution reasoning HTML
|
| 1880 |
-
function generateAttributionReasoningHTML(reasoning) {
|
| 1881 |
-
if (!reasoning || !Array.isArray(reasoning) || reasoning.length === 0) {
|
| 1882 |
-
return '';
|
| 1883 |
-
}
|
| 1884 |
-
|
| 1885 |
-
return `
|
| 1886 |
-
<div class="attribution-reasoning">
|
| 1887 |
-
${reasoning[0]}
|
| 1888 |
-
</div>
|
| 1889 |
-
`;
|
| 1890 |
-
}
|
| 1891 |
-
|
| 1892 |
// Helper function to create single-progress gauge section
|
| 1893 |
-
function createGaugeSection(aiProbability, humanProbability, mixedProbability, gaugeColor, gaugeDegree) {
|
| 1894 |
// Ensure these are numbers
|
| 1895 |
-
const
|
| 1896 |
-
const
|
| 1897 |
-
const
|
| 1898 |
|
| 1899 |
// Determine which probability is highest
|
| 1900 |
let maxValue, maxColor, maxLabel;
|
| 1901 |
|
| 1902 |
-
if (
|
| 1903 |
-
maxValue =
|
| 1904 |
maxColor = 'var(--danger)';
|
| 1905 |
-
maxLabel = '
|
| 1906 |
-
} else if (
|
| 1907 |
-
maxValue =
|
| 1908 |
maxColor = 'var(--success)';
|
| 1909 |
-
maxLabel = '
|
| 1910 |
} else {
|
| 1911 |
-
maxValue =
|
| 1912 |
maxColor = 'var(--primary)';
|
| 1913 |
-
maxLabel = '
|
| 1914 |
}
|
| 1915 |
|
| 1916 |
console.log('Selected:', { maxValue, maxLabel });
|
|
@@ -1936,16 +1806,16 @@ function createGaugeSection(aiProbability, humanProbability, mixedProbability, g
|
|
| 1936 |
</div>
|
| 1937 |
<div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 1rem; margin: 1.5rem 0;">
|
| 1938 |
<div style="text-align: center; padding: 1rem; background: rgba(239, 68, 68, 0.1); border-radius: 8px; border: 1px solid rgba(239, 68, 68, 0.3);">
|
| 1939 |
-
<div style="font-size: 0.85rem; color: var(--danger); margin-bottom: 0.25rem; font-weight: 600;">
|
| 1940 |
-
<div style="font-size: 1.4rem; font-weight: 700; color: var(--danger);">${
|
| 1941 |
</div>
|
| 1942 |
<div style="text-align: center; padding: 1rem; background: rgba(16, 185, 129, 0.1); border-radius: 8px; border: 1px solid rgba(16, 185, 129, 0.3);">
|
| 1943 |
-
<div style="font-size: 0.85rem; color: var(--success); margin-bottom: 0.25rem; font-weight: 600;">
|
| 1944 |
-
<div style="font-size: 1.4rem; font-weight: 700; color: var(--success);">${
|
| 1945 |
</div>
|
| 1946 |
<div style="text-align: center; padding: 1rem; background: rgba(6, 182, 212, 0.1); border-radius: 8px; border: 1px solid rgba(6, 182, 212, 0.3);">
|
| 1947 |
-
<div style="font-size: 0.85rem; color: var(--primary); margin-bottom: 0.25rem; font-weight: 600;">
|
| 1948 |
-
<div style="font-size: 1.4rem; font-weight: 700; color: var(--primary);">${
|
| 1949 |
</div>
|
| 1950 |
</div>
|
| 1951 |
<style>
|
|
@@ -1989,10 +1859,10 @@ function createGaugeSection(aiProbability, humanProbability, mixedProbability, g
|
|
| 1989 |
|
| 1990 |
|
| 1991 |
// Helper function to create info grid
|
| 1992 |
-
function createInfoGrid(verdict, confidence, confidenceClass, domain, mixedProbability) {
|
| 1993 |
-
const
|
| 1994 |
`<div style="margin-top: 0.5rem; font-size: 0.85rem; color: var(--primary);">
|
| 1995 |
-
🔀 ${
|
| 1996 |
</div>` : '';
|
| 1997 |
|
| 1998 |
return `
|
|
@@ -2000,7 +1870,7 @@ function createInfoGrid(verdict, confidence, confidenceClass, domain, mixedProba
|
|
| 2000 |
<div class="info-card">
|
| 2001 |
<div class="info-label">Verdict</div>
|
| 2002 |
<div class="info-value verdict-text">${verdict}</div>
|
| 2003 |
-
${
|
| 2004 |
</div>
|
| 2005 |
<div class="info-card">
|
| 2006 |
<div class="info-label">Confidence Level</div>
|
|
@@ -2040,6 +1910,21 @@ function createEnhancedReasoningHTML(ensemble, analysis, reasoning) {
|
|
| 2040 |
if (reasoning && reasoning.summary) {
|
| 2041 |
// Process the summary into bullet points
|
| 2042 |
const bulletPoints = formatSummaryAsBulletPoints(reasoning.summary, ensemble, analysis);
|
|
| 2043 |
|
| 2044 |
// Process key indicators with markdown formatting
|
| 2045 |
let processedIndicators = [];
|
|
@@ -2071,14 +1956,15 @@ function createEnhancedReasoningHTML(ensemble, analysis, reasoning) {
|
|
| 2071 |
<div class="reasoning-box enhanced">
|
| 2072 |
<div class="reasoning-header">
|
| 2073 |
<div class="reasoning-icon">💡</div>
|
| 2074 |
-
<div class="reasoning-title">
|
| 2075 |
<div class="confidence-tag ${ensemble.overall_confidence >= 0.7 ? 'high-confidence' : ensemble.overall_confidence >= 0.4 ? 'medium-confidence' : 'low-confidence'}">
|
| 2076 |
${ensemble.overall_confidence >= 0.7 ? 'High Confidence' : ensemble.overall_confidence >= 0.4 ? 'Medium Confidence' : 'Low Confidence'}
|
| 2077 |
</div>
|
| 2078 |
</div>
|
| 2079 |
<div class="verdict-summary">
|
| 2080 |
<div class="verdict-text">${ensemble.final_verdict}</div>
|
| 2081 |
-
|
|
|
|
| 2082 |
</div>
|
| 2083 |
<div class="reasoning-bullet-points">
|
| 2084 |
${bulletPoints}
|
|
@@ -2130,11 +2016,11 @@ function createEnhancedReasoningHTML(ensemble, analysis, reasoning) {
|
|
| 2130 |
// Fallback to basic reasoning if no reasoning data
|
| 2131 |
return `
|
| 2132 |
<div class="reasoning-box">
|
| 2133 |
-
<div class="reasoning-title">💡
|
| 2134 |
<p class="reasoning-text" style="text-align: left;">
|
| 2135 |
-
Analysis based on
|
| 2136 |
-
The system evaluated linguistic
|
| 2137 |
-
to
|
| 2138 |
</p>
|
| 2139 |
</div>
|
| 2140 |
`;
|
|
@@ -2170,15 +2056,15 @@ function formatSummaryAsBulletPoints(summary, ensemble, analysis) {
|
|
| 2170 |
// Add verdict as second bullet
|
| 2171 |
bulletPoints.push(`<div class="bullet-point">• ${ensemble.final_verdict}</div>`);
|
| 2172 |
|
| 2173 |
-
// Add
|
| 2174 |
-
bulletPoints.push(`<div class="bullet-point">•
|
| 2175 |
|
| 2176 |
// Add the main analysis sentences as individual bullets
|
| 2177 |
sentences.forEach(sentence => {
|
| 2178 |
if (sentence.trim() &&
|
| 2179 |
!sentence.includes('confidence') &&
|
| 2180 |
!sentence.includes(ensemble.final_verdict) &&
|
| 2181 |
-
!sentence.includes('
|
| 2182 |
// Clean up the sentence and add as bullet
|
| 2183 |
let cleanSentence = sentence.trim();
|
| 2184 |
if (!cleanSentence.endsWith('.')) {
|
|
@@ -2205,36 +2091,20 @@ function createDefaultLegend() {
|
|
| 2205 |
return `
|
| 2206 |
<div class="highlight-legend">
|
| 2207 |
<div class="legend-item">
|
| 2208 |
-
<div class="legend-color" style="background: #
|
| 2209 |
-
<div class="legend-label">
|
| 2210 |
-
</div>
|
| 2211 |
-
<div class="legend-item">
|
| 2212 |
-
<div class="legend-color" style="background: #fed7aa;"></div>
|
| 2213 |
-
<div class="legend-label">Likely AI (75-90%)</div>
|
| 2214 |
-
</div>
|
| 2215 |
-
<div class="legend-item">
|
| 2216 |
-
<div class="legend-color" style="background: #fde68a;"></div>
|
| 2217 |
-
<div class="legend-label">Possibly AI (60-75%)</div>
|
| 2218 |
</div>
|
| 2219 |
<div class="legend-item">
|
| 2220 |
<div class="legend-color" style="background: #fef9c3;"></div>
|
| 2221 |
-
<div class="legend-label">Uncertain
|
| 2222 |
-
</div>
|
| 2223 |
-
<div class="legend-item">
|
| 2224 |
-
<div class="legend-color" style="background: #86efac;"></div>
|
| 2225 |
-
<div class="legend-label">Possibly Human (25-40%)</div>
|
| 2226 |
</div>
|
| 2227 |
<div class="legend-item">
|
| 2228 |
-
<div class="legend-color" style="background: #
|
| 2229 |
-
<div class="legend-label">
|
| 2230 |
-
</div>
|
| 2231 |
-
<div class="legend-item">
|
| 2232 |
-
<div class="legend-color" style="background: #dcfce7;"></div>
|
| 2233 |
-
<div class="legend-label">Very Likely Human (0-10%)</div>
|
| 2234 |
</div>
|
| 2235 |
<div class="legend-item">
|
| 2236 |
<div class="legend-color" style="background: #e9d5ff;"></div>
|
| 2237 |
-
<div class="legend-label">
|
| 2238 |
</div>
|
| 2239 |
</div>
|
| 2240 |
`;
|
|
@@ -2259,15 +2129,15 @@ function getHighlightStyles() {
|
|
| 2259 |
z-index: 10;
|
| 2260 |
text-shadow: 0 1px 1px rgba(255,255,255,0.8);
|
| 2261 |
}
|
| 2262 |
-
#highlighted-report .very-high-
|
| 2263 |
background-color: #fee2e2 !important;
|
| 2264 |
border-bottom-color: #ef4444 !important;
|
| 2265 |
}
|
| 2266 |
-
#highlighted-report .high-
|
| 2267 |
background-color: #fed7aa !important;
|
| 2268 |
border-bottom-color: #f97316 !important;
|
| 2269 |
}
|
| 2270 |
-
#highlighted-report .medium-
|
| 2271 |
background-color: #fef3c7 !important;
|
| 2272 |
border-bottom-color: #f59e0b !important;
|
| 2273 |
}
|
|
@@ -2275,19 +2145,19 @@ function getHighlightStyles() {
|
|
| 2275 |
background-color: #fef9c3 !important;
|
| 2276 |
border-bottom-color: #fbbf24 !important;
|
| 2277 |
}
|
| 2278 |
-
#highlighted-report .medium-
|
| 2279 |
background-color: #ecfccb !important;
|
| 2280 |
border-bottom-color: #a3e635 !important;
|
| 2281 |
}
|
| 2282 |
-
#highlighted-report .high-
|
| 2283 |
background-color: #bbf7d0 !important;
|
| 2284 |
border-bottom-color: #4ade80 !important;
|
| 2285 |
}
|
| 2286 |
-
#highlighted-report .very-high-
|
| 2287 |
background-color: #dcfce7 !important;
|
| 2288 |
border-bottom-color: #22c55e !important;
|
| 2289 |
}
|
| 2290 |
-
#highlighted-report .
|
| 2291 |
background-color: #e9d5ff !important;
|
| 2292 |
border-bottom-color: #a855f7 !important;
|
| 2293 |
background-image: repeating-linear-gradient(45deg, transparent, transparent 5px, rgba(168, 85, 247, 0.1) 5px, rgba(168, 85, 247, 0.1) 10px) !important;
|
|
@@ -2319,27 +2189,27 @@ function displayMetricsCarousel(metrics, analysis, ensemble) {
|
|
| 2319 |
const metric = metrics[metricKey];
|
| 2320 |
if (!metric) return;
|
| 2321 |
|
| 2322 |
-
const
|
| 2323 |
-
const
|
| 2324 |
-
const
|
| 2325 |
const confidence = (metric.confidence * 100).toFixed(1);
|
| 2326 |
const weight = ensemble.metric_contributions && ensemble.metric_contributions[metricKey] ?
|
| 2327 |
(ensemble.metric_contributions[metricKey].weight * 100).toFixed(1) : '0.0';
|
| 2328 |
|
| 2329 |
// Determine verdict based on probabilities
|
| 2330 |
let verdictText, verdictClass;
|
| 2331 |
-
if (metric.
|
| 2332 |
-
verdictText = '
|
| 2333 |
-
verdictClass = 'verdict-
|
| 2334 |
-
} else if (metric.
|
| 2335 |
-
verdictText = '
|
| 2336 |
-
verdictClass = 'verdict-
|
| 2337 |
-
} else if (metric.
|
| 2338 |
-
verdictText = '
|
| 2339 |
verdictClass = 'verdict-uncertain';
|
| 2340 |
} else {
|
| 2341 |
-
verdictText = '
|
| 2342 |
-
verdictClass = 'verdict-
|
| 2343 |
}
|
| 2344 |
|
| 2345 |
carouselHTML += `
|
|
@@ -2352,28 +2222,28 @@ function displayMetricsCarousel(metrics, analysis, ensemble) {
|
|
| 2352 |
${getMetricDescription(metricKey)}
|
| 2353 |
</div>
|
| 2354 |
|
| 2355 |
-
<!--
|
| 2356 |
<div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 1rem; margin: 1rem 0;">
|
| 2357 |
<div style="text-align: center;">
|
| 2358 |
-
<div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 0.25rem;">
|
| 2359 |
<div style="background: rgba(51, 65, 85, 0.5); height: 8px; border-radius: 4px; overflow: hidden;">
|
| 2360 |
-
<div style="background: var(--danger); height: 100%; width: ${
|
| 2361 |
</div>
|
| 2362 |
-
<div style="font-size: 0.85rem; font-weight: 600; margin-top: 0.25rem;">${
|
| 2363 |
</div>
|
| 2364 |
<div style="text-align: center;">
|
| 2365 |
-
<div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 0.25rem;">
|
| 2366 |
<div style="background: rgba(51, 65, 85, 0.5); height: 8px; border-radius: 4px; overflow: hidden;">
|
| 2367 |
-
<div style="background: var(--success); height: 100%; width: ${
|
| 2368 |
</div>
|
| 2369 |
-
<div style="font-size: 0.85rem; font-weight: 600; margin-top: 0.25rem;">${
|
| 2370 |
</div>
|
| 2371 |
<div style="text-align: center;">
|
| 2372 |
-
<div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 0.25rem;">
|
| 2373 |
<div style="background: rgba(51, 65, 85, 0.5); height: 8px; border-radius: 4px; overflow: hidden;">
|
| 2374 |
-
<div style="background: var(--primary); height: 100%; width: ${
|
| 2375 |
</div>
|
| 2376 |
-
<div style="font-size: 0.85rem; font-weight: 600; margin-top: 0.25rem;">${
|
| 2377 |
</div>
|
| 2378 |
</div>
|
| 2379 |
|
|
@@ -2448,7 +2318,7 @@ function renderMetricDetails(metricName, details) {
|
|
| 2448 |
'entropy': ['token_diversity', 'sequence_unpredictability', 'char_entropy'],
|
| 2449 |
'semantic_analysis': ['coherence_score', 'consistency_score', 'repetition_score'],
|
| 2450 |
'linguistic': ['pos_diversity', 'syntactic_complexity', 'grammatical_consistency'],
|
| 2451 |
-
'multi_perturbation_stability': ['stability_score', 'curvature_score', 'likelihood_ratio', 'perturbation_variance', '
|
| 2452 |
};
|
| 2453 |
|
| 2454 |
const keysToShow = importantKeys[metricName] || Object.keys(details).slice(0, 6);
|
|
@@ -2533,7 +2403,7 @@ async function downloadReport(format) {
|
|
| 2533 |
const blob = new Blob([JSON.stringify(data, null, 2)], {
|
| 2534 |
type: 'application/json'
|
| 2535 |
});
|
| 2536 |
-
const filename = `
|
| 2537 |
await downloadBlob(blob, filename);
|
| 2538 |
return;
|
| 2539 |
}
|
|
@@ -2572,7 +2442,7 @@ async function downloadReport(format) {
|
|
| 2572 |
throw new Error('Failed to download file');
|
| 2573 |
}
|
| 2574 |
const blob = await downloadResponse.blob();
|
| 2575 |
-
const downloadFilename = `
|
| 2576 |
await downloadBlob(blob, downloadFilename);
|
| 2577 |
} else {
|
| 2578 |
alert('Report file not available');
|
|
|
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8">
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>TextAuth Forensics — Evidence-Based Text Authenticity Analysis</title>
|
| 7 |
<style>
|
| 8 |
* {
|
| 9 |
margin: 0;
|
|
|
|
| 731 |
min-width: 60px;
|
| 732 |
text-align: center;
|
| 733 |
}
|
| 734 |
+
.synthetic-badge {
|
| 735 |
background: rgba(239, 68, 68, 0.2);
|
| 736 |
color: var(--danger);
|
| 737 |
border: 1px solid rgba(239, 68, 68, 0.3);
|
| 738 |
}
|
| 739 |
+
.authentic-badge {
|
| 740 |
background: rgba(16, 185, 129, 0.2);
|
| 741 |
color: var(--success);
|
| 742 |
border: 1px solid rgba(16, 185, 129, 0.3);
|
|
|
|
| 763 |
font-size: 0.9rem;
|
| 764 |
font-weight: 600;
|
| 765 |
}
|
|
|
|
| 766 |
/* Download Actions */
|
| 767 |
.download-actions {
|
| 768 |
display: flex;
|
|
|
|
| 860 |
text-transform: uppercase;
|
| 861 |
margin-top: 0.5rem;
|
| 862 |
}
|
| 863 |
+
.verdict-synthetic {
|
| 864 |
background: rgba(239, 68, 68, 0.2);
|
| 865 |
color: var(--danger);
|
| 866 |
}
|
| 867 |
+
.verdict-authentic {
|
| 868 |
background: rgba(16, 185, 129, 0.2);
|
| 869 |
color: var(--success);
|
| 870 |
}
|
|
|
|
| 982 |
font-size: 1.1rem !important;
|
| 983 |
}
|
| 984 |
|
| 985 |
+
.verdict-hybrid {
|
| 986 |
background: rgba(168, 85, 247, 0.2);
|
| 987 |
color: #a855f7;
|
| 988 |
border: 1px solid rgba(168, 85, 247, 0.3);
|
|
|
|
| 1061 |
<div class="header">
|
| 1062 |
<a href="#" class="logo" onclick="showLanding(); return false;">
|
| 1063 |
<div class="logo-icon">🔍</div>
|
| 1064 |
+
<span>TextAuth Forensics</span>
|
| 1065 |
</a>
|
| 1066 |
<div class="nav-links">
|
| 1067 |
<a href="#features" class="nav-link">Features</a>
|
| 1068 |
+
<a href="#metrics" class="nav-link">Forensic Signals</a>
|
| 1069 |
<a href="#" class="nav-link" onclick="showAnalysis(); return false;">Try It Now</a>
|
| 1070 |
</div>
|
| 1071 |
</div>
|
|
|
|
| 1073 |
<div class="landing-page" id="landing-page">
|
| 1074 |
<!-- Hero Section -->
|
| 1075 |
<section class="hero">
|
| 1076 |
+
<h1 class="hero-title">Evidence-Based Text Forensics Platform</h1>
|
| 1077 |
+
<p class="hero-subtitle">Analyzing Content Authenticity Through Linguistic & Statistical Evidence</p>
|
| 1078 |
<p class="hero-description">
|
| 1079 |
+
A forensic analysis system that evaluates textual evidence using multiple statistical,
|
| 1080 |
+
linguistic, and semantic signals to assess content authenticity across education,
|
| 1081 |
+
publishing, hiring, and research domains.
|
| 1082 |
</p>
|
| 1083 |
<button class="try-btn" onclick="showAnalysis()"> Try It Now → </button>
|
| 1084 |
</section>
|
| 1085 |
<!-- Stats -->
|
| 1086 |
<div class="stats-grid">
|
| 1087 |
<div class="stat-card">
|
| 1088 |
+
<div class="stat-value">Low</div>
|
| 1089 |
+
<div class="stat-label">False-Positive Bias (Domain-Calibrated)</div>
|
| 1090 |
</div>
|
| 1091 |
<div class="stat-card">
|
| 1092 |
<div class="stat-value">6</div>
|
| 1093 |
+
<div class="stat-label">Total Forensic Signals</div>
|
| 1094 |
</div>
|
| 1095 |
<div class="stat-card">
|
| 1096 |
+
<div class="stat-value">10s</div>
|
| 1097 |
<div class="stat-label">Average Processing Time</div>
|
| 1098 |
</div>
|
| 1099 |
</div>
|
|
|
|
| 1106 |
<div class="features-grid">
|
| 1107 |
<div class="feature-card">
|
| 1108 |
<div class="feature-icon">🎯</div>
|
| 1109 |
+
<h3 class="feature-title">Domain-Aware Analysis</h3>
|
| 1110 |
<p class="feature-description">
|
| 1111 |
+
Calibrated thresholds for Academic, Technical, Creative, and Casual content types with specialized analysis algorithms for each domain.
|
| 1112 |
</p>
|
| 1113 |
</div>
|
| 1114 |
<div class="feature-card">
|
| 1115 |
<div class="feature-icon">🔬</div>
|
| 1116 |
+
<h3 class="feature-title">6-Signal Evidence Ensemble</h3>
|
| 1117 |
<p class="feature-description">
|
| 1118 |
+
Combines perplexity, entropy, structural, linguistic, semantic, and perturbation-stability signals to form a multi-angle forensic evidence profile.
|
| 1119 |
</p>
|
| 1120 |
</div>
|
| 1121 |
<div class="feature-card">
|
| 1122 |
<div class="feature-icon">💡</div>
|
| 1123 |
<h3 class="feature-title">Explainable Results</h3>
|
| 1124 |
<p class="feature-description">
|
| 1125 |
+
Sentence-level highlighting with confidence scores and detailed forensic reasoning for each assessment.
|
| 1126 |
</p>
|
| 1127 |
</div>
|
| 1128 |
<div class="feature-card">
|
|
|
|
| 1132 |
Analyze short texts in 1.2 seconds and medium documents in 3.5 seconds with parallel metric computation.
|
| 1133 |
</p>
|
| 1134 |
</div>
|
| 1135 |
<div class="feature-card">
|
| 1136 |
<div class="feature-icon">📄</div>
|
| 1137 |
<h3 class="feature-title">Multi-Format Support</h3>
|
|
|
|
| 1143 |
</section>
|
| 1144 |
<!-- Metrics Section -->
|
| 1145 |
<section class="metrics-info" id="metrics">
|
| 1146 |
+
<h2 class="section-title">Forensic Signals Explained</h2>
|
| 1147 |
<p class="section-subtitle">
|
| 1148 |
+
Understanding the science behind the forensic evaluation
|
| 1149 |
</p>
|
| 1150 |
<div class="metric-card">
|
| 1151 |
<div class="metric-icon-box">📊</div>
|
| 1152 |
<div class="metric-content">
|
| 1153 |
<h3>Perplexity <span class="metric-weight">Weight: 25%</span></h3>
|
| 1154 |
+
<p>Measures how predictable the text is using a reference language model. Model-generated or algorithmically assisted text typically exhibits lower perplexity (i.e., it is more predictable) than human writing, which tends to be more varied and surprising.</p>
|
| 1155 |
</div>
|
| 1156 |
</div>
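To make this signal concrete, here is a minimal perplexity sketch; GPT-2 via Hugging Face transformers is an illustrative assumption, not necessarily the reference model this platform actually loads.

```python
# Minimal perplexity sketch. GPT-2 as the reference model is an
# assumption for illustration; the platform's metric may differ.
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def perplexity(text: str) -> float:
    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    with torch.no_grad():
        # Passing labels=input_ids makes the model return mean token cross-entropy
        loss = model(**enc, labels=enc["input_ids"]).loss
    # exp(mean cross-entropy) = perplexity; lower values = more predictable text
    return float(torch.exp(loss))
```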
|
| 1157 |
<div class="metric-card">
|
| 1158 |
<div class="metric-icon-box">🎲</div>
|
| 1159 |
<div class="metric-content">
|
| 1160 |
<h3>Entropy <span class="metric-weight">Weight: 20%</span></h3>
|
| 1161 |
+
<p>Calculates token-level diversity and unpredictability in text sequences. Human writing shows higher entropy with more varied word choices, while algorithmically generated text tends toward more uniform token distributions.</p>
|
| 1162 |
</div>
|
| 1163 |
</div>
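A minimal sketch of the entropy idea follows; the whitespace tokenization is an illustrative shortcut, since the actual metric presumably uses a real tokenizer.

```python
# Token-level Shannon entropy over the empirical token distribution.
# Naive whitespace tokens are used purely for illustration.
from collections import Counter
import math

def token_entropy(text: str) -> float:
    tokens = text.lower().split()
    if not tokens:
        return 0.0
    total = len(tokens)
    # H = -sum(p * log2 p); higher entropy = more varied word choice
    return -sum((c / total) * math.log2(c / total)
                for c in Counter(tokens).values())
```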
|
| 1164 |
<div class="metric-card">
|
| 1165 |
<div class="metric-icon-box">📈</div>
|
| 1166 |
<div class="metric-content">
|
| 1167 |
<h3>Structural Analysis <span class="metric-weight">Weight: 15%</span></h3>
|
| 1168 |
+
<p>Analyzes sentence length variance, punctuation patterns, and lexical burstiness. Human writing exhibits more variation in sentence structure and rhythm compared to algorithmically generated text, which often shows more uniform patterns.</p>
|
| 1169 |
</div>
|
| 1170 |
</div>
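One structural signal, sentence-length burstiness, can be sketched as the coefficient of variation of sentence lengths; the regex sentence splitter below is an illustrative simplification.

```python
# Burstiness sketch: spread of sentence lengths relative to their mean.
import re
import statistics

def sentence_length_burstiness(text: str) -> float:
    sentences = [s for s in re.split(r"[.!?]+\s*", text) if s.strip()]
    lengths = [len(s.split()) for s in sentences]
    if len(lengths) < 2:
        return 0.0
    # Human prose typically shows a higher spread relative to the mean
    return statistics.stdev(lengths) / statistics.mean(lengths)
```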
|
| 1171 |
<div class="metric-card">
|
|
|
|
| 1179 |
<div class="metric-icon-box">🧠</div>
|
| 1180 |
<div class="metric-content">
|
| 1181 |
<h3>Semantic Analysis <span class="metric-weight">Weight: 15%</span></h3>
|
| 1182 |
+
<p>Assesses semantic coherence, repetition patterns, and contextual consistency, surfacing consistency patterns that often differ between human-authored and algorithmically generated text.</p>
|
| 1183 |
</div>
|
| 1184 |
</div>
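A minimal coherence sketch under an assumed sentence-transformers setup (both the library and the model name are illustrative, not confirmed by this commit):

```python
# Coherence sketch: mean cosine similarity between adjacent sentence
# embeddings. Model choice here is an assumption for illustration.
import numpy as np
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")

def adjacent_coherence(sentences: list[str]) -> float:
    if len(sentences) < 2:
        return 0.0
    emb = encoder.encode(sentences, normalize_embeddings=True)
    # With unit-normalized vectors, the dot product equals cosine similarity
    sims = [float(np.dot(emb[i], emb[i + 1])) for i in range(len(emb) - 1)]
    return float(np.mean(sims))
```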
|
| 1185 |
<div class="metric-card">
|
| 1186 |
<div class="metric-icon-box">🔍</div>
|
| 1187 |
<div class="metric-content">
|
| 1188 |
<h3>Multi-Perturbation Stability <span class="metric-weight">Weight: 10%</span></h3>
|
| 1189 |
+
<p>Tests text stability under random perturbations. Algorithmically generated text tends to maintain higher likelihood scores even when slightly modified, while human text shows more variation.</p>
|
| 1190 |
</div>
|
| 1191 |
</div>
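A DetectGPT-style sketch of this idea, with `log_likelihood` and `perturb` as hypothetical helpers standing in for the platform's internals:

```python
# Stability sketch: compare the original text's log-likelihood with the
# mean log-likelihood of lightly perturbed variants.
def stability_score(text: str, log_likelihood, perturb, n: int = 10) -> float:
    base = log_likelihood(text)
    mean_perturbed = sum(log_likelihood(perturb(text)) for _ in range(n)) / n
    # Synthetic text tends to sit near a local likelihood maximum, so its
    # likelihood drops more consistently under perturbation than human text
    return base - mean_perturbed
```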
|
| 1192 |
</section>
|
| 1193 |
<!-- Footer -->
|
| 1194 |
<footer class="footer">
|
| 1195 |
+
<p>© 2025 Evidence-Based Text Forensics Platform</p>
|
| 1196 |
+
<p style="margin-top: 1rem;">Evidence-first text forensics with explainable decisions.</p>
|
| 1197 |
</footer>
|
| 1198 |
</div>
|
| 1199 |
<!-- Analysis Interface -->
|
|
|
|
| 1216 |
id="text-input"
|
| 1217 |
class="text-input"
|
| 1218 |
placeholder="Paste your text here for analysis...
|
| 1219 |
+
The more text you provide (minimum 50 characters), the more reliable the forensic evaluation will be."
|
| 1220 |
></textarea>
|
| 1221 |
</div>
|
| 1222 |
<div id="upload-tab" class="tab-content">
|
|
|
|
| 1246 |
<option value="social_media">Social Media</option>
|
| 1247 |
</select>
|
| 1248 |
</div>
|
|
|
|
|
| 1249 |
<div class="option-row">
|
| 1250 |
<label class="option-label">Enable Sentence Highlighting:</label>
|
| 1251 |
<div class="checkbox-wrapper">
|
|
|
|
| 1305 |
<div class="empty-icon">✓</div>
|
| 1306 |
<h3 class="empty-title">Ready for Analysis</h3>
|
| 1307 |
<p class="empty-description">
|
| 1308 |
+
Paste text or upload a document to begin evidence-based forensic analysis.
|
| 1309 |
+
Our multi-signal ensemble will provide detailed, explainable insights.
|
| 1310 |
</p>
|
| 1311 |
</div>
|
| 1312 |
</div>
|
|
|
|
| 1368 |
document.getElementById('paste-tab').classList.add('active');
|
| 1369 |
// Reset options to defaults
|
| 1370 |
document.getElementById('domain-select').value = '';
|
|
|
|
| 1371 |
document.getElementById('enable-highlighting').checked = true;
|
| 1372 |
document.getElementById('use-sentence-level').checked = true;
|
| 1373 |
document.getElementById('include-metrics-summary').checked = true;
|
|
|
|
| 1382 |
<div class="empty-icon">✓</div>
|
| 1383 |
<h3 class="empty-title">Ready for Analysis</h3>
|
| 1384 |
<p class="empty-description">
|
| 1385 |
+
Paste text or upload a document to begin evidence-based forensic analysis.
|
| 1386 |
+
Our multi-signal ensemble will provide detailed, explainable insights.
|
| 1387 |
</p>
|
| 1388 |
</div>
|
| 1389 |
`;
|
|
|
|
| 1551 |
|
| 1552 |
async function analyzeText(text) {
|
| 1553 |
const domain = document.getElementById('domain-select').value || null;
|
|
|
|
| 1554 |
const enableHighlighting = document.getElementById('enable-highlighting').checked;
|
| 1555 |
const useSentenceLevel = document.getElementById('use-sentence-level').checked;
|
| 1556 |
const includeMetricsSummary = document.getElementById('include-metrics-summary').checked;
|
|
|
|
| 1561 |
body: JSON.stringify({
|
| 1562 |
text: text,
|
| 1563 |
domain: domain,
|
|
|
|
| 1564 |
enable_highlighting: enableHighlighting,
|
| 1565 |
use_sentence_level: useSentenceLevel,
|
| 1566 |
include_metrics_summary: includeMetricsSummary,
|
|
|
|
| 1577 |
|
| 1578 |
async function analyzeFile(file) {
|
| 1579 |
const domain = document.getElementById('domain-select').value || null;
|
|
|
|
| 1580 |
const useSentenceLevel = document.getElementById('use-sentence-level').checked;
|
| 1581 |
const includeMetricsSummary = document.getElementById('include-metrics-summary').checked;
|
| 1582 |
|
| 1583 |
const formData = new FormData();
|
| 1584 |
formData.append('file', file);
|
| 1585 |
if (domain) formData.append('domain', domain);
|
|
|
|
| 1586 |
formData.append('use_sentence_level', useSentenceLevel.toString());
|
| 1587 |
formData.append('include_metrics_summary', includeMetricsSummary.toString());
|
| 1588 |
formData.append('skip_expensive_metrics', 'false');
|
|
|
|
| 1603 |
document.getElementById('summary-report').innerHTML = `
|
| 1604 |
<div class="loading">
|
| 1605 |
<div class="spinner"></div>
|
| 1606 |
+
<p style="color: var(--text-secondary);">Analyzing content using multi-signal forensic evaluation...</p>
|
| 1607 |
<p style="color: var(--text-muted); font-size: 0.9rem; margin-top: 0.5rem;">
|
| 1608 |
This may take a few seconds
|
| 1609 |
</p>
|
|
|
|
| 1638 |
const analysis = detection.analysis || {};
|
| 1639 |
|
| 1640 |
// Display Summary with enhanced reasoning
|
| 1641 |
+
displaySummary(ensemble, prediction, analysis, data.reasoning);
|
| 1642 |
|
| 1643 |
// Display Highlighted Text with enhanced features
|
| 1644 |
if (data.highlighted_html) {
|
|
|
|
| 1663 |
}
|
| 1664 |
}
|
| 1665 |
|
| 1666 |
+
function displaySummary(ensemble, prediction, analysis, reasoning) {
|
| 1667 |
// Extract and validate data with fallbacks
|
| 1668 |
const {
|
| 1669 |
+
syntheticProbability,
|
| 1670 |
+
authenticProbability,
|
| 1671 |
+
hybridProbability,
|
| 1672 |
verdict,
|
| 1673 |
confidence,
|
| 1674 |
domain,
|
| 1675 |
+
isSynthetic,
|
| 1676 |
gaugeColor,
|
| 1677 |
gaugeDegree,
|
| 1678 |
confidenceLevel,
|
| 1679 |
confidenceClass
|
| 1680 |
} = extractSummaryData(ensemble, analysis);
|
| 1681 |
|
|
|
|
|
|
|
|
|
|
| 1682 |
document.getElementById('summary-report').innerHTML = `
|
| 1683 |
<div class="result-summary">
|
| 1684 |
+
${createGaugeSection(syntheticProbability, authenticProbability, hybridProbability, gaugeColor, gaugeDegree)}
|
| 1685 |
+
${createInfoGrid(verdict, confidence, confidenceClass, domain, hybridProbability)}
|
| 1686 |
${createEnhancedReasoningHTML(ensemble, analysis, reasoning)}
|
|
|
|
| 1687 |
${createDownloadActions()}
|
| 1688 |
</div>
|
| 1689 |
`;
|
|
|
|
| 1691 |
|
| 1692 |
// Helper function to extract and validate summary data
|
| 1693 |
function extractSummaryData(ensemble, analysis) {
|
| 1694 |
+
const syntheticProbability = ensemble.synthetic_probability !== undefined
|
| 1695 |
+
? (ensemble.synthetic_probability * 100).toFixed(0)
|
| 1696 |
+
: '0';
|
| 1697 |
+
|
| 1698 |
+
const authenticProbability = ensemble.authentic_probability !== undefined
|
| 1699 |
+
? (ensemble.authentic_probability * 100).toFixed(0)
|
| 1700 |
+
: '0';
|
| 1701 |
+
|
| 1702 |
+
const hybridProbability = ensemble.hybrid_probability !== undefined
|
| 1703 |
+
? (ensemble.hybrid_probability * 100).toFixed(0)
|
| 1704 |
+
: '0';
|
| 1705 |
+
|
| 1706 |
const verdict = ensemble.final_verdict || 'Unknown';
|
| 1707 |
+
|
| 1708 |
+
const confidence = ensemble.overall_confidence !== undefined
|
| 1709 |
+
? (ensemble.overall_confidence * 100).toFixed(1)
|
| 1710 |
+
: '0';
|
| 1711 |
+
|
| 1712 |
const domain = analysis.domain || 'general';
|
| 1713 |
+
|
| 1714 |
+
const isSynthetic = verdict.toLowerCase().includes('synthetic');
|
| 1715 |
+
|
| 1716 |
+
const gaugeColor = isSynthetic
|
| 1717 |
+
? 'var(--danger)'
|
| 1718 |
+
: 'var(--success)';
|
| 1719 |
+
|
| 1720 |
+
const gaugeDegree = parseFloat(syntheticProbability) * 3.6;
|
| 1721 |
+
|
| 1722 |
const confidenceLevel = getConfidenceLevel(parseFloat(confidence));
|
| 1723 |
const confidenceClass = getConfidenceClass(confidenceLevel);
|
| 1724 |
|
| 1725 |
return {
|
| 1726 |
+
syntheticProbability,
|
| 1727 |
+
authenticProbability,
|
| 1728 |
+
hybridProbability,
|
| 1729 |
verdict,
|
| 1730 |
confidence,
|
| 1731 |
domain,
|
| 1732 |
+
isSynthetic,
|
| 1733 |
gaugeColor,
|
| 1734 |
gaugeDegree,
|
| 1735 |
confidenceLevel,
|
|
|
|
| 1754 |
return classMap[confidenceLevel] || 'confidence-low';
|
| 1755 |
}
|
| 1756 |
|
|
|
|
|
| 1757 |
// Helper function to format model names
|
| 1758 |
function formatModelName(modelName) {
|
| 1759 |
return modelName.replace(/_/g, ' ').replace(/-/g, ' ').toUpperCase();
|
| 1760 |
}
|
| 1761 |
|
|
|
|
| 1762 |
// Helper function to create single-progress gauge section
|
| 1763 |
+
function createGaugeSection(syntheticProbability, authenticProbability, hybridProbability, gaugeColor, gaugeDegree) {
|
| 1764 |
// Ensure these are numbers
|
| 1765 |
+
const synthetic = parseFloat(syntheticProbability);
|
| 1766 |
+
const authentic = parseFloat(authenticProbability);
|
| 1767 |
+
const hybrid = parseFloat(hybridProbability);
|
| 1768 |
|
| 1769 |
// Determine which probability is highest
|
| 1770 |
let maxValue, maxColor, maxLabel;
|
| 1771 |
|
| 1772 |
+
if (synthetic >= authentic && synthetic >= hybrid) {
|
| 1773 |
+
maxValue = synthetic;
|
| 1774 |
maxColor = 'var(--danger)';
|
| 1775 |
+
maxLabel = 'Synthetic Probability';
|
| 1776 |
+
} else if (authentic >= synthetic && authentic >= hybrid) {
|
| 1777 |
+
maxValue = authentic;
|
| 1778 |
maxColor = 'var(--success)';
|
| 1779 |
+
maxLabel = 'Authentic Probability';
|
| 1780 |
} else {
|
| 1781 |
+
maxValue = hybrid;
|
| 1782 |
maxColor = 'var(--primary)';
|
| 1783 |
+
maxLabel = 'Hybrid Probability';
|
| 1784 |
}
|
| 1785 |
|
| 1786 |
console.log('Selected:', { maxValue, maxLabel });
|
|
|
|
| 1806 |
</div>
|
| 1807 |
<div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 1rem; margin: 1.5rem 0;">
|
| 1808 |
<div style="text-align: center; padding: 1rem; background: rgba(239, 68, 68, 0.1); border-radius: 8px; border: 1px solid rgba(239, 68, 68, 0.3);">
|
| 1809 |
+
<div style="font-size: 0.85rem; color: var(--danger); margin-bottom: 0.25rem; font-weight: 600;">Synthetic</div>
|
| 1810 |
+
<div style="font-size: 1.4rem; font-weight: 700; color: var(--danger);">${syntheticProbability}%</div>
|
| 1811 |
</div>
|
| 1812 |
<div style="text-align: center; padding: 1rem; background: rgba(16, 185, 129, 0.1); border-radius: 8px; border: 1px solid rgba(16, 185, 129, 0.3);">
|
| 1813 |
+
<div style="font-size: 0.85rem; color: var(--success); margin-bottom: 0.25rem; font-weight: 600;">Authentic</div>
|
| 1814 |
+
<div style="font-size: 1.4rem; font-weight: 700; color: var(--success);">${authenticProbability}%</div>
|
| 1815 |
</div>
|
| 1816 |
<div style="text-align: center; padding: 1rem; background: rgba(6, 182, 212, 0.1); border-radius: 8px; border: 1px solid rgba(6, 182, 212, 0.3);">
|
| 1817 |
+
<div style="font-size: 0.85rem; color: var(--primary); margin-bottom: 0.25rem; font-weight: 600;">Hybrid</div>
|
| 1818 |
+
<div style="font-size: 1.4rem; font-weight: 700; color: var(--primary);">${hybridProbability}%</div>
|
| 1819 |
</div>
|
| 1820 |
</div>
|
| 1821 |
<style>
|
|
|
|
| 1859 |
|
| 1860 |
|
| 1861 |
// Helper function to create info grid
|
| 1862 |
+
function createInfoGrid(verdict, confidence, confidenceClass, domain, hybridProbability) {
|
| 1863 |
+
const hybridContentInfo = hybridProbability > 10 ?
|
| 1864 |
`<div style="margin-top: 0.5rem; font-size: 0.85rem; color: var(--primary);">
|
| 1865 |
+
🔀 ${hybridProbability}% Hybrid Content Detected
|
| 1866 |
</div>` : '';
|
| 1867 |
|
| 1868 |
return `
|
|
|
|
| 1870 |
<div class="info-card">
|
| 1871 |
<div class="info-label">Verdict</div>
|
| 1872 |
<div class="info-value verdict-text">${verdict}</div>
|
| 1873 |
+
${hybridContentInfo}
|
| 1874 |
</div>
|
| 1875 |
<div class="info-card">
|
| 1876 |
<div class="info-label">Confidence Level</div>
|
|
|
|
| 1910 |
if (reasoning && reasoning.summary) {
|
| 1911 |
// Process the summary into bullet points
|
| 1912 |
const bulletPoints = formatSummaryAsBulletPoints(reasoning.summary, ensemble, analysis);
|
| 1913 |
+
|
| 1914 |
+
const dominantLabel =
|
| 1915 |
+
ensemble.hybrid_probability > ensemble.synthetic_probability &&
|
| 1916 |
+
ensemble.hybrid_probability > ensemble.authentic_probability
|
| 1917 |
+
? 'Hybrid Probability'
|
| 1918 |
+
: ensemble.synthetic_probability > ensemble.authentic_probability
|
| 1919 |
+
? 'Synthetic Probability'
|
| 1920 |
+
: 'Authentic Probability';
|
| 1921 |
+
|
| 1922 |
+
const dominantValue =
|
| 1923 |
+
Math.max(
|
| 1924 |
+
ensemble.synthetic_probability,
|
| 1925 |
+
ensemble.authentic_probability,
|
| 1926 |
+
ensemble.hybrid_probability
|
| 1927 |
+
);
|
| 1928 |
|
| 1929 |
// Process key indicators with markdown formatting
|
| 1930 |
let processedIndicators = [];
|
|
|
|
| 1956 |
<div class="reasoning-box enhanced">
|
| 1957 |
<div class="reasoning-header">
|
| 1958 |
<div class="reasoning-icon">💡</div>
|
| 1959 |
+
<div class="reasoning-title">Forensic Reasoning</div>
|
| 1960 |
<div class="confidence-tag ${ensemble.overall_confidence >= 0.7 ? 'high-confidence' : ensemble.overall_confidence >= 0.4 ? 'medium-confidence' : 'low-confidence'}">
|
| 1961 |
${ensemble.overall_confidence >= 0.7 ? 'High Confidence' : ensemble.overall_confidence >= 0.4 ? 'Medium Confidence' : 'Low Confidence'}
|
| 1962 |
</div>
|
| 1963 |
</div>
|
| 1964 |
<div class="verdict-summary">
|
| 1965 |
<div class="verdict-text">${ensemble.final_verdict}</div>
|
| 1966 |
+
${dominantLabel}:
|
| 1967 |
+
<span class="probability-value">${(dominantValue * 100).toFixed(2)}%</span>
|
| 1968 |
</div>
|
| 1969 |
<div class="reasoning-bullet-points">
|
| 1970 |
${bulletPoints}
|
|
|
|
| 2016 |
// Fallback to basic reasoning if no reasoning data
|
| 2017 |
return `
|
| 2018 |
<div class="reasoning-box">
|
| 2019 |
+
<div class="reasoning-title">💡 Forensic Reasoning</div>
|
| 2020 |
<p class="reasoning-text" style="text-align: left;">
|
| 2021 |
+
Analysis based on a multi-signal forensic ensemble with domain-aware calibration.
|
| 2022 |
+
The system evaluated linguistic, statistical, and semantic evidence patterns
|
| 2023 |
+
to assess content authenticity with ${(ensemble.overall_confidence * 100).toFixed(1)}% confidence.
|
| 2024 |
</p>
|
| 2025 |
</div>
|
| 2026 |
`;
|
|
|
|
| 2056 |
// Add verdict as second bullet
|
| 2057 |
bulletPoints.push(`<div class="bullet-point">• ${ensemble.final_verdict}</div>`);
|
| 2058 |
|
| 2059 |
+
// Add Synthetic probability as third bullet
|
| 2060 |
+
bulletPoints.push(`<div class="bullet-point">• Synthetic Probability: ${(ensemble.synthetic_probability * 100).toFixed(2)}%</div>`);
|
| 2061 |
|
| 2062 |
// Add the main analysis sentences as individual bullets
|
| 2063 |
sentences.forEach(sentence => {
|
| 2064 |
if (sentence.trim() &&
|
| 2065 |
!sentence.includes('confidence') &&
|
| 2066 |
!sentence.includes(ensemble.final_verdict) &&
|
| 2067 |
+
!sentence.includes('Synthetic Probability')) {
|
| 2068 |
// Clean up the sentence and add as bullet
|
| 2069 |
let cleanSentence = sentence.trim();
|
| 2070 |
if (!cleanSentence.endsWith('.')) {
|
|
|
|
| 2091 |
return `
|
| 2092 |
<div class="highlight-legend">
|
| 2093 |
<div class="legend-item">
|
| 2094 |
+
<div class="legend-color" style="background: #dcfce7;"></div>
|
| 2095 |
+
<div class="legend-label">Authentic</div>
|
|
| 2096 |
</div>
|
| 2097 |
<div class="legend-item">
|
| 2098 |
<div class="legend-color" style="background: #fef9c3;"></div>
|
| 2099 |
+
<div class="legend-label">Uncertain</div>
|
|
| 2100 |
</div>
|
| 2101 |
<div class="legend-item">
|
| 2102 |
+
<div class="legend-color" style="background: #fee2e2;"></div>
|
| 2103 |
+
<div class="legend-label">Synthetic</div>
|
|
| 2104 |
</div>
|
| 2105 |
<div class="legend-item">
|
| 2106 |
<div class="legend-color" style="background: #e9d5ff;"></div>
|
| 2107 |
+
<div class="legend-label">Hybrid</div>
|
| 2108 |
</div>
|
| 2109 |
</div>
|
| 2110 |
`;
|
|
|
|
| 2129 |
z-index: 10;
|
| 2130 |
text-shadow: 0 1px 1px rgba(255,255,255,0.8);
|
| 2131 |
}
|
| 2132 |
+
#highlighted-report .very-high-synthetic {
|
| 2133 |
background-color: #fee2e2 !important;
|
| 2134 |
border-bottom-color: #ef4444 !important;
|
| 2135 |
}
|
| 2136 |
+
#highlighted-report .high-synthetic {
|
| 2137 |
background-color: #fed7aa !important;
|
| 2138 |
border-bottom-color: #f97316 !important;
|
| 2139 |
}
|
| 2140 |
+
#highlighted-report .medium-synthetic {
|
| 2141 |
background-color: #fef3c7 !important;
|
| 2142 |
border-bottom-color: #f59e0b !important;
|
| 2143 |
}
|
|
|
|
| 2145 |
background-color: #fef9c3 !important;
|
| 2146 |
border-bottom-color: #fbbf24 !important;
|
| 2147 |
}
|
| 2148 |
+
#highlighted-report .medium-authentic {
|
| 2149 |
background-color: #ecfccb !important;
|
| 2150 |
border-bottom-color: #a3e635 !important;
|
| 2151 |
}
|
| 2152 |
+
#highlighted-report .high-authentic {
|
| 2153 |
background-color: #bbf7d0 !important;
|
| 2154 |
border-bottom-color: #4ade80 !important;
|
| 2155 |
}
|
| 2156 |
+
#highlighted-report .very-high-authentic {
|
| 2157 |
background-color: #dcfce7 !important;
|
| 2158 |
border-bottom-color: #22c55e !important;
|
| 2159 |
}
|
| 2160 |
+
#highlighted-report .hybrid-content {
|
| 2161 |
background-color: #e9d5ff !important;
|
| 2162 |
border-bottom-color: #a855f7 !important;
|
| 2163 |
background-image: repeating-linear-gradient(45deg, transparent, transparent 5px, rgba(168, 85, 247, 0.1) 5px, rgba(168, 85, 247, 0.1) 10px) !important;
|
|
|
|
| 2189 |
const metric = metrics[metricKey];
|
| 2190 |
if (!metric) return;
|
| 2191 |
|
| 2192 |
+
const syntheticProb = (metric.synthetic_probability * 100).toFixed(1);
|
| 2193 |
+
const authenticProb = (metric.authentic_probability * 100).toFixed(1);
|
| 2194 |
+
const hybridProb = (metric.hybrid_probability * 100).toFixed(1);
|
| 2195 |
const confidence = (metric.confidence * 100).toFixed(1);
|
| 2196 |
const weight = ensemble.metric_contributions && ensemble.metric_contributions[metricKey] ?
|
| 2197 |
(ensemble.metric_contributions[metricKey].weight * 100).toFixed(1) : '0.0';
|
| 2198 |
|
| 2199 |
// Determine verdict based on probabilities
|
| 2200 |
let verdictText, verdictClass;
|
| 2201 |
+
if (metric.hybrid_probability > 0.3) {
|
| 2202 |
+
verdictText = 'Hybrid';
|
| 2203 |
+
verdictClass = 'verdict-hybrid';
|
| 2204 |
+
} else if (metric.synthetic_probability >= 0.6) {
|
| 2205 |
+
verdictText = 'Synthetic';
|
| 2206 |
+
verdictClass = 'verdict-synthetic';
|
| 2207 |
+
} else if (metric.synthetic_probability >= 0.4) {
|
| 2208 |
+
verdictText = 'Uncertain';
|
| 2209 |
verdictClass = 'verdict-uncertain';
|
| 2210 |
} else {
|
| 2211 |
+
verdictText = 'Authentic';
|
| 2212 |
+
verdictClass = 'verdict-authentic';
|
| 2213 |
}
|
| 2214 |
|
| 2215 |
carouselHTML += `
|
|
|
|
| 2222 |
${getMetricDescription(metricKey)}
|
| 2223 |
</div>
|
| 2224 |
|
| 2225 |
+
<!-- Probability Display with Hybrid -->
|
| 2226 |
<div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 1rem; margin: 1rem 0;">
|
| 2227 |
<div style="text-align: center;">
|
| 2228 |
+
<div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 0.25rem;">Synthetic</div>
|
| 2229 |
<div style="background: rgba(51, 65, 85, 0.5); height: 8px; border-radius: 4px; overflow: hidden;">
|
| 2230 |
+
<div style="background: var(--danger); height: 100%; width: ${syntheticProb}%; transition: width 0.5s;"></div>
|
| 2231 |
</div>
|
| 2232 |
+
<div style="font-size: 0.85rem; font-weight: 600; margin-top: 0.25rem;">${syntheticProb}%</div>
|
| 2233 |
</div>
|
| 2234 |
<div style="text-align: center;">
|
| 2235 |
+
<div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 0.25rem;">Authentic</div>
|
| 2236 |
<div style="background: rgba(51, 65, 85, 0.5); height: 8px; border-radius: 4px; overflow: hidden;">
|
| 2237 |
+
<div style="background: var(--success); height: 100%; width: ${authenticProb}%; transition: width 0.5s;"></div>
|
| 2238 |
</div>
|
| 2239 |
+
<div style="font-size: 0.85rem; font-weight: 600; margin-top: 0.25rem;">${authenticProb}%</div>
|
| 2240 |
</div>
|
| 2241 |
<div style="text-align: center;">
|
| 2242 |
+
<div style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 0.25rem;">Hybrid</div>
|
| 2243 |
<div style="background: rgba(51, 65, 85, 0.5); height: 8px; border-radius: 4px; overflow: hidden;">
|
| 2244 |
+
<div style="background: var(--primary); height: 100%; width: ${hybridProb}%; transition: width 0.5s;"></div>
|
| 2245 |
</div>
|
| 2246 |
+
<div style="font-size: 0.85rem; font-weight: 600; margin-top: 0.25rem;">${hybridProb}%</div>
|
| 2247 |
</div>
|
| 2248 |
</div>
|
| 2249 |
|
|
|
|
| 2318 |
'entropy': ['token_diversity', 'sequence_unpredictability', 'char_entropy'],
|
| 2319 |
'semantic_analysis': ['coherence_score', 'consistency_score', 'repetition_score'],
|
| 2320 |
'linguistic': ['pos_diversity', 'syntactic_complexity', 'grammatical_consistency'],
|
| 2321 |
+
'multi_perturbation_stability': ['stability_score', 'curvature_score', 'likelihood_ratio', 'perturbation_variance', 'hybrid_probability']
|
| 2322 |
};
|
| 2323 |
|
| 2324 |
const keysToShow = importantKeys[metricName] || Object.keys(details).slice(0, 6);
|
|
|
|
| 2403 |
const blob = new Blob([JSON.stringify(data, null, 2)], {
|
| 2404 |
type: 'application/json'
|
| 2405 |
});
|
| 2406 |
+
const filename = `text-forensics-report-${analysisId}-${timestamp}.json`;
|
| 2407 |
await downloadBlob(blob, filename);
|
| 2408 |
return;
|
| 2409 |
}
|
|
|
|
| 2442 |
throw new Error('Failed to download file');
|
| 2443 |
}
|
| 2444 |
const blob = await downloadResponse.blob();
|
| 2445 |
+
const downloadFilename = `text-forensics-${format}-report-${analysisId}-${timestamp}.${format}`;
|
| 2446 |
await downloadBlob(blob, downloadFilename);
|
| 2447 |
} else {
|
| 2448 |
alert('Report file not available');
|
utils/logger.py
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
# DEPENDENCIES
|
| 2 |
-
import os
|
| 3 |
import sys
|
| 4 |
-
import json
|
| 5 |
-
import time
|
| 6 |
import logging
|
| 7 |
from typing import Any
|
| 8 |
from typing import Dict
|
|
@@ -31,52 +28,16 @@ class InterceptHandler(logging.Handler):
|
|
| 31 |
# Find caller from where originated the logged message
|
| 32 |
frame, depth = logging.currentframe(), 2
|
| 33 |
while (frame.f_code.co_filename == logging.__file__):
|
| 34 |
-
|
| 35 |
-
|
| 36 |
|
| 37 |
-
logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
class JSONFormatter:
|
| 41 |
-
"""
|
| 42 |
-
JSON formatter for structured logging
|
| 43 |
-
"""
|
| 44 |
-
def __init__(self):
|
| 45 |
-
self.pid = os.getpid()
|
| 46 |
|
| 47 |
|
| 48 |
-
def format(self, record: Dict[str, Any]) -> str:
|
| 49 |
-
"""
|
| 50 |
-
Format log record as JSON
|
| 51 |
-
"""
|
| 52 |
-
# Create structured log entry
|
| 53 |
-
log_entry = {"timestamp" : datetime.fromtimestamp(record["time"].timestamp()).isoformat(),
|
| 54 |
-
"level" : record["level"].name,
|
| 55 |
-
"message" : record["message"],
|
| 56 |
-
"module" : record["name"],
|
| 57 |
-
"function" : record["function"],
|
| 58 |
-
"line" : record["line"],
|
| 59 |
-
"process_id" : self.pid,
|
| 60 |
-
"thread_id" : record["thread"].id if record.get("thread") else None,
|
| 61 |
-
}
|
| 62 |
-
|
| 63 |
-
# Add exception info if present
|
| 64 |
-
if record.get("exception"):
|
| 65 |
-
log_entry["exception"] = {"type" : str(record["exception"].type),
|
| 66 |
-
"value" : str(record["exception"].value),
|
| 67 |
-
"traceback" : "".join(record["exception"].traceback).strip() if record["exception"].traceback else None,
|
| 68 |
-
}
|
| 69 |
-
|
| 70 |
-
# Add extra fields
|
| 71 |
-
if record.get("extra"):
|
| 72 |
-
log_entry.update(record["extra"])
|
| 73 |
-
|
| 74 |
-
return json.dumps(log_entry, ensure_ascii=False, default=str)
|
| 75 |
-
|
| 76 |
|
| 77 |
class CentralizedLogger:
|
| 78 |
"""
|
| 79 |
-
Centralized logging system for
|
| 80 |
|
| 81 |
Features:
|
| 82 |
- Structured JSON logging for production
|
|
@@ -305,7 +266,7 @@ class CentralizedLogger:
|
|
| 305 |
"""
|
| 306 |
performance_data = {"operation" : operation,
|
| 307 |
"duration_seconds" : round(duration, 4),
|
| 308 |
-
"timestamp" : datetime.
|
| 309 |
**kwargs
|
| 310 |
}
|
| 311 |
|
|
@@ -331,7 +292,7 @@ class CentralizedLogger:
|
|
| 331 |
security_data = {"event_type" : event_type,
|
| 332 |
"user" : user,
|
| 333 |
"ip_address" : ip,
|
| 334 |
-
"timestamp" : datetime.
|
| 335 |
**kwargs,
|
| 336 |
}
|
| 337 |
|
|
@@ -365,7 +326,7 @@ class CentralizedLogger:
|
|
| 365 |
"duration_seconds" : round(duration, 4),
|
| 366 |
"user" : user,
|
| 367 |
"ip_address" : ip,
|
| 368 |
-
"timestamp" : datetime.
|
| 369 |
**kwargs
|
| 370 |
}
|
| 371 |
|
|
@@ -386,9 +347,9 @@ class CentralizedLogger:
|
|
| 386 |
)
|
| 387 |
|
| 388 |
|
| 389 |
-
def
|
| 390 |
"""
|
| 391 |
-
Log text
|
| 392 |
|
| 393 |
Arguments:
|
| 394 |
----------
|
|
@@ -396,28 +357,28 @@ class CentralizedLogger:
|
|
| 396 |
|
| 397 |
text_length { int } : Length of analyzed text
|
| 398 |
|
| 399 |
-
|
| 400 |
|
| 401 |
-
|
| 402 |
|
| 403 |
domain { str } : Content domain
|
| 404 |
|
| 405 |
processing_time { float } : Processing time in seconds
|
| 406 |
|
| 407 |
-
**kwargs : Additional
|
| 408 |
"""
|
| 409 |
-
|
| 410 |
"text_length" : text_length,
|
| 411 |
-
"
|
| 412 |
-
"
|
| 413 |
"domain" : domain,
|
| 414 |
"processing_time_seconds" : round(processing_time, 4),
|
| 415 |
-
"timestamp" : datetime.
|
| 416 |
**kwargs
|
| 417 |
}
|
| 418 |
|
| 419 |
-
logger.bind(log_type = "application").info(f"
|
| 420 |
-
extra =
|
| 421 |
)
|
| 422 |
|
| 423 |
|
|
@@ -438,7 +399,7 @@ class CentralizedLogger:
|
|
| 438 |
model_data = {"model_name" : model_name,
|
| 439 |
"success" : success,
|
| 440 |
"load_time_seconds" : round(load_time, 4),
|
| 441 |
-
"timestamp" : datetime.
|
| 442 |
**kwargs
|
| 443 |
}
|
| 444 |
|
|
@@ -470,7 +431,7 @@ class CentralizedLogger:
|
|
| 470 |
error_data = {"error_type" : error_type,
|
| 471 |
"message" : message,
|
| 472 |
"context" : context or {},
|
| 473 |
-
"timestamp" : datetime.
|
| 474 |
}
|
| 475 |
|
| 476 |
if exception:
|
|
@@ -498,7 +459,7 @@ class CentralizedLogger:
|
|
| 498 |
"""
|
| 499 |
startup_data = {"component" : component,
|
| 500 |
"success" : success,
|
| 501 |
-
"timestamp" : datetime.
|
| 502 |
**kwargs
|
| 503 |
}
|
| 504 |
|
|
@@ -518,7 +479,7 @@ class CentralizedLogger:
|
|
| 518 |
Cleanup logging resources
|
| 519 |
"""
|
| 520 |
try:
|
| 521 |
-
logger.
|
| 522 |
logger.info("Logging system cleanup completed")
|
| 523 |
|
| 524 |
except Exception as e:
|
|
@@ -566,11 +527,11 @@ def log_api_request(method: str, path: str, status_code: int, duration: float, u
|
|
| 566 |
central_logger.log_api_request(method, path, status_code, duration, user, ip, **kwargs)
|
| 567 |
|
| 568 |
|
| 569 |
-
def
|
| 570 |
"""
|
| 571 |
-
Log text
|
| 572 |
"""
|
| 573 |
-
central_logger.
|
| 574 |
|
| 575 |
|
| 576 |
def log_model_loading(model_name: str, success: bool, load_time: float, **kwargs) -> None:
|
|
@@ -606,5 +567,5 @@ __all__ = ["log_error",
|
|
| 606 |
"CentralizedLogger",
|
| 607 |
"log_model_loading",
|
| 608 |
"log_security_event",
|
| 609 |
-
"
|
| 610 |
]
|
|
|
|
| 1 |
# DEPENDENCIES
|
|
|
|
| 2 |
import sys
|
|
|
|
|
|
|
| 3 |
import logging
|
| 4 |
from typing import Any
|
| 5 |
from typing import Dict
|
|
|
|
| 28 |
# Find caller from where originated the logged message
|
| 29 |
frame, depth = logging.currentframe(), 2
|
| 30 |
while (frame.f_code.co_filename == logging.__file__):
|
| 31 |
+
frame = frame.f_back
|
| 32 |
+
depth += 1
|
| 33 |
|
| 34 |
+
logger.opt(depth = depth, exception = record.exc_info).log(level, record.getMessage())
|
|
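For context, this is the standard loguru bridge pattern: installing the handler above routes stdlib logging records into loguru sinks. A typical (illustrative) installation looks like:

```python
# Route all stdlib logging through the InterceptHandler defined above.
import logging

logging.basicConfig(handlers=[InterceptHandler()], level=0, force=True)
logging.getLogger("uvicorn").info("this record now flows through loguru")
```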
|
|
|
| 35 |
|
| 36 |
|
|
|
|
|
|
|
| 37 |
|
| 38 |
class CentralizedLogger:
|
| 39 |
"""
|
| 40 |
+
Centralized logging system for Text Authenticator
|
| 41 |
|
| 42 |
Features:
|
| 43 |
- Structured JSON logging for production
|
|
|
|
| 266 |
"""
|
| 267 |
performance_data = {"operation" : operation,
|
| 268 |
"duration_seconds" : round(duration, 4),
|
| 269 |
+
"timestamp" : datetime.utcnow().isoformat(),
|
| 270 |
**kwargs
|
| 271 |
}
|
| 272 |
|
|
|
|
| 292 |
security_data = {"event_type" : event_type,
|
| 293 |
"user" : user,
|
| 294 |
"ip_address" : ip,
|
| 295 |
+
"timestamp" : datetime.utcnow().isoformat(),
|
| 296 |
**kwargs,
|
| 297 |
}
|
| 298 |
|
|
|
|
| 326 |
"duration_seconds" : round(duration, 4),
|
| 327 |
"user" : user,
|
| 328 |
"ip_address" : ip,
|
| 329 |
+
"timestamp" : datetime.utcnow().isoformat(),
|
| 330 |
**kwargs
|
| 331 |
}
|
| 332 |
|
|
|
|
| 347 |
)
|
| 348 |
|
| 349 |
|
| 350 |
+
def log_analysis_event(self, analysis_id: str, text_length: int, assessment: str, signal_strength: float, domain: str, processing_time: float, **kwargs) -> None:
|
| 351 |
"""
|
| 352 |
+
Log text analysis events
|
| 353 |
|
| 354 |
Arguments:
|
| 355 |
----------
|
|
|
|
| 357 |
|
| 358 |
text_length { int } : Length of analyzed text
|
| 359 |
|
| 360 |
+
assessment { str } : Final assessment verdict for the analysis
|
| 361 |
|
| 362 |
+
signal_strength { float } : Aggregated signal-strength score
|
| 363 |
|
| 364 |
domain { str } : Content domain
|
| 365 |
|
| 366 |
processing_time { float } : Processing time in seconds
|
| 367 |
|
| 368 |
+
**kwargs : Additional analysis context
|
| 369 |
"""
|
| 370 |
+
analysis_data = {"analysis_id" : analysis_id,
|
| 371 |
"text_length" : text_length,
|
| 372 |
+
"assessment" : assessment,
|
| 373 |
+
"signal_strength" : round(signal_strength, 4),
|
| 374 |
"domain" : domain,
|
| 375 |
"processing_time_seconds" : round(processing_time, 4),
|
| 376 |
+
"timestamp" : datetime.utcnow().isoformat(),
|
| 377 |
**kwargs
|
| 378 |
}
|
| 379 |
|
| 380 |
+
logger.bind(log_type = "application").info(f"Analysis completed: {analysis_id} -> {assessment}",
|
| 381 |
+
extra = analysis_data,
|
| 382 |
)
|
| 383 |
|
| 384 |
|
|
|
|
| 399 |
model_data = {"model_name" : model_name,
|
| 400 |
"success" : success,
|
| 401 |
"load_time_seconds" : round(load_time, 4),
|
| 402 |
+
"timestamp" : datetime.utcnow().isoformat(),
|
| 403 |
**kwargs
|
| 404 |
}
|
| 405 |
|
|
|
|
| 431 |
error_data = {"error_type" : error_type,
|
| 432 |
"message" : message,
|
| 433 |
"context" : context or {},
|
| 434 |
+
"timestamp" : datetime.utcnow().isoformat(),
|
| 435 |
}
|
| 436 |
|
| 437 |
if exception:
|
|
|
|
| 459 |
"""
|
| 460 |
startup_data = {"component" : component,
|
| 461 |
"success" : success,
|
| 462 |
+
"timestamp" : datetime.utcnow().isoformat(),
|
| 463 |
**kwargs
|
| 464 |
}
|
| 465 |
|
|
|
|
| 479 |
Cleanup logging resources
|
| 480 |
"""
|
| 481 |
try:
|
| 482 |
+
logger.remove()
|
| 483 |
logger.info("Logging system cleanup completed")
|
| 484 |
|
| 485 |
except Exception as e:
|
|
|
|
| 527 |
central_logger.log_api_request(method, path, status_code, duration, user, ip, **kwargs)
|
| 528 |
|
| 529 |
|
| 530 |
+
def log_analysis_event(analysis_id: str, text_length: int, assessment: str, signal_strength: float, domain: str, processing_time: float, **kwargs) -> None:
|
| 531 |
"""
|
| 532 |
+
Log text analysis events
|
| 533 |
"""
|
| 534 |
+
central_logger.log_analysis_event(analysis_id, text_length, assessment, signal_strength, domain, processing_time, **kwargs)
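A hypothetical call site for this wrapper (all values below are illustrative, not taken from the codebase):

```python
# Illustrative usage of the new analysis-event logging hook.
log_analysis_event(
    analysis_id     = "a1b2c3d4",          # hypothetical ID
    text_length     = 1842,
    assessment      = "Likely Synthetic",
    signal_strength = 0.87,
    domain          = "academic",
    processing_time = 3.42,
)
```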
|
| 535 |
|
| 536 |
|
| 537 |
def log_model_loading(model_name: str, success: bool, load_time: float, **kwargs) -> None:
|
|
|
|
| 567 |
"CentralizedLogger",
|
| 568 |
"log_model_loading",
|
| 569 |
"log_security_event",
|
| 570 |
+
"log_analysis_event",
|
| 571 |
]
|