Spaces:

smartTranscend
/

1030

Paused

App Files Files Community

smartTranscend commited on Oct 31

Commit

569e864

verified ·

1 Parent(s): ff78cd4

Update app.py

Browse files

Files changed (1) hide show

app.py +1094 -468

app.py CHANGED Viewed

@@ -1,77 +1,196 @@
 import gradio as gr
 import pandas as pd
 import torch
 from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
 from peft import LoraConfig, AdaLoraConfig, get_peft_model, TaskType
 from datasets import Dataset
 from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
 from torch import nn
 import os
 from datetime import datetime
 import gc
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
-# 設置較小的預設值以節省記憶體
 torch.backends.cudnn.benchmark = False
 if torch.cuda.is_available():
     torch.cuda.empty_cache()
-# 全域變數
 trained_models = {}
 model_counter = 0
-baseline_results = {}
-baseline_model_cache = {}
-def calculate_improvement(baseline_val, finetuned_val):
-    """安全計算改善率"""
-    if baseline_val == 0:
-        if finetuned_val > 0:
-            return float('inf')
-        else:
-            return 0.0
-    return (finetuned_val - baseline_val) / baseline_val * 100
-def format_improve(val):
-    """格式化改善率"""
-    if val == float('inf'):
-        return "N/A (baseline=0)"
-    return f"{val:+.1f}%"
 def compute_metrics(pred):
     try:
         labels = pred.label_ids
         preds = pred.predictions.argmax(-1)
-        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', pos_label=1, zero_division=0)
         acc = accuracy_score(labels, preds)
         cm = confusion_matrix(labels, preds)
         if cm.shape == (2, 2):
             tn, fp, fn, tp = cm.ravel()
-        else:
-            tn = fp = fn = tp = 0
         sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
         specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
         return {
-            'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall,
-            'sensitivity': sensitivity, 'specificity': specificity,
-            'tp': int(tp), 'tn': int(tn), 'fp': int(fp), 'fn': int(fn)
         }
     except Exception as e:
         print(f"Error in compute_metrics: {e}")
-        return {
-            'accuracy': 0, 'f1': 0, 'precision': 0, 'recall': 0,
-            'sensitivity': 0, 'specificity': 0, 'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0
-        }
-def evaluate_baseline(model, tokenizer, test_dataset, device):
     """評估未微調的基準模型"""
     model.eval()
     all_preds = []
     all_labels = []
-    from torch.utils.data import DataLoader
     def collate_fn(batch):
         return {
             'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
@@ -79,7 +198,13 @@ def evaluate_baseline(model, tokenizer, test_dataset, device):
             'labels': torch.tensor([item['label'] for item in batch])
         }
-    dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)
     with torch.no_grad():
         for batch in dataloader:
@@ -90,636 +215,1137 @@ def evaluate_baseline(model, tokenizer, test_dataset, device):
             all_preds.extend(preds.cpu().numpy())
             all_labels.extend(labels.numpy())
-    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', pos_label=1, zero_division=0)
     acc = accuracy_score(all_labels, all_preds)
     cm = confusion_matrix(all_labels, all_preds)
     if cm.shape == (2, 2):
         tn, fp, fn, tp = cm.ravel()
-    else:
-        tn = fp = fn = tp = 0
     sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
     specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
     return {
-        'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall,
-        'sensitivity': sensitivity, 'specificity': specificity,
-        'tp': int(tp), 'tn': int(tn), 'fp': int(fp), 'fn': int(fn)
     }
-class WeightedTrainer(Trainer):
-    def __init__(self, *args, class_weights=None, use_focal_loss=False, **kwargs):
         super().__init__(*args, **kwargs)
         self.class_weights = class_weights
         self.use_focal_loss = use_focal_loss
     def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
         labels = inputs.pop("labels")
         outputs = model(**inputs)
         logits = outputs.logits
-        if self.use_focal_loss:
-            # Focal Loss: 更關注難分類的樣本
             ce_loss = nn.CrossEntropyLoss(weight=self.class_weights, reduction='none')(
                 logits.view(-1, 2), labels.view(-1)
             )
             pt = torch.exp(-ce_loss)
-            focal_loss = ((1 - pt) ** 2 * ce_loss).mean()
             loss = focal_loss
-        else:
             loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
             loss = loss_fct(logits.view(-1, 2), labels.view(-1))
         return (loss, outputs) if return_outputs else loss
-def evaluate_baseline(model, tokenizer, test_dataset, device):
-    """評估未微調的基準模型"""
-    model.eval()
-    all_preds = []
-    all_labels = []
-    from torch.utils.data import DataLoader
-    def collate_fn(batch):
-        return {
-            'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
-            'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
-            'labels': torch.tensor([item['label'] for item in batch])
-        }
-    dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)
-    with torch.no_grad():
-        for batch in dataloader:
-            labels = batch.pop('labels')
-            inputs = {k: v.to(device) for k, v in batch.items()}
-            outputs = model(**inputs)
-            preds = torch.argmax(outputs.logits, dim=-1)
-            all_preds.extend(preds.cpu().numpy())
-            all_labels.extend(labels.numpy())
-    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', pos_label=1, zero_division=0)
-    acc = accuracy_score(all_labels, all_preds)
-    cm = confusion_matrix(all_labels, all_preds)
-    if cm.shape == (2, 2):
-        tn, fp, fn, tp = cm.ravel()
     else:
-        tn = fp = fn = tp = 0
-    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
-    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
-    return {
-        'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall,
-        'sensitivity': sensitivity, 'specificity': specificity,
-        'tp': int(tp), 'tn': int(tn), 'fp': int(fp), 'fn': int(fn)
-    }
-def train_bert_model(csv_file, base_model, method, num_epochs, batch_size, learning_rate,
-                     weight_decay, dropout, lora_r, lora_alpha, lora_dropout,
-                     weight_mult, best_metric):
-    global trained_models, model_counter, baseline_results
     model_mapping = {
         "BERT-base": "bert-base-uncased",
     }
     model_name = model_mapping.get(base_model, "bert-base-uncased")
     try:
         if csv_file is None:
-            return "❌ 請上傳 CSV", "", "", ""
         df = pd.read_csv(csv_file.name)
         if 'Text' not in df.columns or 'label' not in df.columns:
-            return "❌ 需要 Text 和 label 欄位", "", "", ""
         df_clean = pd.DataFrame({
-            'text': df['Text'].astype(str),
             'label': df['label'].astype(int)
         }).dropna()
         n0 = int(sum(df_clean['label'] == 0))
         n1 = int(sum(df_clean['label'] == 1))
-        if n1 == 0:
-            return "❌ 無死亡樣本", "", "", ""
-        ratio = n0 / n1
-        # 動態調整權重計算
-        if ratio > 10:  # 極度不平衡
-            w0, w1 = 1.0, min(ratio * weight_mult, ratio * 0.7)  # 限制最大權重
-        else:
-            w0, w1 = 1.0, ratio * weight_mult
-        info = f"📊 資料: {len(df_clean)} 筆\n存活: {n0} | 死亡: {n1}\n比例: {ratio:.2f}:1\n"
-        info += f"⚖️ 權重: {w0:.2f} / {w1:.2f}\n模型: {base_model}\n方法: {method.upper()}"
         tokenizer = BertTokenizer.from_pretrained(model_name)
         dataset = Dataset.from_pandas(df_clean[['text', 'label']])
-        def preprocess(ex):
-            return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=128)
         tokenized = dataset.map(preprocess, batched=True, remove_columns=['text'])
-        split = tokenized.train_test_split(test_size=0.2, seed=42)
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        info += f"\n裝置: {'GPU ✅' if torch.cuda.is_available() else 'CPU ⚠️'}"
-        # 🔇 靜默評估基準模型（不顯示在資料資訊中）
         baseline_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
         baseline_model = baseline_model.to(device)
-        baseline_perf = evaluate_baseline(baseline_model, tokenizer, split['test'], device)
-        baseline_key = f"{base_model}_baseline"
-        baseline_results[baseline_key] = baseline_perf
-        # 清理基準模型以釋放記憶體
         del baseline_model
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         gc.collect()
-        # 開始微調
-        info += f"\n\n🔧 套用 {method.upper()} 微調..."
         model = BertForSequenceClassification.from_pretrained(
-            model_name, num_labels=2,
             hidden_dropout_prob=dropout,
             attention_probs_dropout_prob=dropout
         )
         peft_applied = False
         if method == "lora":
             config = LoraConfig(
-                task_type=TaskType.SEQ_CLS,
-                r=int(lora_r),
                 lora_alpha=int(lora_alpha),
-                lora_dropout=lora_dropout,
-                target_modules=["query", "value"],
                 bias="none"
             )
             model = get_peft_model(model, config)
             peft_applied = True
-            info += f"\n✅ LoRA 已套用（r={int(lora_r)}, alpha={int(lora_alpha)}）"
         elif method == "adalora":
             config = AdaLoraConfig(
-                task_type=TaskType.SEQ_CLS,
-                r=int(lora_r),
                 lora_alpha=int(lora_alpha),
-                lora_dropout=lora_dropout,
                 target_modules=["query", "value"],
-                init_r=12, tinit=200, tfinal=1000, deltaT=10
             )
             model = get_peft_model(model, config)
             peft_applied = True
-            info += f"\n✅ AdaLoRA 已套用（r={int(lora_r)}, alpha={int(lora_alpha)}）"
-        if not peft_applied:
-            info += f"\n⚠️ 警告：{method} 方法未被識別，使用 Full Fine-tuning"
         model = model.to(device)
-        total = sum(p.numel() for p in model.parameters())
-        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
-        info += f"\n\n💾 參數量\n總參數: {total:,}\n可訓練: {trainable:,}\n比例: {trainable/total*100:.2f}%"
         weights = torch.tensor([w0, w1], dtype=torch.float).to(device)
-        args = TrainingArguments(
-            output_dir='./results',
             num_train_epochs=int(num_epochs),
-            per_device_train_batch_size=int(batch_size),
-            per_device_eval_batch_size=int(batch_size)*2,
-            learning_rate=float(learning_rate),
             weight_decay=float(weight_decay),
-            evaluation_strategy="epoch",
-            save_strategy="no",  # 🔧 改為不保存，避免 PEFT 載入問題
-            load_best_model_at_end=False,  # 🔧 關閉，直接用最後一個 epoch
-            report_to="none",
-            logging_steps=10,
-            warmup_steps=50,
-            logging_first_step=True
         )
-        trainer = WeightedTrainer(
-            model=model,
-            args=args,
             train_dataset=split['train'],
-            eval_dataset=split['test'],
             compute_metrics=compute_metrics,
             class_weights=weights,
-            use_focal_loss=(ratio > 10)  # 極度不平衡時使用 Focal Loss
         )
-        if ratio > 10:
-            info += "\n\n⚡ 使用 Focal Loss 處理極度不平衡資料"
-        info += "\n\n⏳ 開始訓練..."
-        # 訓練前檢查
-        info += f"\n📊 訓練前檢查："
-        info += f"\n  - 訓練樣本: {len(split['train'])}"
-        info += f"\n  - 測試樣本: {len(split['test'])}"
-        info += f"\n  - 批次數/epoch: {len(split['train']) // int(batch_size)}"
         train_result = trainer.train()
-        # 訓練後資訊
-        info += f"\n\n✅ 訓練完成！"
-        info += f"\n📉 最終 Training Loss: {train_result.training_loss:.4f}"
-        results = trainer.evaluate()
-        # 生成帶時間戳的模型 ID
         model_counter += 1
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        model_id = f"{base_model}_{method}_{timestamp}"
         trained_models[model_id] = {
-            'model': model,
-            'tokenizer': tokenizer,
-            'results': results,
-            'baseline': baseline_perf,  # 保存基準結果供後續使用
             'config': {
-                'type': base_model,
-                'model_name': model_name,
-                'method': method,
-                'metric': best_metric
             },
-            'timestamp': timestamp
         }
-        # 計算改善
-        f1_improve = calculate_improvement(baseline_perf['f1'], results['eval_f1'])
-        acc_improve = calculate_improvement(baseline_perf['accuracy'], results['eval_accuracy'])
-        prec_improve = calculate_improvement(baseline_perf['precision'], results['eval_precision'])
-        rec_improve = calculate_improvement(baseline_perf['recall'], results['eval_recall'])
-        sens_improve = calculate_improvement(baseline_perf['sensitivity'], results['eval_sensitivity'])
-        spec_improve = calculate_improvement(baseline_perf['specificity'], results['eval_specificity'])
-        # 純 BERT 輸出
-        baseline_output = f"🔬 純 BERT（未微調）\n\n"
-        baseline_output += f"📊 表現\n"
-        baseline_output += f"F1: {baseline_perf['f1']:.4f}\n"
-        baseline_output += f"Accuracy: {baseline_perf['accuracy']:.4f}\n"
-        baseline_output += f"Precision: {baseline_perf['precision']:.4f}\n"
-        baseline_output += f"Recall: {baseline_perf['recall']:.4f}\n"
-        baseline_output += f"Sensitivity: {baseline_perf['sensitivity']:.4f}\n"
-        baseline_output += f"Specificity: {baseline_perf['specificity']:.4f}\n\n"
-        baseline_output += f"混淆矩陣\n"
-        baseline_output += f"TP: {baseline_perf['tp']} | TN: {baseline_perf['tn']}\n"
-        baseline_output += f"FP: {baseline_perf['fp']} | FN: {baseline_perf['fn']}"
-        # 微調 BERT 輸出
-        finetuned_output = f"✅ 微調 BERT\n"
-        finetuned_output += f"模型: {model_id}\n\n"
-        finetuned_output += f"📊 表現\n"
-        finetuned_output += f"F1: {results['eval_f1']:.4f}\n"
-        finetuned_output += f"Accuracy: {results['eval_accuracy']:.4f}\n"
-        finetuned_output += f"Precision: {results['eval_precision']:.4f}\n"
-        finetuned_output += f"Recall: {results['eval_recall']:.4f}\n"
-        finetuned_output += f"Sensitivity: {results['eval_sensitivity']:.4f}\n"
-        finetuned_output += f"Specificity: {results['eval_specificity']:.4f}\n\n"
-        finetuned_output += f"混淆矩陣\n"
-        finetuned_output += f"TP: {results['eval_tp']} | TN: {results['eval_tn']}\n"
-        finetuned_output += f"FP: {results['eval_fp']} | FN: {results['eval_fn']}"
-        # 比較結果輸出
-        comparison_output = f"📊 純 BERT vs 微調 BERT 比較\n\n"
-        comparison_output += f"指標改善：\n"
-        comparison_output += f"F1: {baseline_perf['f1']:.4f} → {results['eval_f1']:.4f} ({format_improve(f1_improve)})\n"
-        comparison_output += f"Accuracy: {baseline_perf['accuracy']:.4f} → {results['eval_accuracy']:.4f} ({format_improve(acc_improve)})\n"
-        comparison_output += f"Precision: {baseline_perf['precision']:.4f} → {results['eval_precision']:.4f} ({format_improve(prec_improve)})\n"
-        comparison_output += f"Recall: {baseline_perf['recall']:.4f} → {results['eval_recall']:.4f} ({format_improve(rec_improve)})\n"
-        comparison_output += f"Sensitivity: {baseline_perf['sensitivity']:.4f} → {results['eval_sensitivity']:.4f} ({format_improve(sens_improve)})\n"
-        comparison_output += f"Specificity: {baseline_perf['specificity']:.4f} → {results['eval_specificity']:.4f} ({format_improve(spec_improve)})\n\n"
-        comparison_output += f"混淆矩陣變化：\n"
-        comparison_output += f"TP: {baseline_perf['tp']} → {results['eval_tp']} ({results['eval_tp'] - baseline_perf['tp']:+d})\n"
-        comparison_output += f"TN: {baseline_perf['tn']} → {results['eval_tn']} ({results['eval_tn'] - baseline_perf['tn']:+d})\n"
-        comparison_output += f"FP: {baseline_perf['fp']} → {results['eval_fp']} ({results['eval_fp'] - baseline_perf['fp']:+d})\n"
-        comparison_output += f"FN: {baseline_perf['fn']} → {results['eval_fn']} ({results['eval_fn'] - baseline_perf['fn']:+d})"
-        info += "\n\n✅ 訓練完成！"
-        return info, baseline_output, finetuned_output, comparison_output
     except Exception as e:
         import traceback
-        error_msg = f"❌ 錯誤: {str(e)}\n\n{traceback.format_exc()}"
-        return error_msg, "", "", ""
 def predict(model_id, text):
-    global baseline_model_cache
     if not model_id or model_id not in trained_models:
-        return "❌ 請選擇模型"
-    if not text:
-        return "❌ 請輸入文字"
     try:
         info = trained_models[model_id]
-        model, tokenizer = info['model'], info['tokenizer']
         config = info['config']
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
-        inputs_cuda = {k: v.to(device) for k, v in inputs.items()}
-        # 預測：微調模型
         model.eval()
         with torch.no_grad():
-            outputs = model(**inputs_cuda)
-            probs_finetuned = torch.nn.functional.softmax(outputs.logits, dim=-1)
             pred_finetuned = torch.argmax(probs_finetuned, dim=-1).item()
-        result_finetuned = "存活" if pred_finetuned == 0 else "死亡"
-        # 預測：基準模型（使用快取）
-        cache_key = config['model_name']
-        if cache_key not in baseline_model_cache:
-            baseline_model = BertForSequenceClassification.from_pretrained(config['model_name'], num_labels=2)
-            baseline_model = baseline_model.to(device)
-            baseline_model.eval()
-            baseline_model_cache[cache_key] = baseline_model
-        else:
-            baseline_model = baseline_model_cache[cache_key]
         with torch.no_grad():
-            outputs_baseline = baseline_model(**inputs_cuda)
-            probs_baseline = torch.nn.functional.softmax(outputs_baseline.logits, dim=-1)
             pred_baseline = torch.argmax(probs_baseline, dim=-1).item()
-        result_baseline = "存活" if pred_baseline == 0 else "死亡"
-        # 判斷是否一致
         agreement = "✅ 一致" if pred_finetuned == pred_baseline else "⚠️ 不一致"
-        output = f"""🔮 預測結果比較
-📝 輸入文字: {text[:100]}{'...' if len(text) > 100 else ''}
-{'='*50}
-🧬 微調模型 ({model_id})
-預測: {result_finetuned}
-信心: {probs_finetuned[0][pred_finetuned].item():.2%}
 機率分布:
-  • 存活: {probs_finetuned[0][0].item():.2%}
-  • 死亡: {probs_finetuned[0][1].item():.2%}
-{'='*50}
-🔬 基準模型（未微調 {config['type']}）
-預測: {result_baseline}
-信心: {probs_baseline[0][pred_baseline].item():.2%}
 機率分布:
-  • 存活: {probs_baseline[0][0].item():.2%}
-  • 死亡: {probs_baseline[0][1].item():.2%}
-{'='*50}
-📊 結論
 兩模型預測: {agreement}
 """
         if pred_finetuned != pred_baseline:
-            output += f"\n💡 分析: 微調模型預測為【{result_finetuned}】，而基準模型預測為【{result_baseline}】"
-            output += f"\n   這顯示了 fine-tuning 對此案例的影響！"
-        f1_improve = calculate_improvement(info['baseline']['f1'], info['results']['eval_f1'])
         output += f"""
-📈 模型表現
 微調模型 F1: {info['results']['eval_f1']:.4f}
 基準模型 F1: {info['baseline']['f1']:.4f}
-改善幅度: {format_improve(f1_improve)}
 """
         return output
     except Exception as e:
         import traceback
-        return f"❌ 錯誤: {str(e)}\n\n{traceback.format_exc()}"
-def compare():
     if not trained_models:
-        return "❌ 尚未訓練模型"
-    text = "# 📊 模型比較\n\n"
-    text += "## 微調模型表現\n\n"
-    text += "| 模型 | 基礎 | 方法 | F1 | Acc | Prec | Recall | Sens | Spec |\n"
-    text += "|------|------|------|-----|-----|------|--------|------|------|\n"
-    for mid, info in trained_models.items():
         r = info['results']
         c = info['config']
-        text += f"| {mid} | {c['type']} | {c['method'].upper()} | {r['eval_f1']:.4f} | {r['eval_accuracy']:.4f} | "
-        text += f"{r['eval_precision']:.4f} | {r['eval_recall']:.4f} | "
-        text += f"{r['eval_sensitivity']:.4f} | {r['eval_specificity']:.4f} |\n"
-    text += "\n## 基準模型表現（未微調）\n\n"
-    text += "| 模型 | F1 | Acc | Prec | Recall | Sens | Spec |\n"
-    text += "|------|-----|-----|------|--------|------|------|\n"
-    for mid, info in trained_models.items():
-        b = info['baseline']
-        c = info['config']
-        text += f"| {c['type']}-baseline | {b['f1']:.4f} | {b['accuracy']:.4f} | "
-        text += f"{b['precision']:.4f} | {b['recall']:.4f} | "
-        text += f"{b['sensitivity']:.4f} | {b['specificity']:.4f} |\n"
-    text += "\n## 🏆 最佳模型\n\n"
-    for metric in ['f1', 'accuracy', 'precision', 'recall', 'sensitivity', 'specificity']:
-        best = max(trained_models.items(), key=lambda x: x[1]['results'][f'eval_{metric}'])
-        baseline_val = best[1]['baseline'][metric]
-        finetuned_val = best[1]['results'][f'eval_{metric}']
-        improvement = calculate_improvement(baseline_val, finetuned_val)
-        text += f"**{metric.upper()}**: {best[0]} ({finetuned_val:.4f}, 改善 {format_improve(improvement)})\n\n"
-    return text
-def refresh_model_list():
-    return gr.Dropdown(choices=list(trained_models.keys()))
-# Gradio UI
-with gr.Blocks(title="BERT Fine-tuning 教學平台", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🧬 BERT Fine-tuning 教學平台")
-    gr.Markdown("### 比較基準模型 vs 微調模型的表現差異")
-    with gr.Tab("訓練"):
-        gr.Markdown("## 步驟 1: 選擇基礎模型")
-        base_model = gr.Dropdown(
-            choices=["BERT-base"],
-            value="BERT-base",
-            label="基礎模型",
-            info="更多模型即將推出"
-        )
-        gr.Markdown("## 步驟 2: 選擇微調方法")
-        method = gr.Radio(
-            choices=["lora", "adalora"],
-            value="lora",
-            label="微調方法",
-            info="兩種都是參數高效方法，推薦從 LoRA 開始"
         )
-        gr.Markdown("## 步驟 3: 上傳資料")
-        csv_file = gr.File(label="CSV 檔案 (需包含 Text 和 label 欄位)", file_types=[".csv"])
-        gr.Markdown("## 步驟 4: 設定訓練參數")
-        gr.Markdown("### 🎯 基本訓練參數")
-        with gr.Row():
-            num_epochs = gr.Number(value=5, label="訓練輪數 (epochs)", minimum=1, maximum=100, precision=0,
-                                  info="建議 5-8 輪")
-            batch_size = gr.Number(value=4, label="批次大小 (batch_size)", minimum=1, maximum=128, precision=0,
-                                  info="記憶體不足時降到 4")
-            learning_rate = gr.Number(value=5e-5, label="學習率 (learning_rate)", minimum=0, maximum=1,
-                                     info="5e-5 是平衡選擇")
-        gr.Markdown("### ⚙️ 進階參數")
-        with gr.Row():
-            weight_decay = gr.Number(value=0.01, label="權重衰減 (weight_decay)", minimum=0, maximum=1)
-            dropout = gr.Number(value=0.1, label="Dropout 機率", minimum=0, maximum=1)
-        gr.Markdown("### 🔧 LoRA 參數")
-        with gr.Row():
-            lora_r = gr.Number(value=32, label="LoRA Rank (r)", minimum=1, maximum=256, precision=0,
-                             info="提高到 32，增加表達能力")
-            lora_alpha = gr.Number(value=64, label="LoRA Alpha", minimum=1, maximum=512, precision=0,
-                                 info="Alpha = Rank × 2")
-            lora_dropout = gr.Number(value=0.05, label="LoRA Dropout", minimum=0, maximum=1,
-                                   info="降低 dropout，避免欠擬合")
-        gr.Markdown("### ⚖️ 評估設定")
-        with gr.Row():
-            weight_mult = gr.Number(value=1.0, label="類別權重倍數", minimum=0, maximum=5,
-                                   info="⚠️ 資料極度不平衡時建議 0.5-1.5，不要超過 2.0")
-            best_metric = gr.Dropdown(
-                choices=["f1", "accuracy", "precision", "recall", "sensitivity", "specificity"],
-                value="f1",
-                label="最佳模型選擇指標",
-                info="訓練時用此指標選擇最佳模型"
-            )
-        train_btn = gr.Button("🚀 開始訓練", variant="primary", size="lg")
-        gr.Markdown("## 📊 訓練結果")
-        data_info = gr.Textbox(label="📋 資料資訊", lines=10)
-        with gr.Row():
-            baseline_result = gr.Textbox(label="🔬 純 BERT（未微調）", lines=14)
-            finetuned_result = gr.Textbox(label="✅ 微調 BERT", lines=14)
-        comparison_result = gr.Textbox(label="📊 純 BERT vs 微調 BERT 比較", lines=14)
-        train_btn.click(
-            train_bert_model,
-            inputs=[csv_file, base_model, method, num_epochs, batch_size, learning_rate,
-                   weight_decay, dropout, lora_r, lora_alpha, lora_dropout,
-                   weight_mult, best_metric],
-            outputs=[data_info, baseline_result, finetuned_result, comparison_result]
-        )
-    with gr.Tab("預測"):
-        gr.Markdown("## 使用訓練好的模型預測")
-        with gr.Row():
-            model_drop = gr.Dropdown(label="選擇模型", choices=list(trained_models.keys()))
-            refresh = gr.Button("🔄 刷新")
-        text_input = gr.Textbox(label="輸入病例描述", lines=4,
-                               placeholder="Patient diagnosed with...")
-        predict_btn = gr.Button("預測", variant="primary", size="lg")
-        pred_output = gr.Textbox(label="預測結果（含基準模型對比）", lines=20)
-        refresh.click(refresh_model_list, outputs=[model_drop])
-        predict_btn.click(predict, inputs=[model_drop, text_input], outputs=[pred_output])
-        gr.Examples(
-            examples=[
-                ["Patient with stage II breast cancer, good response to treatment."],
-                ["Advanced metastatic cancer, multiple organ involvement."]
-            ],
-            inputs=text_input
-        )
-    with gr.Tab("比較"):
-        gr.Markdown("## 比較所有模型（含基準模型）")
-        compare_btn = gr.Button("比較", variant="primary", size="lg")
-        compare_output = gr.Markdown()
-        compare_btn.click(compare, outputs=[compare_output])
-    with gr.Tab("說明"):
-        gr.Markdown("""
-        ## 📖 使用說明
-        ### 🎯 平台特色
-        本平台會自動比較：
-        - **基準模型**：未經微調的原始 BERT
-        - **微調模型**：使用你的資料訓練後的 BERT
-        這樣可以清楚看到 fine-tuning 帶來的改善！
-        ### 基礎模型
-        - **BERT-base**: 標準 BERT，110M 參數 ⭐目前支援
-        ### 微調方法
-        - **LoRA**: 低秩適應，參數高效的微調方法 ⭐強烈推薦
-          - 只訓練少量參數（通常 <1%）
-          - 訓練速度快，效果好
-          - 適合大多數情況
-        - **AdaLoRA**: 自適應 LoRA，動態調整秩
-          - 自動找出最重要的參數
-          - 可能比 LoRA 效果稍好
-          - 訓練時間稍長
-        ### 評估指標
-        - **F1**: 平衡指標，推薦用於不平衡資料 ⭐
-        - **Accuracy**: 整體準確率
-        - **Precision**: 減少假陽性
-        - **Recall/Sensitivity**: 減少假陰性
-        - **Specificity**: 真陰性率
-        ### 參數建議
-        針對不平衡資料（如醫療資料）：
-        - **微調方法**: LoRA（快速有效）或 AdaLoRA（追求極致）
-        - **LoRA Rank**: 8-16（平衡效果與速度）
-        - **類別權重倍數**:
-          - ⚠️ **極度不平衡 (>10:1)**: 0.5-1.0（你的情況！）
-          - 中度不平衡 (3-10:1): 1.0-1.5
-          - 輕度不平衡 (<3:1): 1.5-2.5
-        - **Learning rate**: 3e-5 到 5e-5（較高的學習率配合 LoRA）
-        - **Epochs**: 5-10（極度不平衡需要更多輪）
-        - **Batch size**: 8-16（依 GPU 記憶體調整）
-        ### 資料格式
-        CSV 必須包含:
-        - `Text`: 病例描述
-        - `label`: 0=存活, 1=死亡
-        ### 🚀 快速開始
-        1. 上傳包含 `Text` 和 `label` 欄位的 CSV
-        2. 使用預設參數（適合大多數情況）
-        3. 點擊「開始訓練」
-        4. 在「預測」分頁測試模型
-        5. 在「比較」分頁查看所有模型表現
-        """)
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
-        max_threads=4  # 限制執行緒數
     )

 import gradio as gr
 import pandas as pd
+import numpy as np
 import torch
 from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
 from peft import LoraConfig, AdaLoraConfig, get_peft_model, TaskType
 from datasets import Dataset
 from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
 from torch import nn
+from torch.utils.data import DataLoader, WeightedRandomSampler
 import os
 from datetime import datetime
 import gc
+import json
+from functools import lru_cache
+from typing import Dict, List, Tuple, Optional
+import warnings
+warnings.filterwarnings('ignore')
+# 環境設置
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
+# 優化 CUDA 設置
 torch.backends.cudnn.benchmark = False
 if torch.cuda.is_available():
     torch.cuda.empty_cache()
+# ==================== 全域變數 ====================
 trained_models = {}
 model_counter = 0
+training_histories = {}  # 新增：儲存訓練歷史
+# ==================== 訓練監控類 ====================
+class TrainingMonitor:
+    """訓練過程監控器"""
+    def __init__(self):
+        self.history = {
+            'epoch': [],
+            'train_loss': [],
+            'eval_loss': [],
+            'eval_accuracy': [],
+            'eval_f1': [],
+            'eval_precision': [],
+            'eval_recall': [],
+            'learning_rate': [],
+            'best_epoch': None,
+            'best_metric_value': None
+        }
+    def log_epoch(self, epoch: int, train_loss: float, eval_metrics: Dict, lr: float):
+        """記錄每個 epoch 的結果"""
+        self.history['epoch'].append(epoch)
+        self.history['train_loss'].append(train_loss)
+        self.history['eval_loss'].append(eval_metrics.get('eval_loss', 0))
+        self.history['eval_accuracy'].append(eval_metrics.get('eval_accuracy', 0))
+        self.history['eval_f1'].append(eval_metrics.get('eval_f1', 0))
+        self.history['eval_precision'].append(eval_metrics.get('eval_precision', 0))
+        self.history['eval_recall'].append(eval_metrics.get('eval_recall', 0))
+        self.history['learning_rate'].append(lr)
+    def update_best(self, epoch: int, metric_value: float):
+        """更新最佳結果"""
+        self.history['best_epoch'] = epoch
+        self.history['best_metric_value'] = metric_value
+    def get_summary(self) -> str:
+        """獲取訓練摘要"""
+        if not self.history['epoch']:
+            return "尚無訓練記錄"
+        summary = "📈 訓練歷程摘要\n"
+        summary += f"總訓練輪數: {len(self.history['epoch'])}\n"
+        summary += f"最佳 Epoch: {self.history['best_epoch']}\n"
+        summary += f"最佳指標值: {self.history['best_metric_value']:.4f}\n\n"
+        summary += "各 Epoch 表現:\n"
+        for i, epoch in enumerate(self.history['epoch']):
+            summary += f"Epoch {epoch}: Loss={self.history['train_loss'][i]:.4f}, "
+            summary += f"F1={self.history['eval_f1'][i]:.4f}, "
+            summary += f"Acc={self.history['eval_accuracy'][i]:.4f}\n"
+        return summary
+# ==================== 權重計算改進 ====================
+def calculate_class_weights(n0: int, n1: int, weight_mult: float = 1.0,
+                           method: str = 'sqrt') -> Tuple[float, float]:
+    """
+    改進的類別權重計算
+    Args:
+        n0: 負類樣本數（存活）
+        n1: 正類樣本數（死亡）
+        weight_mult: 權重倍數調整
+        method: 計算方法 ('balanced', 'sqrt', 'log', 'custom')
+    Returns:
+        (w0, w1): 類別權重
+    """
+    if n1 == 0:
+        return 1.0, 1.0
+    ratio = n0 / n1
+    total = n0 + n1
+    if method == 'balanced':
+        # sklearn 風格的平衡權重
+        w0 = total / (2 * n0) if n0 > 0 else 1.0
+        w1 = total / (2 * n1) if n1 > 0 else 1.0
+        w1 *= weight_mult
+    elif method == 'sqrt':
+        # 使用平方根緩和極端權重（推薦用於極度不平衡）
+        w0 = 1.0
+        w1 = min(np.sqrt(ratio) * weight_mult, 10.0)  # 設置上限為 10
+    elif method == 'log':
+        # 使用對數進一步緩和
+        w0 = 1.0
+        w1 = min(np.log1p(ratio) * weight_mult, 8.0)  # 設置上限為 8
+    elif method == 'custom':
+        # 自定義邏輯，根據不平衡程度調整
+        if ratio > 20:  # 極���不平衡
+            w0 = 1.0
+            w1 = min(5.0 * weight_mult, 10.0)
+        elif ratio > 10:  # 高度不平衡
+            w0 = 1.0
+            w1 = min(ratio * 0.3 * weight_mult, 8.0)
+        elif ratio > 5:  # 中度不平衡
+            w0 = 1.0
+            w1 = min(ratio * 0.5 * weight_mult, 6.0)
+        else:  # 輕度不平衡
+            w0 = 1.0
+            w1 = ratio * weight_mult
+    else:
+        # 預設使用 sqrt 方法
+        w0 = 1.0
+        w1 = min(np.sqrt(ratio) * weight_mult, 10.0)
+    return w0, w1
+# ==================== 評估指標計算 ====================
 def compute_metrics(pred):
+    """計算完整的評估指標"""
     try:
         labels = pred.label_ids
         preds = pred.predictions.argmax(-1)
+        # 基本指標
+        precision, recall, f1, _ = precision_recall_fscore_support(
+            labels, preds, average='binary', pos_label=1, zero_division=0
+        )
         acc = accuracy_score(labels, preds)
+        # 混淆矩陣
         cm = confusion_matrix(labels, preds)
+        tn = fp = fn = tp = 0
         if cm.shape == (2, 2):
             tn, fp, fn, tp = cm.ravel()
+        # 敏感度和特異度
         sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
         specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
+        # 額外指標
+        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # 陽性預測值
+        npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # 陰性預測值
         return {
+            'accuracy': acc,
+            'f1': f1,
+            'precision': precision,
+            'recall': recall,
+            'sensitivity': sensitivity,
+            'specificity': specificity,
+            'ppv': ppv,
+            'npv': npv,
+            'tp': int(tp),
+            'tn': int(tn),
+            'fp': int(fp),
+            'fn': int(fn)
         }
     except Exception as e:
         print(f"Error in compute_metrics: {e}")
+        return {k: 0 for k in ['accuracy', 'f1', 'precision', 'recall',
+                               'sensitivity', 'specificity', 'ppv', 'npv',
+                               'tp', 'tn', 'fp', 'fn']}
+# ==================== 基準模型評估（修正版，只保留一個） ====================
+def evaluate_baseline(model, tokenizer, test_dataset, device, batch_size=16):
     """評估未微調的基準模型"""
     model.eval()
     all_preds = []
     all_labels = []
     def collate_fn(batch):
         return {
             'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
             'labels': torch.tensor([item['label'] for item in batch])
         }
+    dataloader = DataLoader(
+        test_dataset,
+        batch_size=batch_size,
+        collate_fn=collate_fn,
+        pin_memory=torch.cuda.is_available(),
+        num_workers=0  # 避免多進程問題
+    )
     with torch.no_grad():
         for batch in dataloader:
             all_preds.extend(preds.cpu().numpy())
             all_labels.extend(labels.numpy())
+    # 計算所有指標
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        all_labels, all_preds, average='binary', pos_label=1, zero_division=0
+    )
     acc = accuracy_score(all_labels, all_preds)
     cm = confusion_matrix(all_labels, all_preds)
+    tn = fp = fn = tp = 0
     if cm.shape == (2, 2):
         tn, fp, fn, tp = cm.ravel()
     sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
     specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
+    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
+    npv = tn / (tn + fn) if (tn + fn) > 0 else 0
     return {
+        'accuracy': acc,
+        'f1': f1,
+        'precision': precision,
+        'recall': recall,
+        'sensitivity': sensitivity,
+        'specificity': specificity,
+        'ppv': ppv,
+        'npv': npv,
+        'tp': int(tp),
+        'tn': int(tn),
+        'fp': int(fp),
+        'fn': int(fn)
     }
+# ==================== 自定義 Trainer 與 Early Stopping ====================
+class CustomTrainer(Trainer):
+    """支援類別權重、Focal Loss 和 Early Stopping 的 Trainer"""
+    def __init__(self, *args, class_weights=None, use_focal_loss=False,
+                 focal_gamma=2.0, monitor=None, early_stopping_patience=3,
+                 early_stopping_metric='eval_f1', **kwargs):
         super().__init__(*args, **kwargs)
         self.class_weights = class_weights
         self.use_focal_loss = use_focal_loss
+        self.focal_gamma = focal_gamma
+        self.monitor = monitor
+        self.early_stopping_patience = early_stopping_patience
+        self.early_stopping_metric = early_stopping_metric
+        self.best_metric = -float('inf')
+        self.best_model_state = None
+        self.patience_counter = 0
+        self.current_epoch = 0
     def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
+        """計算損失函數"""
         labels = inputs.pop("labels")
         outputs = model(**inputs)
         logits = outputs.logits
+        if self.use_focal_loss and self.class_weights is not None:
+            # Focal Loss 實現
             ce_loss = nn.CrossEntropyLoss(weight=self.class_weights, reduction='none')(
                 logits.view(-1, 2), labels.view(-1)
             )
             pt = torch.exp(-ce_loss)
+            focal_loss = ((1 - pt) ** self.focal_gamma * ce_loss).mean()
             loss = focal_loss
+        elif self.class_weights is not None:
+            # 標準加權交叉熵
             loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
             loss = loss_fct(logits.view(-1, 2), labels.view(-1))
+        else:
+            # 標準交叉熵
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, 2), labels.view(-1))
         return (loss, outputs) if return_outputs else loss
+    def on_epoch_end(self, args, state, control, **kwargs):
+        """每個 epoch 結束時的回調"""
+        self.current_epoch += 1
+        # 評估模型
+        metrics = self.evaluate()
+        # 記錄到監控器
+        if self.monitor:
+            self.monitor.log_epoch(
+                epoch=self.current_epoch,
+                train_loss=state.log_history[-1].get('loss', 0) if state.log_history else 0,
+                eval_metrics=metrics,
+                lr=self.get_learning_rate()
+            )
+        # Early Stopping 檢查
+        current_metric = metrics.get(self.early_stopping_metric, 0)
+        if current_metric > self.best_metric:
+            self.best_metric = current_metric
+            self.best_model_state = {k: v.cpu().clone() for k, v in self.model.state_dict().items()}
+            self.patience_counter = 0
+            if self.monitor:
+                self.monitor.update_best(self.current_epoch, current_metric)
+            print(f"✅ Epoch {self.current_epoch}: 新最佳 {self.early_stopping_metric} = {current_metric:.4f}")
+        else:
+            self.patience_counter += 1
+            print(f"⏳ Epoch {self.current_epoch}: 無改善 (patience: {self.patience_counter}/{self.early_stopping_patience})")
+            if self.patience_counter >= self.early_stopping_patience:
+                print(f"🛑 Early Stopping 於 Epoch {self.current_epoch}")
+                control.should_training_stop = True
+        return control
+    def get_learning_rate(self):
+        """獲取當前學習率"""
+        if self.optimizer is None:
+            return 0
+        return self.optimizer.param_groups[0]['lr']
+    def load_best_model(self):
+        """載入最佳模型"""
+        if self.best_model_state:
+            self.model.load_state_dict(self.best_model_state)
+            print(f"✅ 已載入最佳模型 (最佳 {self.early_stopping_metric} = {self.best_metric:.4f})")
+# ==================== 基準模型快取（改進版） ====================
+@lru_cache(maxsize=3)
+def get_cached_baseline_model(model_name: str, num_labels: int = 2):
+    """使用 LRU 快取管理基準模型"""
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+    return model.to(device)
+# ==================== 改善率計算 ====================
+def calculate_improvement(baseline_val: float, finetuned_val: float) -> float:
+    """安全計算改善率"""
+    if baseline_val == 0:
+        return float('inf') if finetuned_val > 0 else 0.0
+    return (finetuned_val - baseline_val) / baseline_val * 100
+def format_improvement(val: float) -> str:
+    """格式化改善率顯示"""
+    if val == float('inf'):
+        return "N/A (baseline=0)"
+    elif val > 0:
+        return f"↑ {val:.1f}%"
+    elif val < 0:
+        return f"↓ {abs(val):.1f}%"
     else:
+        return "→ 0.0%"
+# ==================== 主要訓練函數（改進版） ====================
+def train_bert_model(csv_file, base_model, method, num_epochs, batch_size, learning_rate,
+                     weight_decay, dropout, lora_r, lora_alpha, lora_dropout,
+                     weight_mult, weight_method, best_metric, use_early_stopping, patience):
+    """
+    改進的 BERT 模型訓練函數
+    """
+    global trained_models, model_counter, training_histories
     model_mapping = {
         "BERT-base": "bert-base-uncased",
+        "BERT-base-chinese": "bert-base-chinese",
+        "BioBERT": "dmis-lab/biobert-base-cased-v1.2",
+        "SciBERT": "allenai/scibert_scivocab_uncased"
     }
     model_name = model_mapping.get(base_model, "bert-base-uncased")
     try:
+        # ========== 資料驗證與載入 ==========
         if csv_file is None:
+            return "❌ 請上傳 CSV 檔案", "", "", "", ""
         df = pd.read_csv(csv_file.name)
         if 'Text' not in df.columns or 'label' not in df.columns:
+            return "❌ CSV 必須包含 'Text' 和 'label' 欄位", "", "", "", ""
+        # 資料清理
         df_clean = pd.DataFrame({
+            'text': df['Text'].astype(str),
             'label': df['label'].astype(int)
         }).dropna()
+        # 統計資料
         n0 = int(sum(df_clean['label'] == 0))
         n1 = int(sum(df_clean['label'] == 1))
+        if n1 == 0:
+            return "❌ 資料集中沒有正類樣本（死亡）", "", "", "", ""
+        ratio = n0 / n1 if n1 > 0 else 0
+        # ========== 計算類別權重 ==========
+        w0, w1 = calculate_class_weights(n0, n1, weight_mult, method=weight_method)
+        # ========== 準備資料資訊 ==========
+        info = f"📊 資料集統計\n"
+        info += f"{'='*50}\n"
+        info += f"總樣本數: {len(df_clean):,}\n"
+        info += f"存活 (0): {n0:,} ({n0/len(df_clean)*100:.1f}%)\n"
+        info += f"死亡 (1): {n1:,} ({n1/len(df_clean)*100:.1f}%)\n"
+        info += f"不平衡比例: {ratio:.2f}:1\n"
+        info += f"\n⚖️ 類別權重設定\n"
+        info += f"{'='*50}\n"
+        info += f"計算方法: {weight_method}\n"
+        info += f"存活權重: {w0:.3f}\n"
+        info += f"死亡權重: {w1:.3f}\n"
+        info += f"權重比例: 1:{w1/w0:.2f}\n"
+        # ========== 模型與分詞器初始化 ==========
+        info += f"\n🤖 模型配置\n"
+        info += f"{'='*50}\n"
+        info += f"基礎模型: {base_model}\n"
+        info += f"模型路徑: {model_name}\n"
+        info += f"微調方法: {method.upper()}\n"
         tokenizer = BertTokenizer.from_pretrained(model_name)
+        # ========== 資料集準備 ==========
         dataset = Dataset.from_pandas(df_clean[['text', 'label']])
+        def preprocess(examples):
+            return tokenizer(
+                examples['text'],
+                truncation=True,
+                padding='max_length',
+                max_length=128
+            )
         tokenized = dataset.map(preprocess, batched=True, remove_columns=['text'])
+        split = tokenized.train_test_split(test_size=0.2, seed=42, stratify=tokenized['label'])
+        # ========== 設備配置 ==========
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        info += f"運算設備: {'GPU ✅ (' + torch.cuda.get_device_name(0) + ')' if torch.cuda.is_available() else 'CPU ⚠️'}\n"
+        # ========== 評估基準模型 ==========
+        info += f"\n📏 基準模型評估\n"
+        info += f"{'='*50}\n"
+        info += f"正在評估未微調的 {base_model}...\n"
         baseline_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
         baseline_model = baseline_model.to(device)
+        baseline_perf = evaluate_baseline(
+            baseline_model, tokenizer, split['test'], device, batch_size=batch_size*2
+        )
+        info += f"基準 F1 分數: {baseline_perf['f1']:.4f}\n"
+        info += f"基準準確率: {baseline_perf['accuracy']:.4f}\n"
+        # 清理基準模型記憶體
         del baseline_model
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         gc.collect()
+        # ========== 配置微調模型 ==========
+        info += f"\n🔧 微調配置\n"
+        info += f"{'='*50}\n"
         model = BertForSequenceClassification.from_pretrained(
+            model_name,
+            num_labels=2,
             hidden_dropout_prob=dropout,
             attention_probs_dropout_prob=dropout
         )
+        # 應用 PEFT 方法
         peft_applied = False
         if method == "lora":
+            from peft import LoraConfig, get_peft_model, TaskType
             config = LoraConfig(
+                task_type=TaskType.SEQ_CLS,
+                r=int(lora_r),
                 lora_alpha=int(lora_alpha),
+                lora_dropout=lora_dropout,
+                target_modules=["query", "value"],
                 bias="none"
             )
             model = get_peft_model(model, config)
             peft_applied = True
+            info += f"✅ LoRA 已套用\n"
+            info += f"  - Rank (r): {int(lora_r)}\n"
+            info += f"  - Alpha: {int(lora_alpha)}\n"
+            info += f"  - Dropout: {lora_dropout}\n"
         elif method == "adalora":
+            from peft import AdaLoraConfig, get_peft_model, TaskType
             config = AdaLoraConfig(
+                task_type=TaskType.SEQ_CLS,
+                r=int(lora_r),
                 lora_alpha=int(lora_alpha),
+                lora_dropout=lora_dropout,
                 target_modules=["query", "value"],
+                init_r=12,
+                target_r=int(lora_r),
+                tinit=200,
+                tfinal=1000,
+                deltaT=10
             )
             model = get_peft_model(model, config)
             peft_applied = True
+            info += f"✅ AdaLoRA 已套用\n"
+            info += f"  - Initial Rank: 12\n"
+            info += f"  - Target Rank: {int(lora_r)}\n"
+            info += f"  - Alpha: {int(lora_alpha)}\n"
+        elif method == "full":
+            info += f"✅ Full Fine-tuning 模式\n"
+            peft_applied = False
         model = model.to(device)
+        # 參數統計
+        total_params = sum(p.numel() for p in model.parameters())
+        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        info += f"\n💾 模型參數\n"
+        info += f"{'='*50}\n"
+        info += f"總參數量: {total_params:,}\n"
+        info += f"可訓練參數: {trainable_params:,}\n"
+        info += f"可訓練比例: {trainable_params/total_params*100:.2f}%\n"
+        info += f"記憶體節省: {(1 - trainable_params/total_params)*100:.1f}%\n"
+        # ========== 準備訓練 ==========
         weights = torch.tensor([w0, w1], dtype=torch.float).to(device)
+        use_focal = ratio > 10  # 極度不平衡時使用 Focal Loss
+        if use_focal:
+            info += f"\n⚡ 特殊設定\n"
+            info += f"{'='*50}\n"
+            info += f"使用 Focal Loss (γ=2.0) 處理極度不平衡\n"
+        # 訓練參數
+        training_args = TrainingArguments(
+            output_dir='./results',
             num_train_epochs=int(num_epochs),
+            per_device_train_batch_size=int(batch_size),
+            per_device_eval_batch_size=int(batch_size) * 2,
+            learning_rate=float(learning_rate),
             weight_decay=float(weight_decay),
+            evaluation_strategy="epoch",
+            save_strategy="no",  # 使用自定義保存策略
+            load_best_model_at_end=False,
+            report_to="none",
+            logging_steps=max(1, len(split['train']) // (int(batch_size) * 10)),
+            warmup_steps=min(500, len(split['train']) // int(batch_size)),
+            logging_first_step=True,
+            remove_unused_columns=False,
+            label_smoothing_factor=0.1 if ratio > 20 else 0.0,  # 極度不平衡時使用標籤平滑
         )
+        # 創建監控器
+        monitor = TrainingMonitor()
+        # 創建自定義 Trainer
+        trainer = CustomTrainer(
+            model=model,
+            args=training_args,
             train_dataset=split['train'],
+            eval_dataset=split['test'],
             compute_metrics=compute_metrics,
             class_weights=weights,
+            use_focal_loss=use_focal,
+            focal_gamma=2.0,
+            monitor=monitor,
+            early_stopping_patience=patience if use_early_stopping else 999,
+            early_stopping_metric=f'eval_{best_metric}'
         )
+        info += f"\n🚀 訓練設定\n"
+        info += f"{'='*50}\n"
+        info += f"訓練樣本: {len(split['train']):,}\n"
+        info += f"測試樣本: {len(split['test']):,}\n"
+        info += f"批次大小: {int(batch_size)}\n"
+        info += f"訓練輪數: {int(num_epochs)}\n"
+        info += f"批次數/輪: {len(split['train']) // int(batch_size)}\n"
+        info += f"Early Stopping: {'開啟 (patience=' + str(patience) + ')' if use_early_stopping else '關閉'}\n"
+        info += f"最佳指標: {best_metric}\n"
+        info += f"\n⏳ 開始訓練...\n"
+        info += f"{'='*50}\n"
+        # ========== 執行訓練 ==========
         train_result = trainer.train()
+        # 載入最佳模型
+        if use_early_stopping:
+            trainer.load_best_model()
+        # 最終評估
+        final_results = trainer.evaluate()
+        # ========== 保存模型與結果 ==========
         model_counter += 1
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        model_id = f"{base_model}_{method}_{model_counter}_{timestamp}"
         trained_models[model_id] = {
+            'model': model,
+            'tokenizer': tokenizer,
+            'results': final_results,
+            'baseline': baseline_perf,
             'config': {
+                'type': base_model,
+                'model_name': model_name,
+                'method': method,
+                'metric': best_metric,
+                'epochs': int(num_epochs),
+                'batch_size': int(batch_size),
+                'learning_rate': float(learning_rate),
+                'weight_method': weight_method,
+                'weight_mult': weight_mult
             },
+            'timestamp': timestamp,
+            'monitor': monitor  # 保存訓練歷史
         }
+        training_histories[model_id] = monitor.history
+        info += f"\n✅ 訓練完成！\n"
+        info += f"最終 Training Loss: {train_result.training_loss:.4f}\n"
+        if monitor.history['best_epoch']:
+            info += f"最佳 Epoch: {monitor.history['best_epoch']}\n"
+        # ========== 準備輸出結果 ==========
+        # 基準模型結果
+        baseline_output = format_baseline_results(baseline_perf)
+        # 微調模型結果
+        finetuned_output = format_finetuned_results(model_id, final_results)
+        # 比較結果
+        comparison_output = format_comparison_results(baseline_perf, final_results)
+        # 訓練歷程
+        history_output = monitor.get_summary()
+        return info, baseline_output, finetuned_output, comparison_output, history_output
     except Exception as e:
         import traceback
+        error_msg = f"❌ 錯誤發生\n\n錯誤類型: {type(e).__name__}\n錯誤訊息: {str(e)}\n\n"
+        error_msg += f"詳細追蹤:\n{traceback.format_exc()}"
+        return error_msg, "", "", "", ""
+# ==================== 格式化輸出函數 ====================
+def format_baseline_results(baseline_perf: Dict) -> str:
+    """格式化基準模型結果"""
+    output = "🔬 純 BERT（未微調）\n\n"
+    output += "📊 模型表現\n"
+    output += f"{'='*30}\n"
+    output += f"F1 Score:     {baseline_perf['f1']:.4f}\n"
+    output += f"Accuracy:     {baseline_perf['accuracy']:.4f}\n"
+    output += f"Precision:    {baseline_perf['precision']:.4f}\n"
+    output += f"Recall:       {baseline_perf['recall']:.4f}\n"
+    output += f"Sensitivity:  {baseline_perf['sensitivity']:.4f}\n"
+    output += f"Specificity:  {baseline_perf['specificity']:.4f}\n"
+    output += f"PPV:          {baseline_perf['ppv']:.4f}\n"
+    output += f"NPV:          {baseline_perf['npv']:.4f}\n\n"
+    output += "📈 混淆矩陣\n"
+    output += f"{'='*30}\n"
+    output += f"       預測 0    預測 1\n"
+    output += f"實際 0   {baseline_perf['tn']:4d}     {baseline_perf['fp']:4d}\n"
+    output += f"實際 1   {baseline_perf['fn']:4d}     {baseline_perf['tp']:4d}\n"
+    return output
+def format_finetuned_results(model_id: str, results: Dict) -> str:
+    """格式化微調模型結果"""
+    output = f"✅ 微調 BERT\n"
+    output += f"模型 ID: {model_id}\n\n"
+    output += "📊 模型表現\n"
+    output += f"{'='*30}\n"
+    output += f"F1 Score:     {results['eval_f1']:.4f}\n"
+    output += f"Accuracy:     {results['eval_accuracy']:.4f}\n"
+    output += f"Precision:    {results['eval_precision']:.4f}\n"
+    output += f"Recall:       {results['eval_recall']:.4f}\n"
+    output += f"Sensitivity:  {results['eval_sensitivity']:.4f}\n"
+    output += f"Specificity:  {results['eval_specificity']:.4f}\n"
+    output += f"PPV:          {results['eval_ppv']:.4f}\n"
+    output += f"NPV:          {results['eval_npv']:.4f}\n\n"
+    output += "📈 混淆矩陣\n"
+    output += f"{'='*30}\n"
+    output += f"       預測 0    預測 1\n"
+    output += f"實際 0   {results['eval_tn']:4d}     {results['eval_fp']:4d}\n"
+    output += f"實際 1   {results['eval_fn']:4d}     {results['eval_tp']:4d}\n"
+    return output
+def format_comparison_results(baseline_perf: Dict, finetuned_results: Dict) -> str:
+    """格式化比較結果"""
+    output = "📊 純 BERT vs 微調 BERT 比較\n\n"
+    output += "指標改善分析:\n"
+    output += f"{'='*50}\n"
+    output += f"{'指標':<12} {'基準':>8} {'微調':>8} {'變化':>10} {'改善率':>10}\n"
+    output += f"{'-'*50}\n"
+    metrics = [
+        ('F1', 'f1', 'eval_f1'),
+        ('Accuracy', 'accuracy', 'eval_accuracy'),
+        ('Precision', 'precision', 'eval_precision'),
+        ('Recall', 'recall', 'eval_recall'),
+        ('Sensitivity', 'sensitivity', 'eval_sensitivity'),
+        ('Specificity', 'specificity', 'eval_specificity'),
+        ('PPV', 'ppv', 'eval_ppv'),
+        ('NPV', 'npv', 'eval_npv')
+    ]
+    for name, base_key, fine_key in metrics:
+        base_val = baseline_perf[base_key]
+        fine_val = finetuned_results[fine_key]
+        change = fine_val - base_val
+        improve = calculate_improvement(base_val, fine_val)
+        output += f"{name:<12} {base_val:>8.4f} {fine_val:>8.4f} "
+        output += f"{change:+10.4f} {format_improvement(improve):>10}\n"
+    output += f"\n混淆矩陣變化:\n"
+    output += f"{'='*40}\n"
+    output += f"{'項目':<10} {'基準':>8} {'微調':>8} {'變化':>10}\n"
+    output += f"{'-'*40}\n"
+    cm_items = [
+        ('True Pos', 'tp', 'eval_tp'),
+        ('True Neg', 'tn', 'eval_tn'),
+        ('False Pos', 'fp', 'eval_fp'),
+        ('False Neg', 'fn', 'eval_fn')
+    ]
+    for name, base_key, fine_key in cm_items:
+        base_val = baseline_perf[base_key]
+        fine_val = finetuned_results[fine_key]
+        change = fine_val - base_val
+        output += f"{name:<10} {base_val:>8d} {fine_val:>8d} {change:+10d}\n"
+    # 總結
+    output += f"\n📈 整體評估:\n"
+    output += f"{'='*40}\n"
+    f1_improve = calculate_improvement(baseline_perf['f1'], finetuned_results['eval_f1'])
+    if f1_improve > 10:
+        output += "✅ 顯著改善：微調帶來明顯的性能提升！\n"
+    elif f1_improve > 0:
+        output += "✅ 有所改善：微調產生正向影響。\n"
+    elif f1_improve == 0:
+        output += "➖ 無變化：微調未產生明顯影響。\n"
+    else:
+        output += "⚠️ 性能下降：可能需要調整超參數。\n"
+    return output
+# ==================== 預測函數（改進版） ====================
 def predict(model_id, text):
+    """使用選定模型進行預測並與基準模型比較"""
     if not model_id or model_id not in trained_models:
+        return "❌ 請選擇一個已訓練的模型"
+    if not text or len(text.strip()) == 0:
+        return "❌ 請輸入要預測的文字"
     try:
+        # 獲取模型資訊
         info = trained_models[model_id]
+        model = info['model']
+        tokenizer = info['tokenizer']
         config = info['config']
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # 文字預處理
+        inputs = tokenizer(
+            text,
+            return_tensors="pt",
+            truncation=True,
+            padding=True,
+            max_length=128
+        )
+        inputs_device = {k: v.to(device) for k, v in inputs.items()}
+        # ========== 微調模型預測 ==========
         model.eval()
         with torch.no_grad():
+            outputs = model(**inputs_device)
+            logits = outputs.logits
+            probs_finetuned = torch.nn.functional.softmax(logits, dim=-1)
             pred_finetuned = torch.argmax(probs_finetuned, dim=-1).item()
+            confidence_finetuned = probs_finetuned[0][pred_finetuned].item()
+        # ========== 基準模型預測 ==========
+        baseline_model = get_cached_baseline_model(config['model_name'])
+        baseline_model.eval()
         with torch.no_grad():
+            outputs_baseline = baseline_model(**inputs_device)
+            logits_baseline = outputs_baseline.logits
+            probs_baseline = torch.nn.functional.softmax(logits_baseline, dim=-1)
             pred_baseline = torch.argmax(probs_baseline, dim=-1).item()
+            confidence_baseline = probs_baseline[0][pred_baseline].item()
+        # ========== 格式化輸出 ==========
+        result_finetuned = "🟢 存活" if pred_finetuned == 0 else "🔴 死亡"
+        result_baseline = "🟢 存活" if pred_baseline == 0 else "🔴 死亡"
         agreement = "✅ 一致" if pred_finetuned == pred_baseline else "⚠️ 不一致"
+        output = f"""🔮 預測結果比較分析
+📝 輸入文字
+{'='*60}
+{text[:200]}{'...' if len(text) > 200 else ''}
+{'='*60}
+🎯 微調模型預測 ({model_id})
+{'='*60}
+預測結果: {result_finetuned}
+預測信心: {confidence_finetuned:.1%}
 機率分布:
+  • 存活 (0): {probs_finetuned[0][0].item():.2%}
+  • 死亡 (1): {probs_finetuned[0][1].item():.2%}
+模型配置:
+  • 方法: {config['method'].upper()}
+  • 基礎模型: {config['type']}
+  • 訓練輪數: {config['epochs']}
+{'='*60}
+🔬 基準模型預測（未微調 {config['type']}）
+{'='*60}
+預測結果: {result_baseline}
+預測信心: {confidence_baseline:.1%}
 機率分布:
+  • 存活 (0): {probs_baseline[0][0].item():.2%}
+  • 死亡 (1): {probs_baseline[0][1].item():.2%}
+{'='*60}
+📊 預測分析
+{'='*60}
 兩模型預測: {agreement}
 """
         if pred_finetuned != pred_baseline:
+            output += f"""
+💡 差異分析:
+微調模型預測【{result_finetuned}】（信心: {confidence_finetuned:.1%}）
+基準模型預測【{result_baseline}】（信心: {confidence_baseline:.1%}）
+這種差異顯示了微調對此特定案例的影響。
+微調模型可能學習到了更適合您資料集的特徵。
+"""
+        else:
+            output += f"""
+✅ 預測一致性分析:
+兩個模型都預測為【{result_finetuned}】
+信心差異: {abs(confidence_finetuned - confidence_baseline):.1%}
+"""
+        # 加入模型整體表現對比
+        f1_improve = calculate_improvement(
+            info['baseline']['f1'],
+            info['results']['eval_f1']
+        )
         output += f"""
+📈 模型整體表現對比
+{'='*60}
 微調模型 F1: {info['results']['eval_f1']:.4f}
 基準模型 F1: {info['baseline']['f1']:.4f}
+改善幅度: {format_improvement(f1_improve)}
+微調模型準確率: {info['results']['eval_accuracy']:.4f}
+基準模型準確率: {info['baseline']['accuracy']:.4f}
 """
         return output
     except Exception as e:
         import traceback
+        return f"❌ 預測時發生錯誤\n\n{str(e)}\n\n{traceback.format_exc()}"
+# ==================== 模型比較函數 ====================
+def compare_models():
+    """比較所有已訓練的模型"""
     if not trained_models:
+        return "❌ 尚未訓練任何模型。請先在「訓練」頁面訓練模型。"
+    output = "# 📊 模型比較報告\n\n"
+    output += f"共有 {len(trained_models)} 個已訓練模型\n\n"
+    # 微調模型表現表格
+    output += "## 🎯 微調模型表現\n\n"
+    output += "| 模型 ID | 基礎模型 | 方法 | F1 | 準確率 | 精確率 | 召回率 | 敏感度 | 特異度 |\n"
+    output += "|---------|----------|------|-----|--------|--------|--------|--------|--------|\n"
+    for model_id, info in trained_models.items():
         r = info['results']
         c = info['config']
+        # 縮短模型 ID 顯示
+        short_id = f"{c['type']}_{c['method']}_{info['timestamp'][-6:]}"
+        output += f"| {short_id} | {c['type']} | {c['method'].upper()} | "
+        output += f"{r['eval_f1']:.4f} | {r['eval_accuracy']:.4f} | "
+        output += f"{r['eval_precision']:.4f} | {r['eval_recall']:.4f} | "
+        output += f"{r['eval_sensitivity']:.4f} | {r['eval_specificity']:.4f} |\n"
+    # 基準模型表現
+    output += "\n## 🔬 基準模型表現（未微調）\n\n"
+    # 獲取唯一的基準模型
+    unique_baselines = {}
+    for model_id, info in trained_models.items():
+        base_type = info['config']['type']
+        if base_type not in unique_baselines:
+            unique_baselines[base_type] = info['baseline']
+    output += "| 基礎模型 | F1 | 準確率 | 精確率 | 召回率 | 敏感度 | 特異度 |\n"
+    output += "|----------|-----|--------|--------|--------|--------|--------|\n"
+    for base_type, baseline in unique_baselines.items():
+        output += f"| {base_type} | {baseline['f1']:.4f} | {baseline['accuracy']:.4f} | "
+        output += f"{baseline['precision']:.4f} | {baseline['recall']:.4f} | "
+        output += f"{baseline['sensitivity']:.4f} | {baseline['specificity']:.4f} |\n"
+    # 最佳模型分析
+    output += "\n## 🏆 最佳模型（各指標）\n\n"
+    metrics_to_check = [
+        ('F1 Score', 'eval_f1'),
+        ('準確率', 'eval_accuracy'),
+        ('精確率', 'eval_precision'),
+        ('召回率', 'eval_recall'),
+        ('敏感度', 'eval_sensitivity'),
+        ('特異度', 'eval_specificity')
+    ]
+    for metric_name, metric_key in metrics_to_check:
+        best_model = max(
+            trained_models.items(),
+            key=lambda x: x[1]['results'][metric_key]
         )
+        model_id = best_model[0]
+        value = best_model[1]['results'][metric_key]
+        baseline_val = best_model[1]['baseline'][metric_key.replace('eval_', '')]
+        improvement = calculate_improvement(baseline_val, value)
+        output += f"**{metric_name}**: {model_id[:30]}... "
+        output += f"({value:.4f}, 改善 {format_improvement(improvement)})\n\n"
+    # 改善統計
+    output += "## 📈 改善統計\n\n"
+    improvements = []
+    for model_id, info in trained_models.items():
+        f1_base = info['baseline']['f1']
+        f1_fine = info['results']['eval_f1']
+        improve = calculate_improvement(f1_base, f1_fine)
+        if improve != float('inf'):
+            improvements.append({
+                'model': model_id,
+                'improvement': improve,
+                'method': info['config']['method']
+            })
+    if improvements:
+        avg_improvement = np.mean([x['improvement'] for x in improvements])
+        max_improvement = max(improvements, key=lambda x: x['improvement'])
+        min_improvement = min(improvements, key=lambda x: x['improvement'])
+        output += f"平均 F1 改善: {format_improvement(avg_improvement)}\n"
+        output += f"最大改善: {max_improvement['model'][:30]}... ({format_improvement(max_improvement['improvement'])})\n"
+        output += f"最小改善: {min_improvement['model'][:30]}... ({format_improvement(min_improvement['improvement'])})\n\n"
+        # 方法比較
+        method_improvements = {}
+        for imp in improvements:
+            method = imp['method']
+            if method not in method_improvements:
+                method_improvements[method] = []
+            method_improvements[method].append(imp['improvement'])
+        output += "### 各方法平均改善:\n"
+        for method, imps in method_improvements.items():
+            avg_imp = np.mean(imps)
+            output += f"- **{method.upper()}**: {format_improvement(avg_imp)}\n"
+    return output
+# ==================== Gradio UI ====================
+def create_demo():
+    """創建 Gradio 介面"""
+    with gr.Blocks(
+        title="BERT Fine-tuning 教學平台",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {font-family: 'Microsoft JhengHei', 'Arial', sans-serif;}
+        """
+    ) as demo:
+        gr.Markdown(
+            """
+            # 🧬 BERT Fine-tuning 教學平台
+            ### 比較基準模型 vs 微調模型的表現差異（改進版）
+            """
+        )
+        with gr.Tab("🎯 訓練"):
+            gr.Markdown("## 步驟 1: 選擇基礎模型")
+            base_model = gr.Dropdown(
+                choices=["BERT-base", "BERT-base-chinese", "BioBERT", "SciBERT"],
+                value="BERT-base",
+                label="基礎模型",
+                info="選擇適合您資料的預訓練模型"
+            )
+            gr.Markdown("## 步驟 2: 選擇微調方法")
+            method = gr.Radio(
+                choices=["lora", "adalora", "full"],
+                value="lora",
+                label="微調方法",
+                info="LoRA 和 AdaLoRA 是參數高效方法，Full 是完全微調"
+            )
+            gr.Markdown("## 步驟 3: 上傳資料")
+            csv_file = gr.File(
+                label="CSV 檔案（需包含 Text 和 label 欄位）",
+                file_types=[".csv"]
+            )
+            gr.Markdown("## 步驟 4: 設定訓練參數")
+            with gr.Accordion("🎯 基本訓練參數", open=True):
+                with gr.Row():
+                    num_epochs = gr.Number(
+                        value=5, label="訓練輪數", minimum=1, maximum=50, precision=0,
+                        info="建議 3-10 輪，過多可能過擬合"
+                    )
+                    batch_size = gr.Number(
+                        value=8, label="批次大小", minimum=1, maximum=64, precision=0,
+                        info="GPU 記憶體不足時請降低"
+                    )
+                    learning_rate = gr.Number(
+                        value=3e-5, label="學習率", minimum=1e-6, maximum=1e-3,
+                        info="建議 1e-5 到 5e-5"
+                    )
+            with gr.Accordion("⚙️ 進階參數"):
+                with gr.Row():
+                    weight_decay = gr.Number(
+                        value=0.01, label="權重衰減", minimum=0, maximum=1,
+                        info="防止過擬合，建議 0.01-0.1"
+                    )
+                    dropout = gr.Number(
+                        value=0.1, label="Dropout 率", minimum=0, maximum=0.5,
+                        info="防止過擬合，建議 0.1-0.3"
+                    )
+            with gr.Accordion("🔧 PEFT 參數（LoRA/AdaLoRA）"):
+                with gr.Row():
+                    lora_r = gr.Number(
+                        value=16, label="LoRA Rank (r)", minimum=1, maximum=64, precision=0,
+                        info="越大表達能力越強，但參數越多"
+                    )
+                    lora_alpha = gr.Number(
+                        value=32, label="LoRA Alpha", minimum=1, maximum=128, precision=0,
+                        info="通常設為 Rank 的 2 倍"
+                    )
+                    lora_dropout = gr.Number(
+                        value=0.05, label="LoRA Dropout", minimum=0, maximum=0.5,
+                        info="LoRA 層的 dropout"
+                    )
+            with gr.Accordion("⚖️ 類別平衡設定"):
+                with gr.Row():
+                    weight_mult = gr.Number(
+                        value=1.0, label="權重倍數", minimum=0.1, maximum=5.0,
+                        info="調整少數類權重的倍數"
+                    )
+                    weight_method = gr.Dropdown(
+                        choices=["sqrt", "log", "balanced", "custom"],
+                        value="sqrt",
+                        label="權重計算方法",
+                        info="sqrt 和 log 適合極度不平衡資料"
+                    )
+            with gr.Accordion("🎯 訓練策略"):
+                with gr.Row():
+                    best_metric = gr.Dropdown(
+                        choices=["f1", "accuracy", "precision", "recall", "sensitivity", "specificity"],
+                        value="f1",
+                        label="最佳模型指標",
+                        info="根據此指標選擇最佳模型"
+                    )
+                    use_early_stopping = gr.Checkbox(
+                        value=True, label="啟用 Early Stopping",
+                        info="當模型不再改善時提前停止"
+                    )
+                    patience = gr.Number(
+                        value=3, label="Patience", minimum=1, maximum=10, precision=0,
+                        info="幾輪無改善後停止訓練"
+                    )
+            train_btn = gr.Button("🚀 開始訓練", variant="primary", size="lg")
+            gr.Markdown("## 📊 訓練結果")
+            with gr.Row():
+                data_info = gr.Textbox(label="📋 訓練資訊", lines=25)
+                history_output = gr.Textbox(label="📈 訓練歷程", lines=25)
+            with gr.Row():
+                baseline_result = gr.Textbox(label="🔬 基準模型（未微調）", lines=15)
+                finetuned_result = gr.Textbox(label="✅ 微調模型", lines=15)
+            comparison_result = gr.Textbox(label="📊 效能比較分析", lines=20)
+            train_btn.click(
+                train_bert_model,
+                inputs=[
+                    csv_file, base_model, method, num_epochs, batch_size, learning_rate,
+                    weight_decay, dropout, lora_r, lora_alpha, lora_dropout,
+                    weight_mult, weight_method, best_metric, use_early_stopping, patience
+                ],
+                outputs=[data_info, baseline_result, finetuned_result, comparison_result, history_output]
+            )
+        with gr.Tab("🔮 預測"):
+            gr.Markdown("## 使用訓練好的模型進行預測")
+            with gr.Row():
+                model_dropdown = gr.Dropdown(
+                    label="選擇模型",
+                    choices=list(trained_models.keys()),
+                    interactive=True
+                )
+                refresh_btn = gr.Button("🔄 刷新模型列表", size="sm")
+            text_input = gr.Textbox(
+                label="輸入要預測的文字",
+                lines=5,
+                placeholder="請輸入病例描述或相關文字..."
+            )
+            predict_btn = gr.Button("🎯 執行預測", variant="primary", size="lg")
+            pred_output = gr.Textbox(label="預測結果與分析", lines=25)
+            # 刷新模型列表
+            refresh_btn.click(
+                lambda: gr.Dropdown(choices=list(trained_models.keys())),
+                outputs=[model_dropdown]
+            )
+            # 執行預測
+            predict_btn.click(
+                predict,
+                inputs=[model_dropdown, text_input],
+                outputs=[pred_output]
+            )
+            # 範例
+            gr.Examples(
+                examples=[
+                    ["Patient with stage II breast cancer, showing good response to chemotherapy treatment."],
+                    ["Advanced metastatic cancer with multiple organ failure, poor prognosis."],
+                    ["Early stage tumor detected, surgery scheduled, excellent recovery expected."],
+                    ["Terminal stage disease, palliative care initiated, family counseling provided."]
+                ],
+                inputs=text_input
+            )
+        with gr.Tab("📊 比較"):
+            gr.Markdown("## 比較所有已訓練的模型")
+            compare_btn = gr.Button("📊 生成比較報告", variant="primary", size="lg")
+            compare_output = gr.Markdown()
+            compare_btn.click(compare_models, outputs=[compare_output])
+        with gr.Tab("📖 說明"):
+            gr.Markdown("""
+            ## 📖 使用說明
+            ### 🎯 平台特色
+            本改進版平台提供以下功能：
+            1. **自動基準比較**：每次訓練都會自動評估基準模型，清楚顯示微調的改善
+            2. **訓練監控**：記錄每個 epoch 的詳細訓練歷程
+            3. **Early Stopping**：避免過擬合，自動選擇最佳模型
+            4. **多種權重策略**：針對不平衡資料提供多種處理方法
+            5. **完整評估指標**：包含 F1、準確率、精確率、召回率、敏感度、特異度、PPV、NPV
+            ### 🤖 支援的基礎模型
+            - **BERT-base**: 標準英文 BERT，適用於一般英文文本
+            - **BERT-base-chinese**: 中文 BERT，適用於中文文本
+            - **BioBERT**: 生物醫學領域專用 BERT
+            - **SciBERT**: 科學文獻專用 BERT
+            ### 🔧 微調方法說明
+            - **LoRA** (Low-Rank Adaptation)
+              - 參數效率最高，只訓練 <1% 參數
+              - 訓練速度快，記憶體需求低
+              - 適合大多數場景
+            - **AdaLoRA** (Adaptive LoRA)
+              - 自動調整秩的分配
+              - 可能獲得更好的效果
+              - 訓練時間稍長
+            - **Full** (完全微調)
+              - 訓練所有參數
+              - 可能獲得最佳效果
+              - 需要較大記憶體和時間
+            ### ⚖️ 處理不平衡資料
+            #### 權重計算方法：
+            1. **sqrt** (平方根法) - 推薦用於極度不平衡
+               - 使用平方根緩和權重
+               - 避免權重過大導致過擬合
+            2. **log** (對數法) - 更保守的方法
+               - 使用對數進一步緩和
+               - 適合極度不平衡且容易過擬合的情況
+            3. **balanced** (平衡法)
+               - sklearn 風格的自動平衡
+               - 適合中度不平衡
+            4. **custom** (自定義)
+               - 根據不平衡程度自動調整
+               - 綜合考慮多種因素
+            #### 建議參數設定：
+            **極度不平衡 (>20:1)**
+            - 權重方法: sqrt 或 log
+            - 權重倍數: 0.5-1.0
+            - 使用 Focal Loss (自動啟用)
+            - Early Stopping: 建議開啟
+            **高度不平衡 (10-20:1)**
+            - 權重方法: sqrt
+            - 權重倍數: 0.8-1.5
+            - Early Stopping: 建議開啟
+            **中度不平衡 (5-10:1)**
+            - 權重方法: balanced
+            - 權重倍數: 1.0-2.0
+            **輕度不平衡 (<5:1)**
+            - 權重方法: balanced
+            - 權重倍數: 1.5-3.0
+            ### 📊 評估指標說明
+            - **F1 Score**: 精確率和召回率的調和平均，適合不平衡資料
+            - **Accuracy**: 整體準確率
+            - **Precision**: 預測為正類中實際為正類的比例
+            - **Recall/Sensitivity**: 實際正類中被正確預測的比例
+            - **Specificity**: 實際負類中被正確預測的比例
+            - **PPV**: 陽性預測值
+            - **NPV**: 陰性預測值
+            ### 🚀 快速開始指南
+            1. **準備資料**
+               - CSV 格式，包含 `Text` 和 `label` 欄位
+               - label: 0=負類(如存活), 1=正類(如死亡)
+            2. **選擇模型與方法**
+               - 英文資料：BERT-base + LoRA
+               - 中文資料：BERT-base-chinese + LoRA
+               - 醫學資料：BioBERT + LoRA
+            3. **設定參數**
+               - 使用預設參數作為起點
+               - 根據資料不平衡程度調整權重設定
+            4. **訓練與評估**
+               - 點擊「開始訓練」
+               - 查看基準 vs 微調的比較
+               - 觀察訓練歷程
+            5. **測試預測**
+               - 在「預測」頁面選擇模型
+               - 輸入文字進行預測
+               - 比較微調前後的差異
+            ### ⚠️ 注意事項
+            - GPU 可大幅加速訓練
+            - 批次大小過大可能導致記憶體不足
+            - Early Stopping 可避免過擬合
+            - 極度不平衡資料建議使用較保守的權重設定
+            ### 💡 優化建議
+            1. **記憶體不足**：降低批次大小或使用 LoRA
+            2. **過擬合**：增加 dropout、使用 Early Stopping、降低學習率
+            3. **欠擬合**：增加訓練輪數、提高學習率、增加模型容量
+            4. **不平衡資料**：調整類別權重、使用適當的評估指標(F1)
+            """)
+    return demo
+# ==================== 主程式 ====================
 if __name__ == "__main__":
+    demo = create_demo()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
+        max_threads=4
     )