import gradio as gr import logging import time import json import csv import io from transformers import pipeline from typing import Tuple, Optional, List, Dict import traceback from datetime import datetime import re # Configure logging for debugging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class LinguisticTranslationApp: def __init__(self): self.translators = {} self.translation_history = [] self.load_models() def load_models(self): """Load translation models with error handling""" try: logger.info("Loading translation models...") self.translators['en_to_ss'] = pipeline( "translation", model="dsfsi/en-ss-m2m100-combo", src_lang="en", tgt_lang="ss" ) self.translators['ss_to_en'] = pipeline( "translation", model="dsfsi/ss-en-m2m100-combo", src_lang="ss", tgt_lang="en" ) logger.info("Models loaded successfully!") except Exception as e: logger.error(f"Error loading models: {str(e)}") raise e def analyze_text_complexity(self, text: str, lang: str) -> Dict: """Analyze linguistic features of the input text""" words = text.split() sentences = re.split(r'[.!?]+', text) sentences = [s.strip() for s in sentences if s.strip()] # Basic linguistic metrics analysis = { 'character_count': len(text), 'word_count': len(words), 'sentence_count': len(sentences), 'avg_word_length': sum(len(word) for word in words) / len(words) if words else 0, 'avg_sentence_length': len(words) / len(sentences) if sentences else 0, 'unique_words': len(set(word.lower() for word in words)), 'lexical_diversity': len(set(word.lower() for word in words)) / len(words) if words else 0 } # Language-specific features if lang == 'ss': # Siswati # Check for common Siswati features analysis['potential_agglutination'] = sum(1 for word in words if len(word) > 10) analysis['click_consonants'] = sum(text.count(click) for click in ['c', 'q', 'x']) analysis['tone_markers'] = text.count('́') + text.count('̀') # Acute and grave accents return analysis def translate_text(self, text: str, direction: str, save_to_history: bool = True) -> Tuple[str, str, bool, Dict]: """ Translate text with comprehensive linguistic analysis Returns: Tuple[str, str, bool, Dict]: (translated_text, status_message, success, analysis) """ if not text or not text.strip(): return "", "⚠️ Please enter some text to translate", False, {} if not direction: return "", "⚠️ Please select a translation direction", False, {} # Input validation if len(text) > 2000: # Increased limit for linguistic work return "", "⚠️ Text is too long. Please limit to 2000 characters.", False, {} try: start_time = time.time() # Determine source and target languages if direction == 'English → Siswati': translator = self.translators['en_to_ss'] source_lang = "English" target_lang = "Siswati" source_code = "en" target_code = "ss" else: translator = self.translators['ss_to_en'] source_lang = "Siswati" target_lang = "English" source_code = "ss" target_code = "en" logger.info(f"Translating from {source_lang} to {target_lang}") # Analyze source text source_analysis = self.analyze_text_complexity(text, source_code) # Perform translation result = translator( text, max_length=512, early_stopping=True, do_sample=False, num_beams=4 # Better quality for linguistic analysis ) translation = result[0]['translation_text'] # Analyze translated text target_analysis = self.analyze_text_complexity(translation, target_code) # Calculate processing time processing_time = time.time() - start_time # Linguistic comparison analysis = { 'source': source_analysis, 'target': target_analysis, 'translation_ratio': len(translation) / len(text) if text else 0, 'word_ratio': target_analysis['word_count'] / source_analysis['word_count'] if source_analysis['word_count'] else 0, 'processing_time': processing_time, 'timestamp': datetime.now().isoformat() } # Save to history for linguistic research if save_to_history: history_entry = { 'source_text': text, 'translated_text': translation, 'direction': direction, 'source_lang': source_lang, 'target_lang': target_lang, 'analysis': analysis, 'timestamp': datetime.now().isoformat() } self.translation_history.append(history_entry) # Success message with linguistic metadata status_msg = f"✅ Translation completed in {processing_time:.2f}s | Word ratio: {analysis['word_ratio']:.2f} | Character ratio: {analysis['translation_ratio']:.2f}" logger.info(f"Translation completed: {processing_time:.2f}s") return translation, status_msg, True, analysis except Exception as e: error_msg = f"❌ Translation failed: {str(e)}" logger.error(f"Translation error: {str(e)}") logger.error(traceback.format_exc()) return "", error_msg, False, {} def batch_translate(self, text_list: List[str], direction: str) -> List[Dict]: """Translate multiple texts for corpus analysis""" results = [] for i, text in enumerate(text_list): if text.strip(): translation, status, success, analysis = self.translate_text(text, direction, False) results.append({ 'index': i + 1, 'source': text, 'translation': translation, 'success': success, 'analysis': analysis }) return results def export_history_csv(self) -> str: """Export translation history as CSV for linguistic analysis""" if not self.translation_history: return None output = io.StringIO() writer = csv.writer(output) # Headers writer.writerow([ 'Timestamp', 'Source Language', 'Target Language', 'Source Text', 'Translation', 'Source Words', 'Target Words', 'Word Ratio', 'Source Characters', 'Target Characters', 'Character Ratio', 'Lexical Diversity (Source)', 'Lexical Diversity (Target)', 'Processing Time (s)' ]) # Data rows for entry in self.translation_history: analysis = entry['analysis'] writer.writerow([ entry['timestamp'], entry['source_lang'], entry['target_lang'], entry['source_text'], entry['translated_text'], analysis['source']['word_count'], analysis['target']['word_count'], analysis['word_ratio'], analysis['source']['character_count'], analysis['target']['character_count'], analysis['translation_ratio'], analysis['source']['lexical_diversity'], analysis['target']['lexical_diversity'], analysis['processing_time'] ]) return output.getvalue() # Initialize the app app = LinguisticTranslationApp() # Custom CSS for linguistic interface custom_css = """ #logo { display: block; margin: 0 auto 20px auto; } .linguistic-panel { background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%); border: 1px solid #0891b2; border-radius: 12px; padding: 20px; margin: 10px 0; } .analysis-metric { background: white; padding: 10px; border-radius: 8px; margin: 5px; border-left: 4px solid #0891b2; } .status-success { color: #059669 !important; font-weight: 500; } .status-error { color: #DC2626 !important; font-weight: 500; } .gradient-text { background: linear-gradient(45deg, #059669, #0891b2); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; } .linguistic-header { text-align: center; margin-bottom: 30px; padding: 20px; background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%); border-radius: 16px; border: 1px solid #cbd5e1; } .comparison-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 15px; margin: 15px 0; } .metric-card { background: white; padding: 15px; border-radius: 8px; border: 1px solid #e2e8f0; text-align: center; } """ # Create the Gradio interface with gr.Blocks(css=custom_css, title="Linguistic Translation Analysis Tool", theme=gr.themes.Soft()) as demo: # Header section with gr.Row(): with gr.Column(): gr.HTML("""
Advanced translation system with comprehensive linguistic analysis for researchers, linguists, and language documentation projects. Includes morphological insights, statistical analysis, and corpus management features.
Words: {source_metrics['word_count']}
Characters: {source_metrics['character_count']}
Sentences: {source_metrics['sentence_count']}
Lexical Diversity: {source_metrics['lexical_diversity']:.3f}
Words: {target_metrics['word_count']}
Characters: {target_metrics['character_count']}
Sentences: {target_metrics['sentence_count']}
Lexical Diversity: {target_metrics['lexical_diversity']:.3f}
Potential agglutinated words: {source_metrics['potential_agglutination']}
Click consonants (c,q,x): {source_metrics['click_consonants']}
Tone markers: {source_metrics['tone_markers']}
Word ratio: {analysis['word_ratio']:.3f}
Character ratio: {analysis['translation_ratio']:.3f}
Processing time: {analysis['processing_time']:.3f}s