"""Siswati ⇄ English translation app with linguistic analysis, built on Gradio.

Loads two M2M100 fine-tuned models (``dsfsi/en-ss-m2m100-combo`` and
``dsfsi/ss-en-m2m100-combo``), exposes single and batch translation,
computes surface linguistic metrics for source and target text, and keeps
an exportable translation history for corpus research.

NOTE(review): this file was recovered from a whitespace-mangled paste; the
HTML fragments inside ``gr.HTML`` literals lost their markup tags and have
been reconstructed from the surviving text and the CSS classes defined in
``custom_css`` — confirm against the original deployment.
"""

import gradio as gr
import logging
import time
import json
import csv
import io
import os
import re
import tempfile
import traceback
from datetime import datetime
from typing import Dict, List, Optional, Tuple

from transformers import pipeline

# Configure logging for debugging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class LinguisticTranslationApp:
    """Owns the translation pipelines, per-text analysis and session history."""

    def __init__(self):
        self.translators = {}          # direction key -> transformers pipeline
        self.translation_history = []  # list of dict entries; see translate_text
        self.load_models()

    def load_models(self):
        """Load translation models with error handling.

        Raises:
            Exception: whatever ``transformers.pipeline`` raises if either
                model cannot be downloaded/loaded.
        """
        try:
            logger.info("Loading translation models...")
            self.translators['en_to_ss'] = pipeline(
                "translation",
                model="dsfsi/en-ss-m2m100-combo",
                src_lang="en",
                tgt_lang="ss",
            )
            self.translators['ss_to_en'] = pipeline(
                "translation",
                model="dsfsi/ss-en-m2m100-combo",
                src_lang="ss",
                tgt_lang="en",
            )
            logger.info("Models loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading models: {str(e)}")
            raise  # bare raise preserves the original traceback

    def analyze_text_complexity(self, text: str, lang: str) -> Dict:
        """Analyze surface linguistic features of *text*.

        Args:
            text: the text to analyze.
            lang: ISO-ish language code; ``'ss'`` enables Siswati-specific
                feature counts.

        Returns:
            Dict of counts and ratios (word/sentence counts, lexical
            diversity, etc.). All ratios are 0 for empty input.
        """
        words = text.split()
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        # Basic linguistic metrics (guards against division by zero).
        analysis = {
            'character_count': len(text),
            'word_count': len(words),
            'sentence_count': len(sentences),
            'avg_word_length': sum(len(word) for word in words) / len(words) if words else 0,
            'avg_sentence_length': len(words) / len(sentences) if sentences else 0,
            'unique_words': len(set(word.lower() for word in words)),
            'lexical_diversity': len(set(word.lower() for word in words)) / len(words) if words else 0,
        }

        if lang == 'ss':  # Siswati-specific surface features
            # Words longer than 10 chars as a rough proxy for agglutination.
            analysis['potential_agglutination'] = sum(1 for word in words if len(word) > 10)
            analysis['click_consonants'] = sum(text.count(click) for click in ['c', 'q', 'x'])
            # Combining acute (U+0301) and grave (U+0300) accents as tone markers.
            analysis['tone_markers'] = text.count('\u0301') + text.count('\u0300')

        return analysis

    def translate_text(self, text: str, direction: str,
                       save_to_history: bool = True) -> Tuple[str, str, bool, Dict]:
        """Translate text with comprehensive linguistic analysis.

        Args:
            text: source text (max 2000 characters).
            direction: ``'English → Siswati'`` or ``'Siswati → English'``.
            save_to_history: append the result to ``translation_history``.

        Returns:
            Tuple ``(translated_text, status_message, success, analysis)``.
        """
        if not text or not text.strip():
            return "", "⚠️ Please enter some text to translate", False, {}
        if not direction:
            return "", "⚠️ Please select a translation direction", False, {}
        # Input validation — generous limit for linguistic work.
        if len(text) > 2000:
            return "", "⚠️ Text is too long. Please limit to 2000 characters.", False, {}

        try:
            start_time = time.time()

            # Determine source and target languages.
            if direction == 'English → Siswati':
                translator = self.translators['en_to_ss']
                source_lang, target_lang = "English", "Siswati"
                source_code, target_code = "en", "ss"
            else:
                translator = self.translators['ss_to_en']
                source_lang, target_lang = "Siswati", "English"
                source_code, target_code = "ss", "en"

            logger.info(f"Translating from {source_lang} to {target_lang}")

            source_analysis = self.analyze_text_complexity(text, source_code)

            # Beam search (no sampling) for reproducible, higher-quality output.
            result = translator(
                text,
                max_length=512,
                early_stopping=True,
                do_sample=False,
                num_beams=4,
            )
            translation = result[0]['translation_text']

            target_analysis = self.analyze_text_complexity(translation, target_code)
            processing_time = time.time() - start_time

            # Linguistic comparison of source vs. target.
            analysis = {
                'source': source_analysis,
                'target': target_analysis,
                'translation_ratio': len(translation) / len(text) if text else 0,
                'word_ratio': (target_analysis['word_count'] / source_analysis['word_count']
                               if source_analysis['word_count'] else 0),
                'processing_time': processing_time,
                'timestamp': datetime.now().isoformat(),
            }

            # Save to history for linguistic research.
            if save_to_history:
                self.translation_history.append({
                    'source_text': text,
                    'translated_text': translation,
                    'direction': direction,
                    'source_lang': source_lang,
                    'target_lang': target_lang,
                    'analysis': analysis,
                    'timestamp': datetime.now().isoformat(),
                })

            status_msg = (
                f"✅ Translation completed in {processing_time:.2f}s | "
                f"Word ratio: {analysis['word_ratio']:.2f} | "
                f"Character ratio: {analysis['translation_ratio']:.2f}"
            )
            logger.info(f"Translation completed: {processing_time:.2f}s")
            return translation, status_msg, True, analysis

        except Exception as e:
            error_msg = f"❌ Translation failed: {str(e)}"
            logger.error(f"Translation error: {str(e)}")
            logger.error(traceback.format_exc())
            return "", error_msg, False, {}

    def batch_translate(self, text_list: List[str], direction: str) -> List[Dict]:
        """Translate multiple texts for corpus analysis (history not updated)."""
        results = []
        for i, text in enumerate(text_list):
            if text.strip():
                translation, status, success, analysis = self.translate_text(
                    text, direction, False)
                results.append({
                    'index': i + 1,
                    'source': text,
                    'translation': translation,
                    'success': success,
                    'analysis': analysis,
                })
        return results

    def export_history_csv(self) -> Optional[str]:
        """Export translation history as CSV text, or None if history is empty."""
        if not self.translation_history:
            return None

        output = io.StringIO()
        writer = csv.writer(output)
        writer.writerow([
            'Timestamp', 'Source Language', 'Target Language', 'Source Text',
            'Translation', 'Source Words', 'Target Words', 'Word Ratio',
            'Source Characters', 'Target Characters', 'Character Ratio',
            'Lexical Diversity (Source)', 'Lexical Diversity (Target)',
            'Processing Time (s)',
        ])
        for entry in self.translation_history:
            analysis = entry['analysis']
            writer.writerow([
                entry['timestamp'],
                entry['source_lang'],
                entry['target_lang'],
                entry['source_text'],
                entry['translated_text'],
                analysis['source']['word_count'],
                analysis['target']['word_count'],
                analysis['word_ratio'],
                analysis['source']['character_count'],
                analysis['target']['character_count'],
                analysis['translation_ratio'],
                analysis['source']['lexical_diversity'],
                analysis['target']['lexical_diversity'],
                analysis['processing_time'],
            ])
        return output.getvalue()


# Initialize the app (loads both models eagerly).
app = LinguisticTranslationApp()

# Custom CSS for the linguistic interface.
custom_css = """
#logo { display: block; margin: 0 auto 20px auto; }
.linguistic-panel {
    background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
    border: 1px solid #0891b2; border-radius: 12px;
    padding: 20px; margin: 10px 0;
}
.analysis-metric {
    background: white; padding: 10px; border-radius: 8px;
    margin: 5px; border-left: 4px solid #0891b2;
}
.status-success { color: #059669 !important; font-weight: 500; }
.status-error { color: #DC2626 !important; font-weight: 500; }
.gradient-text {
    background: linear-gradient(45deg, #059669, #0891b2);
    -webkit-background-clip: text; -webkit-text-fill-color: transparent;
    background-clip: text;
}
.linguistic-header {
    text-align: center; margin-bottom: 30px; padding: 20px;
    background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%);
    border-radius: 16px; border: 1px solid #cbd5e1;
}
.comparison-grid {
    display: grid; grid-template-columns: 1fr 1fr;
    gap: 15px; margin: 15px 0;
}
.metric-card {
    background: white; padding: 15px; border-radius: 8px;
    border: 1px solid #e2e8f0; text-align: center;
}
"""

# Create the Gradio interface.
with gr.Blocks(css=custom_css, title="Linguistic Translation Analysis Tool",
               theme=gr.themes.Soft()) as demo:

    # Header section. NOTE(review): markup reconstructed — tags were lost.
    with gr.Row():
        with gr.Column():
            gr.HTML("""
                <div class="linguistic-header">
                    <h1 class="gradient-text">🔬 Siswati ⇄ English Linguistic Analysis Tool</h1>
                    <p>Advanced translation system with comprehensive linguistic
                    analysis for researchers, linguists, and language documentation
                    projects. Includes morphological insights, statistical analysis,
                    and corpus management features.</p>
                </div>
            """)

    # Main translation interface.
    with gr.Row():
        with gr.Column(scale=2):
            # Input section.
            with gr.Group():
                gr.HTML("<h3>📝 Translation Input</h3>")
                direction = gr.Radio(
                    choices=['English → Siswati', 'Siswati → English'],
                    label="Translation Direction",
                    value='English → Siswati',
                    interactive=True,
                )
                input_text = gr.Textbox(
                    lines=6,
                    placeholder=("Enter your text here for linguistic analysis... "
                                 "(maximum 2000 characters)"),
                    label="Source Text",
                    max_lines=12,
                    show_copy_button=True,
                )
                char_count = gr.HTML("Character count: 0/2000")
                with gr.Row():
                    translate_btn = gr.Button("🔄 Translate & Analyze",
                                              variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")

            # Output section.
            with gr.Group():
                gr.HTML("<h3>✨ Translation Output</h3>")
                output_text = gr.Textbox(
                    label="Translation",
                    lines=6,
                    max_lines=12,
                    show_copy_button=True,
                    interactive=False,
                )
                status_display = gr.HTML()

        # Linguistic analysis panel.
        with gr.Column(scale=1):
            with gr.Group():
                gr.HTML("<h3>📊 Linguistic Analysis</h3>")
                with gr.Accordion("📈 Text Metrics", open=True):
                    metrics_display = gr.HTML(
                        "<div class='analysis-metric'>"
                        "Translate text to see linguistic analysis</div>"
                    )
                with gr.Accordion("🔍 Language Features", open=False):
                    features_display = gr.HTML("")
                with gr.Accordion("⚖️ Translation Ratios", open=False):
                    ratios_display = gr.HTML("")

    # Batch processing section.
    with gr.Accordion("📚 Batch Translation & Corpus Analysis", open=False):
        with gr.Row():
            with gr.Column():
                gr.HTML("<p>Upload text file or enter multiple lines:</p>")
                batch_input = gr.File(
                    label="Upload .txt file",
                    file_types=[".txt"],
                    type="filepath",
                )
                batch_text = gr.Textbox(
                    lines=8,
                    placeholder="Or paste multiple lines here (one per line)...",
                    label="Batch Text Input",
                    show_copy_button=True,
                )
                batch_direction = gr.Radio(
                    choices=['English → Siswati', 'Siswati → English'],
                    label="Batch Translation Direction",
                    value='English → Siswati',
                )
                batch_btn = gr.Button("🔄 Process Batch", variant="primary")
            with gr.Column():
                batch_results = gr.Dataframe(
                    headers=["Index", "Source", "Translation",
                             "Words (S→T)", "Chars (S→T)"],
                    label="Batch Results",
                    interactive=False,
                )

    # Research tools section.
    with gr.Accordion("🔬 Research & Export Tools", open=False):
        with gr.Row():
            with gr.Column():
                gr.HTML("<h4>Translation History & Export</h4>")
                history_display = gr.Dataframe(
                    headers=["Timestamp", "Direction", "Source", "Translation"],
                    label="Translation History",
                    interactive=False,
                )
                with gr.Row():
                    refresh_history_btn = gr.Button("🔄 Refresh History")
                    export_csv_btn = gr.Button("📊 Export CSV", variant="secondary")
                    clear_history_btn = gr.Button("🗑️ Clear History", variant="stop")
                csv_download = gr.File(label="Download CSV", visible=False)
            with gr.Column():
                gr.HTML("<h4>Linguistic Resources</h4>")
                # TODO(review): original list items under these headings were
                # lost in extraction — restore from the deployed version.
                gr.HTML("""
                    <div class="linguistic-panel">
                        <p><strong>📖 Siswati Language Notes:</strong></p>
                        <p><strong>🔧 Research Features:</strong></p>
                    </div>
                """)

    # Examples for linguists.
    with gr.Accordion("💡 Linguistic Examples", open=False):
        examples = gr.Examples(
            examples=[
                ["The child is playing with traditional toys.", "English → Siswati"],
                ["Umntfwana udlala ngetinsisimane tesintu.", "Siswati → English"],
                ["Agglutination demonstrates morphological complexity in Bantu languages.",
                 "English → Siswati"],
                ["Lolimi lune-morphology leyinkimbinkimbi.", "Siswati → English"],
                ["What are the phonological features of this language?",
                 "English → Siswati"],
                ["Yini tinchubo te-phonology talolimi?", "Siswati → English"],
            ],
            inputs=[input_text, direction],
            label="Click examples to analyze linguistic features:",
        )

    # Footer. NOTE(review): link targets reconstructed from the model ids used
    # above; the feedback URL was lost — confirm before release.
    with gr.Row():
        with gr.Column():
            gr.HTML("""
                <div style="text-align: center; margin-top: 20px;">
                    <p>
                        <a href="https://huggingface.co/dsfsi/en-ss-m2m100-combo" target="_blank">📁 En→Ss Model Repository</a> |
                        <a href="https://huggingface.co/dsfsi/ss-en-m2m100-combo" target="_blank">📁 Ss→En Model Repository</a> |
                        <a href="https://github.com/dsfsi" target="_blank">💬 Research Feedback</a>
                    </p>
                    <p><strong>Research Team:</strong> Vukosi Marivate, Richard Lastrucci</p>
                    <p>Supporting African language documentation and computational linguistics research</p>
                    <p><em>For academic use: Please cite the original models in your publications</em></p>
                </div>
            """)

    # ------------------------------------------------------------------ #
    # Event handlers                                                     #
    # ------------------------------------------------------------------ #

    def update_char_count(text):
        """Live character counter; colour shifts near/over the 2000 limit."""
        count = len(text) if text else 0
        # Fix: `color` was computed but never used in the original.
        color = "#DC2626" if count > 2000 else "#059669" if count > 1600 else "#64748b"
        return f"<span style='color: {color}'>Character count: {count}/2000</span>"

    def clear_all():
        """Reset all input/output widgets to their empty state."""
        return "", "", update_char_count(""), "", "", "", ""

    def translate_with_analysis(text, direction):
        """Translate and render the analysis panels as HTML fragments."""
        translation, status, success, analysis = app.translate_text(text, direction)
        css_class = "status-success" if success else "status-error"
        status_html = f"<div class='{css_class}'>{status}</div>"

        if success and analysis:
            source_metrics = analysis['source']
            target_metrics = analysis['target']

            metrics_html = f"""
                <div class="comparison-grid">
                    <div class="metric-card">
                        <strong>📊 Source Text</strong>
                        <p>Words: {source_metrics['word_count']}</p>
                        <p>Characters: {source_metrics['character_count']}</p>
                        <p>Sentences: {source_metrics['sentence_count']}</p>
                        <p>Lexical Diversity: {source_metrics['lexical_diversity']:.3f}</p>
                    </div>
                    <div class="metric-card">
                        <strong>📊 Translation</strong>
                        <p>Words: {target_metrics['word_count']}</p>
                        <p>Characters: {target_metrics['character_count']}</p>
                        <p>Sentences: {target_metrics['sentence_count']}</p>
                        <p>Lexical Diversity: {target_metrics['lexical_diversity']:.3f}</p>
                    </div>
                </div>
            """

            # Siswati-specific features are only present when the source
            # analysis ran with lang == 'ss'.
            features_html = ""
            if 'potential_agglutination' in source_metrics:
                features_html = f"""
                    <div class="analysis-metric">
                        <strong>🔍 Siswati Features Detected:</strong>
                        <p>Potential agglutinated words: {source_metrics['potential_agglutination']}</p>
                        <p>Click consonants (c,q,x): {source_metrics['click_consonants']}</p>
                        <p>Tone markers: {source_metrics['tone_markers']}</p>
                    </div>
                """

            ratios_html = f"""
                <div class="analysis-metric">
                    <strong>⚖️ Translation Ratios:</strong>
                    <p>Word ratio: {analysis['word_ratio']:.3f}</p>
                    <p>Character ratio: {analysis['translation_ratio']:.3f}</p>
                    <p>Processing time: {analysis['processing_time']:.3f}s</p>
                </div>
            """
            return translation, status_html, metrics_html, features_html, ratios_html

        return translation, status_html, "", "", ""

    def process_batch(file_path, batch_text, direction):
        """Translate a file upload or pasted lines; return dataframe rows."""
        texts = []
        if file_path:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    texts = [line.strip() for line in f if line.strip()]
            except Exception as e:
                return [[f"Error reading file: {str(e)}", "", "", "", ""]]
        elif batch_text:
            texts = [line.strip() for line in batch_text.split('\n') if line.strip()]

        if not texts:
            return [["No text provided", "", "", "", ""]]

        results = app.batch_translate(texts, direction)

        display_data = []
        for r in results:
            if r['success']:
                word_ratio = (f"{r['analysis']['source']['word_count']}"
                              f"→{r['analysis']['target']['word_count']}")
                char_ratio = (f"{r['analysis']['source']['character_count']}"
                              f"→{r['analysis']['target']['character_count']}")
            else:
                word_ratio = "Error"
                char_ratio = "Error"
            display_data.append([
                r['index'],
                r['source'][:50] + "..." if len(r['source']) > 50 else r['source'],
                r['translation'][:50] + "..." if len(r['translation']) > 50 else r['translation'],
                word_ratio,
                char_ratio,
            ])
        return display_data

    def get_history():
        """Return the last 20 history entries as truncated dataframe rows."""
        if not app.translation_history:
            return []
        return [[
            entry['timestamp'][:19],  # trim microseconds
            entry['direction'],
            entry['source_text'][:50] + "..." if len(entry['source_text']) > 50
            else entry['source_text'],
            entry['translated_text'][:50] + "..." if len(entry['translated_text']) > 50
            else entry['translated_text'],
        ] for entry in app.translation_history[-20:]]

    def export_csv():
        """Write the history CSV to a temp file and expose it for download.

        Fix: the original passed raw CSV *content* to ``gr.File.update`` —
        ``gr.File`` expects a filepath, and ``Component.update`` was removed
        in Gradio 4 in favour of ``gr.update``.
        """
        csv_content = app.export_history_csv()
        if csv_content:
            filename = f"translation_history_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            path = os.path.join(tempfile.gettempdir(), filename)
            with open(path, 'w', encoding='utf-8', newline='') as f:
                f.write(csv_content)
            return gr.update(value=path, visible=True, label=f"📊 {filename}")
        return gr.update(visible=False)

    def clear_history():
        """Drop all stored history entries and empty the history table."""
        app.translation_history = []
        return []

    # Wire up events.
    input_text.change(fn=update_char_count, inputs=input_text, outputs=char_count)
    translate_btn.click(
        fn=translate_with_analysis,
        inputs=[input_text, direction],
        outputs=[output_text, status_display, metrics_display,
                 features_display, ratios_display],
    )
    clear_btn.click(
        fn=clear_all,
        outputs=[input_text, output_text, char_count, status_display,
                 metrics_display, features_display, ratios_display],
    )
    batch_btn.click(
        fn=process_batch,
        inputs=[batch_input, batch_text, batch_direction],
        outputs=batch_results,
    )
    refresh_history_btn.click(fn=get_history, outputs=history_display)
    export_csv_btn.click(fn=export_csv, outputs=csv_download)
    clear_history_btn.click(fn=clear_history, outputs=history_display)

    # Auto-translate on Enter.
    input_text.submit(
        fn=translate_with_analysis,
        inputs=[input_text, direction],
        outputs=[output_text, status_display, metrics_display,
                 features_display, ratios_display],
    )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
    )