import gradio as gr
import logging
import time
import json
import csv
import io
from transformers import pipeline
from typing import Tuple, Optional, List, Dict
import traceback
from datetime import datetime
import re
# Configure logging for debugging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LinguisticTranslationApp:
def __init__(self):
self.translators = {}
self.translation_history = []
self.load_models()
def load_models(self):
"""Load translation models with error handling"""
try:
logger.info("Loading translation models...")
self.translators['en_to_ss'] = pipeline(
"translation",
model="dsfsi/en-ss-m2m100-combo",
src_lang="en",
tgt_lang="ss"
)
self.translators['ss_to_en'] = pipeline(
"translation",
model="dsfsi/ss-en-m2m100-combo",
src_lang="ss",
tgt_lang="en"
)
logger.info("Models loaded successfully!")
except Exception as e:
logger.error(f"Error loading models: {str(e)}")
raise e
def analyze_text_complexity(self, text: str, lang: str) -> Dict:
"""Analyze linguistic features of the input text"""
words = text.split()
sentences = re.split(r'[.!?]+', text)
sentences = [s.strip() for s in sentences if s.strip()]
# Basic linguistic metrics
analysis = {
'character_count': len(text),
'word_count': len(words),
'sentence_count': len(sentences),
'avg_word_length': sum(len(word) for word in words) / len(words) if words else 0,
'avg_sentence_length': len(words) / len(sentences) if sentences else 0,
'unique_words': len(set(word.lower() for word in words)),
'lexical_diversity': len(set(word.lower() for word in words)) / len(words) if words else 0
}
# Language-specific features
if lang == 'ss': # Siswati
# Check for common Siswati features
analysis['potential_agglutination'] = sum(1 for word in words if len(word) > 10)
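            # Counting every 'c', 'q', 'x' also catches non-click uses of these letters, so treat this as a rough upper bound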
analysis['click_consonants'] = sum(text.count(click) for click in ['c', 'q', 'x'])
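            # Only combining diacritics (U+0301, U+0300) are counted; precomposed accented characters would need NFD normalization first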
analysis['tone_markers'] = text.count('́') + text.count('̀') # Acute and grave accents
return analysis
def translate_text(self, text: str, direction: str, save_to_history: bool = True) -> Tuple[str, str, bool, Dict]:
"""
Translate text with comprehensive linguistic analysis
Returns:
Tuple[str, str, bool, Dict]: (translated_text, status_message, success, analysis)
"""
if not text or not text.strip():
return "", "⚠️ Please enter some text to translate", False, {}
if not direction:
return "", "⚠️ Please select a translation direction", False, {}
# Input validation
if len(text) > 2000: # Increased limit for linguistic work
return "", "⚠️ Text is too long. Please limit to 2000 characters.", False, {}
try:
start_time = time.time()
# Determine source and target languages
if direction == 'English → Siswati':
translator = self.translators['en_to_ss']
source_lang = "English"
target_lang = "Siswati"
source_code = "en"
target_code = "ss"
else:
translator = self.translators['ss_to_en']
source_lang = "Siswati"
target_lang = "English"
source_code = "ss"
target_code = "en"
logger.info(f"Translating from {source_lang} to {target_lang}")
# Analyze source text
source_analysis = self.analyze_text_complexity(text, source_code)
# Perform translation
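            # Deterministic beam search (no sampling) keeps outputs reproducible across runs of the same corpus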
result = translator(
text,
max_length=512,
early_stopping=True,
do_sample=False,
num_beams=4 # Better quality for linguistic analysis
)
translation = result[0]['translation_text']
# Analyze translated text
target_analysis = self.analyze_text_complexity(translation, target_code)
# Calculate processing time
processing_time = time.time() - start_time
# Linguistic comparison
analysis = {
'source': source_analysis,
'target': target_analysis,
'translation_ratio': len(translation) / len(text) if text else 0,
'word_ratio': target_analysis['word_count'] / source_analysis['word_count'] if source_analysis['word_count'] else 0,
'processing_time': processing_time,
'timestamp': datetime.now().isoformat()
}
# Save to history for linguistic research
if save_to_history:
history_entry = {
'source_text': text,
'translated_text': translation,
'direction': direction,
'source_lang': source_lang,
'target_lang': target_lang,
'analysis': analysis,
'timestamp': datetime.now().isoformat()
}
self.translation_history.append(history_entry)
# Success message with linguistic metadata
status_msg = f"✅ Translation completed in {processing_time:.2f}s | Word ratio: {analysis['word_ratio']:.2f} | Character ratio: {analysis['translation_ratio']:.2f}"
logger.info(f"Translation completed: {processing_time:.2f}s")
return translation, status_msg, True, analysis
except Exception as e:
error_msg = f"❌ Translation failed: {str(e)}"
logger.error(f"Translation error: {str(e)}")
logger.error(traceback.format_exc())
return "", error_msg, False, {}
def batch_translate(self, text_list: List[str], direction: str) -> List[Dict]:
"""Translate multiple texts for corpus analysis"""
results = []
for i, text in enumerate(text_list):
if text.strip():
translation, status, success, analysis = self.translate_text(text, direction, False)
results.append({
'index': i + 1,
'source': text,
'translation': translation,
'success': success,
'analysis': analysis
})
return results
    def export_history_csv(self) -> Optional[str]:
"""Export translation history as CSV for linguistic analysis"""
if not self.translation_history:
return None
output = io.StringIO()
writer = csv.writer(output)
# Headers
writer.writerow([
'Timestamp', 'Source Language', 'Target Language', 'Source Text',
'Translation', 'Source Words', 'Target Words', 'Word Ratio',
'Source Characters', 'Target Characters', 'Character Ratio',
'Lexical Diversity (Source)', 'Lexical Diversity (Target)',
'Processing Time (s)'
])
# Data rows
for entry in self.translation_history:
analysis = entry['analysis']
writer.writerow([
entry['timestamp'],
entry['source_lang'],
entry['target_lang'],
entry['source_text'],
entry['translated_text'],
analysis['source']['word_count'],
analysis['target']['word_count'],
analysis['word_ratio'],
analysis['source']['character_count'],
analysis['target']['character_count'],
analysis['translation_ratio'],
analysis['source']['lexical_diversity'],
analysis['target']['lexical_diversity'],
analysis['processing_time']
])
return output.getvalue()
# Initialize the app
app = LinguisticTranslationApp()
# Custom CSS for linguistic interface
custom_css = """
#logo {
display: block;
margin: 0 auto 20px auto;
}
.linguistic-panel {
background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
border: 1px solid #0891b2;
border-radius: 12px;
padding: 20px;
margin: 10px 0;
}
.analysis-metric {
background: white;
padding: 10px;
border-radius: 8px;
margin: 5px;
border-left: 4px solid #0891b2;
}
.status-success {
color: #059669 !important;
font-weight: 500;
}
.status-error {
color: #DC2626 !important;
font-weight: 500;
}
.gradient-text {
background: linear-gradient(45deg, #059669, #0891b2);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.linguistic-header {
text-align: center;
margin-bottom: 30px;
padding: 20px;
background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%);
border-radius: 16px;
border: 1px solid #cbd5e1;
}
.comparison-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 15px;
margin: 15px 0;
}
.metric-card {
background: white;
padding: 15px;
border-radius: 8px;
border: 1px solid #e2e8f0;
text-align: center;
}
"""
# Create the Gradio interface
with gr.Blocks(css=custom_css, title="Linguistic Translation Analysis Tool", theme=gr.themes.Soft()) as demo:
# Header section
with gr.Row():
with gr.Column():
gr.HTML("""
<div class='linguistic-header'>
<h1 class='gradient-text' style='font-size: 2.5em; margin-bottom: 10px;'>
🔬 Siswati ⇄ English Linguistic Analysis Tool
</h1>
<p style='font-size: 1.1em; color: #475569; max-width: 800px; margin: 0 auto;'>
Advanced translation system with comprehensive linguistic analysis for researchers,
linguists, and language documentation projects. Includes morphological insights,
statistical analysis, and corpus management features.
</p>
</div>
""")
# Main translation interface
with gr.Row():
with gr.Column(scale=2):
# Input section
with gr.Group():
gr.HTML("<h3>📝 Translation Input</h3>")
direction = gr.Radio(
choices=['English → Siswati', 'Siswati → English'],
label="Translation Direction",
value='English → Siswati',
interactive=True
)
input_text = gr.Textbox(
lines=6,
placeholder="Enter your text here for linguistic analysis... (maximum 2000 characters)",
label="Source Text",
max_lines=12,
show_copy_button=True
)
char_count = gr.HTML("Character count: 0/2000")
with gr.Row():
translate_btn = gr.Button("🔄 Translate & Analyze", variant="primary", size="lg")
clear_btn = gr.Button("🗑️ Clear", variant="secondary")
# Output section
with gr.Group():
gr.HTML("<h3>✨ Translation Output</h3>")
output_text = gr.Textbox(
label="Translation",
lines=6,
max_lines=12,
show_copy_button=True,
interactive=False
)
status_display = gr.HTML()
# Linguistic analysis panel
with gr.Column(scale=1):
with gr.Group():
gr.HTML("<h3>📊 Linguistic Analysis</h3>")
# Real-time metrics
with gr.Accordion("📈 Text Metrics", open=True):
metrics_display = gr.HTML("""
<div style='text-align: center; color: #64748b; padding: 20px;'>
<em>Translate text to see linguistic analysis</em>
</div>
""")
# Language-specific features
with gr.Accordion("🔍 Language Features", open=False):
features_display = gr.HTML("")
# Translation quality indicators
with gr.Accordion("⚖️ Translation Ratios", open=False):
ratios_display = gr.HTML("")
# Batch processing section
with gr.Accordion("📚 Batch Translation & Corpus Analysis", open=False):
with gr.Row():
with gr.Column():
gr.HTML("<h4>Upload text file or enter multiple lines:</h4>")
batch_input = gr.File(
label="Upload .txt file",
file_types=[".txt"],
type="filepath"
)
batch_text = gr.Textbox(
lines=8,
placeholder="Or paste multiple lines here (one per line)...",
label="Batch Text Input",
show_copy_button=True
)
batch_direction = gr.Radio(
choices=['English → Siswati', 'Siswati → English'],
label="Batch Translation Direction",
value='English → Siswati'
)
batch_btn = gr.Button("🔄 Process Batch", variant="primary")
with gr.Column():
batch_results = gr.Dataframe(
headers=["Index", "Source", "Translation", "Words (S→T)", "Chars (S→T)"],
label="Batch Results",
interactive=False
)
# Research tools section
with gr.Accordion("🔬 Research & Export Tools", open=False):
with gr.Row():
with gr.Column():
gr.HTML("<h4>Translation History & Export</h4>")
history_display = gr.Dataframe(
headers=["Timestamp", "Direction", "Source", "Translation"],
label="Translation History",
interactive=False
)
with gr.Row():
refresh_history_btn = gr.Button("🔄 Refresh History")
export_csv_btn = gr.Button("📊 Export CSV", variant="secondary")
clear_history_btn = gr.Button("🗑️ Clear History", variant="stop")
csv_download = gr.File(label="Download CSV", visible=False)
with gr.Column():
gr.HTML("<h4>Linguistic Resources</h4>")
gr.HTML("""
<div style='background: #f8fafc; padding: 20px; border-radius: 8px; border: 1px solid #e2e8f0;'>
<h5>📖 Siswati Language Notes:</h5>
<ul style='text-align: left; margin: 10px 0;'>
<li><strong>Script:</strong> Latin alphabet</li>
<li><strong>Family:</strong> Niger-Congo, Bantu</li>
                        <li><strong>Features:</strong> Agglutinative, tonal; clicks mainly in loanwords</li>
<li><strong>Speakers:</strong> ~2.3 million (Eswatini, South Africa)</li>
</ul>
<h5>🔧 Research Features:</h5>
<ul style='text-align: left; margin: 10px 0;'>
<li>Morphological complexity analysis</li>
<li>Translation ratio tracking</li>
<li>Lexical diversity measurement</li>
<li>Batch processing for corpora</li>
<li>Export capabilities for further analysis</li>
</ul>
</div>
""")
# Examples for linguists
with gr.Accordion("💡 Linguistic Examples", open=False):
examples = gr.Examples(
examples=[
["The child is playing with traditional toys.", "English → Siswati"],
["Umntfwana udlala ngetinsisimane tesintu.", "Siswati → English"],
["Agglutination demonstrates morphological complexity in Bantu languages.", "English → Siswati"],
["Lolimi lune-morphology leyinkimbinkimbi.", "Siswati → English"],
["What are the phonological features of this language?", "English → Siswati"],
["Yini tinchubo te-phonology talolimi?", "Siswati → English"],
],
inputs=[input_text, direction],
label="Click examples to analyze linguistic features:"
)
# Footer
with gr.Row():
with gr.Column():
gr.HTML("""
<div style='text-align: center; margin-top: 40px; padding: 30px; border-top: 1px solid #E5E7EB; background: #f8fafc;'>
<div style='margin-bottom: 20px;'>
<a href='https://github.com/dsfsi/en-ss-m2m100-combo' target='_blank' style='margin: 0 15px; color: #0891b2; text-decoration: none;'>📁 En→Ss Model Repository</a>
<a href='https://github.com/dsfsi/ss-en-m2m100-combo' target='_blank' style='margin: 0 15px; color: #0891b2; text-decoration: none;'>📁 Ss→En Model Repository</a>
<a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank' style='margin: 0 15px; color: #0891b2; text-decoration: none;'>💬 Research Feedback</a>
</div>
<div style='color: #475569; font-size: 0.95em;'>
<strong>Research Team:</strong> Vukosi Marivate, Richard Lastrucci<br>
<em>Supporting African language documentation and computational linguistics research</em><br>
<small style='color: #64748b; margin-top: 10px; display: block;'>
For academic use: Please cite the original models in your publications
</small>
</div>
</div>
""")
# Event handlers
def update_char_count(text):
count = len(text) if text else 0
color = "#DC2626" if count > 2000 else "#059669" if count > 1600 else "#64748b"
return f"<span style='color: {color}; font-weight: 500;'>Character count: {count}/2000</span>"
def clear_all():
return "", "", "Character count: 0/2000", "", "", "", ""
def translate_with_analysis(text, direction):
translation, status, success, analysis = app.translate_text(text, direction)
status_html = f"<div class='{'status-success' if success else 'status-error'}'>{status}</div>"
if success and analysis:
# Create metrics display
source_metrics = analysis['source']
target_metrics = analysis['target']
metrics_html = f"""
<div class='comparison-grid'>
<div class='metric-card'>
<h5>📊 Source Text</h5>
<p><strong>Words:</strong> {source_metrics['word_count']}</p>
<p><strong>Characters:</strong> {source_metrics['character_count']}</p>
<p><strong>Sentences:</strong> {source_metrics['sentence_count']}</p>
<p><strong>Lexical Diversity:</strong> {source_metrics['lexical_diversity']:.3f}</p>
</div>
<div class='metric-card' style='border-left: 4px solid #059669;'>
<h5>📊 Translation</h5>
<p><strong>Words:</strong> {target_metrics['word_count']}</p>
<p><strong>Characters:</strong> {target_metrics['character_count']}</p>
<p><strong>Sentences:</strong> {target_metrics['sentence_count']}</p>
<p><strong>Lexical Diversity:</strong> {target_metrics['lexical_diversity']:.3f}</p>
</div>
</div>
"""
# Language features
features_html = ""
if 'potential_agglutination' in source_metrics:
features_html = f"""
<div class='analysis-metric'>
<h5>🔍 Siswati Features Detected:</h5>
<p><strong>Potential agglutinated words:</strong> {source_metrics['potential_agglutination']}</p>
<p><strong>Click consonants (c,q,x):</strong> {source_metrics['click_consonants']}</p>
<p><strong>Tone markers:</strong> {source_metrics['tone_markers']}</p>
</div>
"""
# Translation ratios
ratios_html = f"""
<div class='analysis-metric'>
<h5>⚖️ Translation Ratios:</h5>
<p><strong>Word ratio:</strong> {analysis['word_ratio']:.3f}</p>
<p><strong>Character ratio:</strong> {analysis['translation_ratio']:.3f}</p>
<p><strong>Processing time:</strong> {analysis['processing_time']:.3f}s</p>
</div>
"""
return translation, status_html, metrics_html, features_html, ratios_html
return translation, status_html, "", "", ""
def process_batch(file_path, batch_text, direction):
texts = []
if file_path:
try:
with open(file_path, 'r', encoding='utf-8') as f:
texts = [line.strip() for line in f.readlines() if line.strip()]
except Exception as e:
return [[f"Error reading file: {str(e)}", "", "", "", ""]]
elif batch_text:
texts = [line.strip() for line in batch_text.split('\n') if line.strip()]
if not texts:
return [["No text provided", "", "", "", ""]]
results = app.batch_translate(texts, direction)
# Format for display
display_data = []
for r in results:
if r['success']:
word_ratio = f"{r['analysis']['source']['word_count']}{r['analysis']['target']['word_count']}"
char_ratio = f"{r['analysis']['source']['character_count']}{r['analysis']['target']['character_count']}"
else:
word_ratio = "Error"
char_ratio = "Error"
display_data.append([
r['index'],
r['source'][:50] + "..." if len(r['source']) > 50 else r['source'],
r['translation'][:50] + "..." if len(r['translation']) > 50 else r['translation'],
word_ratio,
char_ratio
])
return display_data
def get_history():
if not app.translation_history:
return []
return [[
entry['timestamp'][:19], # Remove microseconds
entry['direction'],
entry['source_text'][:50] + "..." if len(entry['source_text']) > 50 else entry['source_text'],
entry['translated_text'][:50] + "..." if len(entry['translated_text']) > 50 else entry['translated_text']
] for entry in app.translation_history[-20:]] # Show last 20
    def export_csv():
        csv_content = app.export_history_csv()
        if csv_content:
            filename = f"translation_history_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            # gr.File serves a file path rather than raw text, so write the CSV to disk first
            with open(filename, "w", encoding="utf-8", newline="") as f:
                f.write(csv_content)
            return gr.update(value=filename, visible=True, label=f"📊 {filename}")
        return gr.update(visible=False)
def clear_history():
app.translation_history = []
return []
# Wire up events
input_text.change(fn=update_char_count, inputs=input_text, outputs=char_count)
translate_btn.click(
fn=translate_with_analysis,
inputs=[input_text, direction],
outputs=[output_text, status_display, metrics_display, features_display, ratios_display]
)
clear_btn.click(
fn=clear_all,
outputs=[input_text, output_text, char_count, status_display, metrics_display, features_display, ratios_display]
)
batch_btn.click(
fn=process_batch,
inputs=[batch_input, batch_text, batch_direction],
outputs=batch_results
)
refresh_history_btn.click(fn=get_history, outputs=history_display)
export_csv_btn.click(fn=export_csv, outputs=csv_download)
clear_history_btn.click(fn=clear_history, outputs=history_display)
# Auto-translate on Enter
input_text.submit(
fn=translate_with_analysis,
inputs=[input_text, direction],
outputs=[output_text, status_display, metrics_display, features_display, ratios_display]
)
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
debug=True
)