import gradio as gr
import logging
import time
import json
import csv
import io
from transformers import pipeline
from typing import Tuple, Optional, List, Dict
import traceback
from datetime import datetime
import re
# Configure logging for debugging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LinguisticTranslationApp:
def __init__(self):
self.translators = {}
self.translation_history = []
self.load_models()
def load_models(self):
"""Load translation models with error handling"""
try:
logger.info("Loading translation models...")
self.translators['en_to_ss'] = pipeline(
"translation",
model="dsfsi/en-ss-m2m100-combo",
src_lang="en",
tgt_lang="ss"
)
self.translators['ss_to_en'] = pipeline(
"translation",
model="dsfsi/ss-en-m2m100-combo",
src_lang="ss",
tgt_lang="en"
)
logger.info("Models loaded successfully!")
except Exception as e:
logger.error(f"Error loading models: {str(e)}")
raise e
def analyze_text_complexity(self, text: str, lang: str) -> Dict:
"""Analyze linguistic features of the input text"""
words = text.split()
sentences = re.split(r'[.!?]+', text)
sentences = [s.strip() for s in sentences if s.strip()]
# Basic linguistic metrics
analysis = {
'character_count': len(text),
'word_count': len(words),
'sentence_count': len(sentences),
'avg_word_length': sum(len(word) for word in words) / len(words) if words else 0,
'avg_sentence_length': len(words) / len(sentences) if sentences else 0,
'unique_words': len(set(word.lower() for word in words)),
'lexical_diversity': len(set(word.lower() for word in words)) / len(words) if words else 0
}
# Language-specific features
if lang == 'ss': # Siswati
# Check for common Siswati features
analysis['potential_agglutination'] = sum(1 for word in words if len(word) > 10)
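            # Counting every 'c', 'q', 'x' also catches non-click uses of these letters, so treat this as a rough upper bound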
analysis['click_consonants'] = sum(text.count(click) for click in ['c', 'q', 'x'])
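            # Only combining diacritics (U+0301, U+0300) are counted; precomposed accented characters would need NFD normalization first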
analysis['tone_markers'] = text.count('́') + text.count('̀') # Acute and grave accents
return analysis
def translate_text(self, text: str, direction: str, save_to_history: bool = True) -> Tuple[str, str, bool, Dict]:
"""
Translate text with comprehensive linguistic analysis
Returns:
Tuple[str, str, bool, Dict]: (translated_text, status_message, success, analysis)
"""
if not text or not text.strip():
return "", "⚠️ Please enter some text to translate", False, {}
if not direction:
return "", "⚠️ Please select a translation direction", False, {}
# Input validation
if len(text) > 2000: # Increased limit for linguistic work
return "", "⚠️ Text is too long. Please limit to 2000 characters.", False, {}
try:
start_time = time.time()
# Determine source and target languages
if direction == 'English → Siswati':
translator = self.translators['en_to_ss']
source_lang = "English"
target_lang = "Siswati"
source_code = "en"
target_code = "ss"
else:
translator = self.translators['ss_to_en']
source_lang = "Siswati"
target_lang = "English"
source_code = "ss"
target_code = "en"
logger.info(f"Translating from {source_lang} to {target_lang}")
# Analyze source text
source_analysis = self.analyze_text_complexity(text, source_code)
# Perform translation
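            # Deterministic beam search (no sampling) keeps outputs reproducible across runs of the same corpus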
result = translator(
text,
max_length=512,
early_stopping=True,
do_sample=False,
num_beams=4 # Better quality for linguistic analysis
)
translation = result[0]['translation_text']
# Analyze translated text
target_analysis = self.analyze_text_complexity(translation, target_code)
# Calculate processing time
processing_time = time.time() - start_time
# Linguistic comparison
analysis = {
'source': source_analysis,
'target': target_analysis,
'translation_ratio': len(translation) / len(text) if text else 0,
'word_ratio': target_analysis['word_count'] / source_analysis['word_count'] if source_analysis['word_count'] else 0,
'processing_time': processing_time,
'timestamp': datetime.now().isoformat()
}
# Save to history for linguistic research
if save_to_history:
history_entry = {
'source_text': text,
'translated_text': translation,
'direction': direction,
'source_lang': source_lang,
'target_lang': target_lang,
'analysis': analysis,
'timestamp': datetime.now().isoformat()
}
self.translation_history.append(history_entry)
# Success message with linguistic metadata
status_msg = f"✅ Translation completed in {processing_time:.2f}s | Word ratio: {analysis['word_ratio']:.2f} | Character ratio: {analysis['translation_ratio']:.2f}"
logger.info(f"Translation completed: {processing_time:.2f}s")
return translation, status_msg, True, analysis
except Exception as e:
error_msg = f"❌ Translation failed: {str(e)}"
logger.error(f"Translation error: {str(e)}")
logger.error(traceback.format_exc())
return "", error_msg, False, {}
def batch_translate(self, text_list: List[str], direction: str) -> List[Dict]:
"""Translate multiple texts for corpus analysis"""
results = []
for i, text in enumerate(text_list):
if text.strip():
translation, status, success, analysis = self.translate_text(text, direction, False)
results.append({
'index': i + 1,
'source': text,
'translation': translation,
'success': success,
'analysis': analysis
})
return results
    def export_history_csv(self) -> Optional[str]:
"""Export translation history as CSV for linguistic analysis"""
if not self.translation_history:
return None
output = io.StringIO()
writer = csv.writer(output)
# Headers
writer.writerow([
'Timestamp', 'Source Language', 'Target Language', 'Source Text',
'Translation', 'Source Words', 'Target Words', 'Word Ratio',
'Source Characters', 'Target Characters', 'Character Ratio',
'Lexical Diversity (Source)', 'Lexical Diversity (Target)',
'Processing Time (s)'
])
# Data rows
for entry in self.translation_history:
analysis = entry['analysis']
writer.writerow([
entry['timestamp'],
entry['source_lang'],
entry['target_lang'],
entry['source_text'],
entry['translated_text'],
analysis['source']['word_count'],
analysis['target']['word_count'],
analysis['word_ratio'],
analysis['source']['character_count'],
analysis['target']['character_count'],
analysis['translation_ratio'],
analysis['source']['lexical_diversity'],
analysis['target']['lexical_diversity'],
analysis['processing_time']
])
return output.getvalue()
# Initialize the app
app = LinguisticTranslationApp()
# Custom CSS for linguistic interface
custom_css = """
#logo {
display: block;
margin: 0 auto 20px auto;
}
.linguistic-panel {
background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
border: 1px solid #0891b2;
border-radius: 12px;
padding: 20px;
margin: 10px 0;
}
.analysis-metric {
background: white;
padding: 10px;
border-radius: 8px;
margin: 5px;
border-left: 4px solid #0891b2;
}
.status-success {
color: #059669 !important;
font-weight: 500;
}
.status-error {
color: #DC2626 !important;
font-weight: 500;
}
.gradient-text {
background: linear-gradient(45deg, #059669, #0891b2);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.linguistic-header {
text-align: center;
margin-bottom: 30px;
padding: 20px;
background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%);
border-radius: 16px;
border: 1px solid #cbd5e1;
}
.comparison-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 15px;
margin: 15px 0;
}
.metric-card {
background: white;
padding: 15px;
border-radius: 8px;
border: 1px solid #e2e8f0;
text-align: center;
}
"""
# Create the Gradio interface
with gr.Blocks(css=custom_css, title="Linguistic Translation Analysis Tool", theme=gr.themes.Soft()) as demo:
# Header section
with gr.Row():
with gr.Column():
gr.HTML("""
<div class='linguistic-header'>
<h1 class='gradient-text' style='font-size: 2.5em; margin-bottom: 10px;'>
🔬 Siswati ⇄ English Linguistic Analysis Tool
</h1>
<p style='font-size: 1.1em; color: #475569; max-width: 800px; margin: 0 auto;'>
Advanced translation system with comprehensive linguistic analysis for researchers,
linguists, and language documentation projects. Includes morphological insights,
statistical analysis, and corpus management features.
</p>
</div>
""")
# Main translation interface
with gr.Row():
with gr.Column(scale=2):
# Input section
with gr.Group():
gr.HTML("<h3>📝 Translation Input</h3>")
direction = gr.Radio(
choices=['English → Siswati', 'Siswati → English'],
label="Translation Direction",
value='English → Siswati',
interactive=True
)
input_text = gr.Textbox(
lines=6,
placeholder="Enter your text here for linguistic analysis... (maximum 2000 characters)",
label="Source Text",
max_lines=12,
show_copy_button=True
)
char_count = gr.HTML("Character count: 0/2000")
with gr.Row():
translate_btn = gr.Button("🔄 Translate & Analyze", variant="primary", size="lg")
clear_btn = gr.Button("🗑️ Clear", variant="secondary")
# Output section
with gr.Group():
gr.HTML("<h3>✨ Translation Output</h3>")
output_text = gr.Textbox(
label="Translation",
lines=6,
max_lines=12,
show_copy_button=True,
interactive=False
)
status_display = gr.HTML()
# Linguistic analysis panel
with gr.Column(scale=1):
with gr.Group():
gr.HTML("<h3>📊 Linguistic Analysis</h3>")
# Real-time metrics
with gr.Accordion("📈 Text Metrics", open=True):
metrics_display = gr.HTML("""
<div style='text-align: center; color: #64748b; padding: 20px;'>
<em>Translate text to see linguistic analysis</em>
</div>
""")
# Language-specific features
with gr.Accordion("🔍 Language Features", open=False):
features_display = gr.HTML("")
# Translation quality indicators
with gr.Accordion("⚖️ Translation Ratios", open=False):
ratios_display = gr.HTML("")
# Batch processing section
with gr.Accordion("📚 Batch Translation & Corpus Analysis", open=False):
with gr.Row():
with gr.Column():
gr.HTML("<h4>Upload text file or enter multiple lines:</h4>")
batch_input = gr.File(
label="Upload .txt file",
file_types=[".txt"],
type="filepath"
)
batch_text = gr.Textbox(
lines=8,
placeholder="Or paste multiple lines here (one per line)...",
label="Batch Text Input",
show_copy_button=True
)
batch_direction = gr.Radio(
choices=['English → Siswati', 'Siswati → English'],
label="Batch Translation Direction",
value='English → Siswati'
)
batch_btn = gr.Button("🔄 Process Batch", variant="primary")
with gr.Column():
batch_results = gr.Dataframe(
headers=["Index", "Source", "Translation", "Words (S→T)", "Chars (S→T)"],
label="Batch Results",
interactive=False
)
# Research tools section
with gr.Accordion("🔬 Research & Export Tools", open=False):
with gr.Row():
with gr.Column():
gr.HTML("<h4>Translation History & Export</h4>")
history_display = gr.Dataframe(
headers=["Timestamp", "Direction", "Source", "Translation"],
label="Translation History",
interactive=False
)
with gr.Row():
refresh_history_btn = gr.Button("🔄 Refresh History")
export_csv_btn = gr.Button("📊 Export CSV", variant="secondary")
clear_history_btn = gr.Button("🗑️ Clear History", variant="stop")
csv_download = gr.File(label="Download CSV", visible=False)
with gr.Column():
gr.HTML("<h4>Linguistic Resources</h4>")
gr.HTML("""
<div style='background: #f8fafc; padding: 20px; border-radius: 8px; border: 1px solid #e2e8f0;'>
<h5>📖 Siswati Language Notes:</h5>
<ul style='text-align: left; margin: 10px 0;'>
<li><strong>Script:</strong> Latin alphabet</li>
<li><strong>Family:</strong> Niger-Congo, Bantu</li>
                        <li><strong>Features:</strong> Agglutinative, tonal; clicks mainly in loanwords</li>
<li><strong>Speakers:</strong> ~2.3 million (Eswatini, South Africa)</li>
</ul>
<h5>🔧 Research Features:</h5>
<ul style='text-align: left; margin: 10px 0;'>
<li>Morphological complexity analysis</li>
<li>Translation ratio tracking</li>
<li>Lexical diversity measurement</li>
<li>Batch processing for corpora</li>
<li>Export capabilities for further analysis</li>
</ul>
</div>
""")
# Examples for linguists
with gr.Accordion("💡 Linguistic Examples", open=False):
examples = gr.Examples(
examples=[
["The child is playing with traditional toys.", "English → Siswati"],
["Umntfwana udlala ngetinsisimane tesintu.", "Siswati → English"],
["Agglutination demonstrates morphological complexity in Bantu languages.", "English → Siswati"],
["Lolimi lune-morphology leyinkimbinkimbi.", "Siswati → English"],
["What are the phonological features of this language?", "English → Siswati"],
["Yini tinchubo te-phonology talolimi?", "Siswati → English"],
],
inputs=[input_text, direction],
label="Click examples to analyze linguistic features:"
)
# Footer
with gr.Row():
with gr.Column():
gr.HTML("""
<div style='text-align: center; margin-top: 40px; padding: 30px; border-top: 1px solid #E5E7EB; background: #f8fafc;'>
<div style='margin-bottom: 20px;'>
<a href='https://github.com/dsfsi/en-ss-m2m100-combo' target='_blank' style='margin: 0 15px; color: #0891b2; text-decoration: none;'>📁 En→Ss Model Repository</a>
<a href='https://github.com/dsfsi/ss-en-m2m100-combo' target='_blank' style='margin: 0 15px; color: #0891b2; text-decoration: none;'>📁 Ss→En Model Repository</a>
<a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank' style='margin: 0 15px; color: #0891b2; text-decoration: none;'>💬 Research Feedback</a>
</div>
<div style='color: #475569; font-size: 0.95em;'>
<strong>Research Team:</strong> Vukosi Marivate, Richard Lastrucci<br>
<em>Supporting African language documentation and computational linguistics research</em><br>
<small style='color: #64748b; margin-top: 10px; display: block;'>
For academic use: Please cite the original models in your publications
</small>
</div>
</div>
""")
# Event handlers
def update_char_count(text):
count = len(text) if text else 0
color = "#DC2626" if count > 2000 else "#059669" if count > 1600 else "#64748b"
return f"<span style='color: {color}; font-weight: 500;'>Character count: {count}/2000</span>"
def clear_all():
return "", "", "Character count: 0/2000", "", "", "", ""
def translate_with_analysis(text, direction):
translation, status, success, analysis = app.translate_text(text, direction)
status_html = f"<div class='{'status-success' if success else 'status-error'}'>{status}</div>"
if success and analysis:
# Create metrics display
source_metrics = analysis['source']
target_metrics = analysis['target']
metrics_html = f"""
<div class='comparison-grid'>
<div class='metric-card'>
<h5>📊 Source Text</h5>
<p><strong>Words:</strong> {source_metrics['word_count']}</p>
<p><strong>Characters:</strong> {source_metrics['character_count']}</p>
<p><strong>Sentences:</strong> {source_metrics['sentence_count']}</p>
<p><strong>Lexical Diversity:</strong> {source_metrics['lexical_diversity']:.3f}</p>
</div>
<div class='metric-card' style='border-left: 4px solid #059669;'>
<h5>📊 Translation</h5>
<p><strong>Words:</strong> {target_metrics['word_count']}</p>
<p><strong>Characters:</strong> {target_metrics['character_count']}</p>
<p><strong>Sentences:</strong> {target_metrics['sentence_count']}</p>
<p><strong>Lexical Diversity:</strong> {target_metrics['lexical_diversity']:.3f}</p>
</div>
</div>
"""
# Language features
features_html = ""
if 'potential_agglutination' in source_metrics:
features_html = f"""
<div class='analysis-metric'>
<h5>🔍 Siswati Features Detected:</h5>
<p><strong>Potential agglutinated words:</strong> {source_metrics['potential_agglutination']}</p>
<p><strong>Click consonants (c,q,x):</strong> {source_metrics['click_consonants']}</p>
<p><strong>Tone markers:</strong> {source_metrics['tone_markers']}</p>
</div>
"""
# Translation ratios
ratios_html = f"""
<div class='analysis-metric'>
<h5>⚖️ Translation Ratios:</h5>
<p><strong>Word ratio:</strong> {analysis['word_ratio']:.3f}</p>
<p><strong>Character ratio:</strong> {analysis['translation_ratio']:.3f}</p>
<p><strong>Processing time:</strong> {analysis['processing_time']:.3f}s</p>
</div>
"""
return translation, status_html, metrics_html, features_html, ratios_html
return translation, status_html, "", "", ""
def process_batch(file_path, batch_text, direction):
texts = []
if file_path:
try:
with open(file_path, 'r', encoding='utf-8') as f:
texts = [line.strip() for line in f.readlines() if line.strip()]
except Exception as e:
return [[f"Error reading file: {str(e)}", "", "", "", ""]]
elif batch_text:
texts = [line.strip() for line in batch_text.split('\n') if line.strip()]
if not texts:
return [["No text provided", "", "", "", ""]]
results = app.batch_translate(texts, direction)
# Format for display
display_data = []
for r in results:
if r['success']:
word_ratio = f"{r['analysis']['source']['word_count']}{r['analysis']['target']['word_count']}"
char_ratio = f"{r['analysis']['source']['character_count']}{r['analysis']['target']['character_count']}"
else:
word_ratio = "Error"
char_ratio = "Error"
display_data.append([
r['index'],
r['source'][:50] + "..." if len(r['source']) > 50 else r['source'],
r['translation'][:50] + "..." if len(r['translation']) > 50 else r['translation'],
word_ratio,
char_ratio
])
return display_data
def get_history():
if not app.translation_history:
return []
return [[
entry['timestamp'][:19], # Remove microseconds
entry['direction'],
entry['source_text'][:50] + "..." if len(entry['source_text']) > 50 else entry['source_text'],
entry['translated_text'][:50] + "..." if len(entry['translated_text']) > 50 else entry['translated_text']
] for entry in app.translation_history[-20:]] # Show last 20
    def export_csv():
        csv_content = app.export_history_csv()
        if csv_content:
            filename = f"translation_history_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            # gr.File serves a file path rather than raw text, so write the CSV to disk first
            with open(filename, "w", encoding="utf-8", newline="") as f:
                f.write(csv_content)
            return gr.update(value=filename, visible=True, label=f"📊 {filename}")
        return gr.update(visible=False)
def clear_history():
app.translation_history = []
return []
# Wire up events
input_text.change(fn=update_char_count, inputs=input_text, outputs=char_count)
translate_btn.click(
fn=translate_with_analysis,
inputs=[input_text, direction],
outputs=[output_text, status_display, metrics_display, features_display, ratios_display]
)
clear_btn.click(
fn=clear_all,
outputs=[input_text, output_text, char_count, status_display, metrics_display, features_display, ratios_display]
)
batch_btn.click(
fn=process_batch,
inputs=[batch_input, batch_text, batch_direction],
outputs=batch_results
)
refresh_history_btn.click(fn=get_history, outputs=history_display)
export_csv_btn.click(fn=export_csv, outputs=csv_download)
clear_history_btn.click(fn=clear_history, outputs=history_display)
# Auto-translate on Enter
input_text.submit(
fn=translate_with_analysis,
inputs=[input_text, direction],
outputs=[output_text, status_display, metrics_display, features_display, ratios_display]
)
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
debug=True
)