Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Analyze existing submissions to determine if sentence-level categorization is worth implementing. | |
| This script: | |
| 1. Segments submissions into sentences | |
| 2. Categorizes each sentence using the current AI model | |
| 3. Compares sentence-level vs submission-level categories | |
| 4. Shows statistics to inform decision | |
| Run: python analyze_submissions_for_sentences.py | |
| """ | |
| import sys | |
| import os | |
| import re | |
| from collections import Counter, defaultdict | |
| from app import create_app, db | |
| from app.models.models import Submission | |
| from app.analyzer import get_analyzer | |
| import nltk | |
# Make sure the NLTK sentence-tokenizer data is available before it is used.
# Older NLTK releases load 'punkt'; newer releases load 'punkt_tab' instead,
# so both are checked. A failed download is not fatal: nltk.download() reports
# the problem and returns False rather than raising.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        print(f"Downloading NLTK {_resource} tokenizer...")
        nltk.download(_resource, quiet=True)
def segment_sentences(text):
    """Split ``text`` into sentences, dropping fragments shorter than 3 words.

    NLTK's ``sent_tokenize`` is tried first. If it cannot be used (NLTK not
    importable, tokenizer data missing, ...) a regex split on sentence-ending
    punctuation is used instead.

    Args:
        text: The text to segment. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of stripped sentence strings, each containing at least three
        whitespace-separated words.
    """
    # Nothing to segment; also keeps None away from the tokenizers.
    if not text:
        return []

    try:
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
    except Exception:
        # Deliberate best-effort fallback, but unlike a bare ``except`` this
        # lets KeyboardInterrupt / SystemExit propagate.
        # Split after ., ! or ? when followed by whitespace and a capital
        # letter, or at the very end of the text.
        pattern = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$'
        sentences = re.split(pattern, text)

    # Clean and filter
    sentences = [s.strip() for s in sentences if s.strip()]
    # Filter very short "sentences"
    return [s for s in sentences if len(s.split()) >= 3]
def _preview(text, limit):
    """Return ``text`` cut to ``limit`` characters, adding '...' if it was longer."""
    return f"{text[:limit]}{'...' if len(text) > limit else ''}"


def _categorize_sentences(analyzer, sentences):
    """Return one category per sentence, using None where the analyzer raised."""
    categories = []
    for sentence in sentences:
        try:
            categories.append(analyzer.analyze(sentence))
        except Exception as e:
            # Best effort: one failing sentence must not abort the whole report.
            print(f"Error analyzing sentence: {e}")
            categories.append(None)
    return categories


def analyze_submissions():
    """Analyze submissions to see if sentence-level categorization is beneficial.

    Loads every submission whose ``category`` is set, splits its message into
    sentences with ``segment_sentences``, categorizes each sentence with the
    analyzer returned by ``get_analyzer()``, and prints:

    * overall statistics and the sentence-count distribution,
    * the first 10 submissions whose sentences fall into more than one category,
    * a submission-level vs. sentence-level category comparison,
    * a recommendation based on the share of multi-category submissions
      (> 40%, > 20%, otherwise).

    The full per-sentence breakdown of multi-category submissions is written to
    ``sentence_analysis_results.txt`` in the current working directory.

    Returns:
        None. Returns early (after printing a message) when there are no
        analyzed submissions.
    """
    # NOTE(review): several status glyphs in the strings below look mis-encoded
    # (mojibake); they are kept as-is - confirm the file's original encoding.
    app = create_app()
    with app.app_context():
        # Get all analyzed submissions
        submissions = Submission.query.filter(Submission.category.isnot(None)).all()
        if not submissions:
            print("β No analyzed submissions found. Please run AI analysis first.")
            return

        print(f"\n{'='*70}")
        print(f"π SENTENCE-LEVEL CATEGORIZATION ANALYSIS")
        print(f"{'='*70}\n")
        print(f"Analyzing {len(submissions)} submissions...\n")

        # Load analyzer
        analyzer = get_analyzer()

        # Statistics (total_submissions is > 0 here because of the early return)
        total_submissions = len(submissions)
        total_sentences = 0
        multi_sentence_count = 0
        multi_category_count = 0
        sentence_counts = []
        category_changes = []

        # Analyze each submission
        for submission in submissions:
            # A submission without message text contributes zero sentences.
            text = submission.message or ''
            sentences = segment_sentences(text)
            sentence_count = len(sentences)
            total_sentences += sentence_count
            sentence_counts.append(sentence_count)
            if sentence_count > 1:
                multi_sentence_count += 1

            # Categorize each sentence (None marks a failed analysis)
            sentence_categories = _categorize_sentences(analyzer, sentences)

            # Check if categories differ; failed/empty categories are ignored
            unique_categories = {c for c in sentence_categories if c}
            if len(unique_categories) > 1:
                multi_category_count += 1
                category_changes.append({
                    'id': submission.id,
                    'text': text,
                    'submission_category': submission.category,
                    'sentence_categories': sentence_categories,
                    'sentences': sentences,
                    'contributor_type': submission.contributor_type,
                })

        # Print Statistics
        print(f"{'β'*70}")
        print(f"π STATISTICS")
        print(f"{'β'*70}\n")
        print(f"Total Submissions: {total_submissions}")
        print(f"Total Sentences: {total_sentences}")
        print(f"Avg Sentences/Submission: {total_sentences/total_submissions:.1f}")
        print(f"Multi-sentence (>1): {multi_sentence_count} ({multi_sentence_count/total_submissions*100:.1f}%)")
        print(f"Multi-category: {multi_category_count} ({multi_category_count/total_submissions*100:.1f}%)")

        # Sentence distribution (bar scaled to at most 50 characters)
        print(f"\nπ Sentence Count Distribution:")
        sentence_dist = Counter(sentence_counts)
        for count in sorted(sentence_dist):
            bar = 'β' * int(sentence_dist[count] / total_submissions * 50)
            print(f" {count} sentence(s): {sentence_dist[count]:3d} {bar}")

        # Category changes
        if category_changes:
            print(f"\n{'β'*70}")
            print(f"π SUBMISSIONS WITH MULTIPLE CATEGORIES ({len(category_changes)})")
            print(f"{'β'*70}\n")
            for idx, item in enumerate(category_changes[:10], 1):  # Show first 10
                print(f"\n{idx}. Submission #{item['id']} ({item['contributor_type']})")
                print(f" Submission-level: {item['submission_category']}")
                print(f" Text: \"{_preview(item['text'], 100)}\"")
                print(f" Sentence breakdown:")
                for i, (sentence, category) in enumerate(zip(item['sentences'], item['sentence_categories']), 1):
                    marker = "β οΈ" if category != item['submission_category'] else "β"
                    # A failed analysis leaves category as None, which cannot be
                    # formatted with ':12s'; show a placeholder instead.
                    label = category or 'N/A'
                    print(f" {marker} S{i} [{label!s:12s}] \"{_preview(sentence, 60)}\"")
            if len(category_changes) > 10:
                print(f"\n ... and {len(category_changes) - 10} more")

        # Category distribution comparison
        print(f"\n{'β'*70}")
        print(f"π CATEGORY DISTRIBUTION COMPARISON")
        print(f"{'β'*70}\n")

        # Submission-level counts
        submission_cats = Counter(s.category for s in submissions if s.category)

        # Sentence-level counts (only sentences of multi-category submissions)
        sentence_cats = Counter(
            cat
            for item in category_changes
            for cat in item['sentence_categories']
            if cat
        )
        # Scale the sentence bars by the number of counted sentences so each bar
        # is that category's share (never longer than 20 characters).
        counted_sentences = sum(sentence_cats.values())

        print(f"{'Category':<15} {'Submission-Level':<20} {'Sentence-Level (multi-cat only)':<30}")
        print(f"{'-'*15} {'-'*20} {'-'*30}")
        categories = ['Vision', 'Problem', 'Objectives', 'Directives', 'Values', 'Actions']
        for cat in categories:
            sub_count = submission_cats.get(cat, 0)
            sen_count = sentence_cats.get(cat, 0)
            sub_bar = 'β' * int(sub_count / total_submissions * 20)
            sen_bar = 'β' * int(sen_count / counted_sentences * 20) if counted_sentences > 0 else ''
            print(f"{cat:<15} {sub_count:3d} {sub_bar:<15} {sen_count:3d} {sen_bar:<15}")

        # Recommendation
        print(f"\n{'='*70}")
        print(f"π‘ RECOMMENDATION")
        print(f"{'='*70}\n")
        multi_cat_percentage = multi_category_count / total_submissions * 100
        if multi_cat_percentage > 40:
            print(f"β STRONGLY RECOMMEND sentence-level categorization")
            print(f" {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f" Current system is losing significant semantic detail.")
            print(f"\n π Expected benefits:")
            print(f" β’ {multi_category_count} submissions will have richer categorization")
            print(f" β’ Training data will be ~{total_sentences - total_submissions} examples richer")
            print(f" β’ Analytics will be more accurate")
        elif multi_cat_percentage > 20:
            print(f"β οΈ RECOMMEND sentence-level categorization (or proof of concept)")
            print(f" {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f" Moderate benefit expected.")
            print(f"\n π‘ Suggestion: Start with proof of concept (display only)")
            print(f" Then decide if full implementation is worth it.")
        else:
            print(f"βΉοΈ OPTIONAL - Multi-label might be sufficient")
            print(f" Only {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f" Sentence-level might be overkill.")
            print(f"\n π‘ Consider:")
            print(f" β’ Multi-label classification (simpler)")
            print(f" β’ Or keep current system if working well")

        # Implementation effort
        print(f"\nπ Implementation Effort:")
        print(f" β’ Full sentence-level: 13-20 hours")
        print(f" β’ Proof of concept: 4-6 hours")
        print(f" β’ Multi-label: 4-6 hours")
        print(f"\n{'='*70}\n")

        # Export detailed results. Explicit UTF-8 so submission text with
        # non-ASCII characters does not depend on the platform default encoding.
        export_path = "sentence_analysis_results.txt"
        with open(export_path, 'w', encoding='utf-8') as f:
            f.write("DETAILED SENTENCE-LEVEL ANALYSIS RESULTS\n")
            f.write("="*70 + "\n\n")
            f.write(f"Total Submissions: {total_submissions}\n")
            f.write(f"Multi-category Submissions: {multi_category_count} ({multi_cat_percentage:.1f}%)\n\n")
            f.write("\nDETAILED BREAKDOWN:\n\n")
            for idx, item in enumerate(category_changes, 1):
                f.write(f"\n{idx}. Submission #{item['id']}\n")
                f.write(f" Contributor: {item['contributor_type']}\n")
                f.write(f" Submission Category: {item['submission_category']}\n")
                f.write(f" Full Text: {item['text']}\n")
                f.write(f" Sentences:\n")
                for i, (sentence, category) in enumerate(zip(item['sentences'], item['sentence_categories']), 1):
                    f.write(f" {i}. [{category}] {sentence}\n")
                f.write("\n")
        print(f"π Detailed results exported to: {export_path}")
if __name__ == '__main__':
    # Script entry point: run the analysis; on any failure print the error and
    # its traceback, then exit with status 1.
    import traceback

    try:
        analyze_submissions()
    except Exception as exc:
        print(f"\nβ Error: {exc}")
        traceback.print_exc()
        sys.exit(1)