#!/usr/bin/env python3
"""
Analyze existing submissions to determine if sentence-level categorization
is worth implementing.

This script:
1. Segments submissions into sentences
2. Categorizes each sentence using current AI model
3. Compares sentence-level vs submission-level categories
4. Shows statistics to inform decision

Run: python analyze_submissions_for_sentences.py
"""
import sys
import os
import re
from collections import Counter, defaultdict

from app import create_app, db
from app.models.models import Submission
from app.analyzer import get_analyzer

import nltk

# Try to download required NLTK data (punkt is needed by sent_tokenize)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK punkt tokenizer...")
    nltk.download('punkt', quiet=True)


def segment_sentences(text):
    """Split *text* into sentences, dropping fragments shorter than 3 words.

    Prefers NLTK's punkt tokenizer; falls back to a regex-based splitter if
    NLTK is unavailable or fails at runtime.

    Args:
        text: The raw submission message.

    Returns:
        list[str]: Cleaned sentences, each at least 3 words long.
    """
    try:
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
    # NOTE: was a bare `except:`; narrowed so Ctrl-C / SystemExit still propagate.
    except Exception:
        # Fallback: regex-based split on sentence-ending punctuation
        pattern = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$'
        sentences = re.split(pattern, text)
    # Clean and filter
    sentences = [s.strip() for s in sentences if s.strip()]
    # Filter very short "sentences" (fewer than 3 words)
    sentences = [s for s in sentences if len(s.split()) >= 3]
    return sentences


def analyze_submissions():
    """Analyze submissions to see if sentence-level categorization is beneficial.

    Loads every already-categorized submission, re-categorizes each sentence
    individually, prints comparison statistics plus a recommendation, and
    exports a detailed breakdown to ``sentence_analysis_results.txt``.
    """
    app = create_app()

    with app.app_context():
        # Get all analyzed submissions (isnot(None) is the SQLAlchemy idiom
        # for `category IS NOT NULL`; equivalent SQL to `!= None`)
        submissions = Submission.query.filter(Submission.category.isnot(None)).all()

        if not submissions:
            print("āŒ No analyzed submissions found. Please run AI analysis first.")
            return

        print(f"\n{'='*70}")
        print(f"šŸ“Š SENTENCE-LEVEL CATEGORIZATION ANALYSIS")
        print(f"{'='*70}\n")
        print(f"Analyzing {len(submissions)} submissions...\n")

        # Load analyzer
        analyzer = get_analyzer()

        # Statistics accumulators
        total_submissions = len(submissions)
        total_sentences = 0
        multi_sentence_count = 0
        multi_category_count = 0
        sentence_counts = []
        category_changes = []

        # Analyze each submission
        for submission in submissions:
            # Segment into sentences
            sentences = segment_sentences(submission.message)
            sentence_count = len(sentences)

            total_sentences += sentence_count
            sentence_counts.append(sentence_count)

            if sentence_count > 1:
                multi_sentence_count += 1

            # Categorize each sentence; None marks an analyzer failure
            sentence_categories = []
            for sentence in sentences:
                try:
                    category = analyzer.analyze(sentence)
                    sentence_categories.append(category)
                except Exception as e:
                    print(f"Error analyzing sentence: {e}")
                    sentence_categories.append(None)

            # Check if the per-sentence categories disagree with each other
            unique_categories = set([c for c in sentence_categories if c])
            if len(unique_categories) > 1:
                multi_category_count += 1
                category_changes.append({
                    'id': submission.id,
                    'text': submission.message,
                    'submission_category': submission.category,
                    'sentence_categories': sentence_categories,
                    'sentences': sentences,
                    'contributor_type': submission.contributor_type
                })

        # Print Statistics
        print(f"{'─'*70}")
        print(f"šŸ“ˆ STATISTICS")
        print(f"{'─'*70}\n")
        print(f"Total Submissions: {total_submissions}")
        print(f"Total Sentences: {total_sentences}")
        print(f"Avg Sentences/Submission: {total_sentences/total_submissions:.1f}")
        print(f"Multi-sentence (>1): {multi_sentence_count} ({multi_sentence_count/total_submissions*100:.1f}%)")
        print(f"Multi-category: {multi_category_count} ({multi_category_count/total_submissions*100:.1f}%)")

        # Sentence distribution histogram (bar scaled to 50 chars max)
        print(f"\nšŸ“Š Sentence Count Distribution:")
        sentence_dist = Counter(sentence_counts)
        for count in sorted(sentence_dist.keys()):
            bar = 'ā–ˆ' * int(sentence_dist[count] / total_submissions * 50)
            print(f" {count} sentence(s): {sentence_dist[count]:3d} {bar}")

        # Category changes: show the first 10 multi-category submissions
        if category_changes:
            print(f"\n{'─'*70}")
            print(f"šŸ”„ SUBMISSIONS WITH MULTIPLE CATEGORIES ({len(category_changes)})")
            print(f"{'─'*70}\n")

            for idx, item in enumerate(category_changes[:10], 1):  # Show first 10
                print(f"\n{idx}. Submission #{item['id']} ({item['contributor_type']})")
                print(f" Submission-level: {item['submission_category']}")
                print(f" Text: \"{item['text'][:100]}{'...' if len(item['text']) > 100 else ''}\"")
                print(f" Sentence breakdown:")
                for i, (sentence, category) in enumerate(zip(item['sentences'], item['sentence_categories']), 1):
                    marker = "āš ļø" if category != item['submission_category'] else "āœ“"
                    # str(category): category may be None (analyzer error) and
                    # f"{None:12s}" would raise TypeError.
                    print(f" {marker} S{i} [{str(category):12s}] \"{sentence[:60]}{'...' if len(sentence) > 60 else ''}\"")

            if len(category_changes) > 10:
                print(f"\n ... and {len(category_changes) - 10} more")

        # Category distribution comparison
        print(f"\n{'─'*70}")
        print(f"šŸ“Š CATEGORY DISTRIBUTION COMPARISON")
        print(f"{'─'*70}\n")

        # Submission-level counts
        submission_cats = Counter([s.category for s in submissions if s.category])

        # Sentence-level counts (only from multi-category submissions)
        sentence_cats = Counter()
        for item in category_changes:
            for cat in item['sentence_categories']:
                if cat:
                    sentence_cats[cat] += 1

        print(f"{'Category':<15} {'Submission-Level':<20} {'Sentence-Level (multi-cat only)':<30}")
        print(f"{'-'*15} {'-'*20} {'-'*30}")

        categories = ['Vision', 'Problem', 'Objectives', 'Directives', 'Values', 'Actions']
        for cat in categories:
            sub_count = submission_cats.get(cat, 0)
            sen_count = sentence_cats.get(cat, 0)
            sub_bar = 'ā–ˆ' * int(sub_count / total_submissions * 20)
            sen_bar = 'ā–ˆ' * int(sen_count / multi_category_count * 20) if multi_category_count > 0 else ''
            print(f"{cat:<15} {sub_count:3d} {sub_bar:<15} {sen_count:3d} {sen_bar:<15}")

        # Recommendation based on the share of multi-category submissions
        print(f"\n{'='*70}")
        print(f"šŸ’” RECOMMENDATION")
        print(f"{'='*70}\n")

        multi_cat_percentage = (multi_category_count / total_submissions * 100) if total_submissions > 0 else 0

        if multi_cat_percentage > 40:
            print(f"āœ… STRONGLY RECOMMEND sentence-level categorization")
            print(f" {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f" Current system is losing significant semantic detail.")
            print(f"\n šŸ“ˆ Expected benefits:")
            print(f" • {multi_category_count} submissions will have richer categorization")
            print(f" • Training data will be ~{total_sentences - total_submissions} examples richer")
            print(f" • Analytics will be more accurate")
        elif multi_cat_percentage > 20:
            print(f"āš ļø RECOMMEND sentence-level categorization (or proof of concept)")
            print(f" {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f" Moderate benefit expected.")
            print(f"\n šŸ’” Suggestion: Start with proof of concept (display only)")
            print(f" Then decide if full implementation is worth it.")
        else:
            print(f"ā„¹ļø OPTIONAL - Multi-label might be sufficient")
            print(f" Only {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f" Sentence-level might be overkill.")
            print(f"\n šŸ’” Consider:")
            print(f" • Multi-label classification (simpler)")
            print(f" • Or keep current system if working well")

        # Implementation effort
        print(f"\nšŸ“‹ Implementation Effort:")
        print(f" • Full sentence-level: 13-20 hours")
        print(f" • Proof of concept: 4-6 hours")
        print(f" • Multi-label: 4-6 hours")
        print(f"\n{'='*70}\n")

        # Export detailed results (explicit UTF-8: report contains non-ASCII,
        # and the platform default encoding may not handle it)
        export_path = "sentence_analysis_results.txt"
        with open(export_path, 'w', encoding='utf-8') as f:
            f.write("DETAILED SENTENCE-LEVEL ANALYSIS RESULTS\n")
            f.write("="*70 + "\n\n")
            f.write(f"Total Submissions: {total_submissions}\n")
            f.write(f"Multi-category Submissions: {multi_category_count} ({multi_cat_percentage:.1f}%)\n\n")
            f.write("\nDETAILED BREAKDOWN:\n\n")

            for idx, item in enumerate(category_changes, 1):
                f.write(f"\n{idx}. Submission #{item['id']}\n")
                f.write(f" Contributor: {item['contributor_type']}\n")
                f.write(f" Submission Category: {item['submission_category']}\n")
                f.write(f" Full Text: {item['text']}\n")
                f.write(f" Sentences:\n")
                for i, (sentence, category) in enumerate(zip(item['sentences'], item['sentence_categories']), 1):
                    f.write(f" {i}. [{category}] {sentence}\n")
                f.write("\n")

        print(f"šŸ“„ Detailed results exported to: {export_path}")


if __name__ == '__main__':
    try:
        analyze_submissions()
    except Exception as e:
        print(f"\nāŒ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)