# Hugging Face Space: abdfajar707/BGN_Analisis_Media2 (status: Paused)
# File size: 30,334 bytes; revision ec0d319

import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

# Import the NLP libraries
try:
    from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    from sentence_transformers import SentenceTransformer
    from keybert import KeyBERT
    import torch
except ImportError as e:
    print(f"Error importing NLP libraries: {e}")

class MediaAnalyzer:
    def __init__(self):
        self.tokenizer = None
        self.bert_model = None
        self.embedding_model = None
        self.emotion_tokenizer = None
        self.emotion_model = None
        self.emotion_config = None
        self.summarization_tokenizer = None
        self.summarization_model = None
        self.ner_analyzer = None
        self.keyword_extractor = None
        self.models_loaded = False
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Emotion labels untuk model Indonesia
        self.emotion_labels = [
            'senang', 'sedih', 'marah', 'takut', 'jijik', 'terkejut'
        ]
        
    def load_models(self):
        """Load semua model NLP yang diperlukan"""
        try:
            print("Loading NLP models...")
            print(f"Using device: {self.device}")
            
            # Model BERT Indonesia untuk tokenization dan embedding
            model_name = 'cahya/bert-base-indonesian-1.5G'
            print(f"Loading BERT model: {model_name}")
            self.tokenizer = BertTokenizer.from_pretrained(model_name)
            self.bert_model = BertModel.from_pretrained(model_name)
            self.bert_model.to(self.device)
            self.bert_model.eval()  # Set ke evaluation mode
            
            # Model Emotion Classification Indonesia
            try:
                print("Loading Indonesian emotion classification model...")
                self.emotion_tokenizer = BertTokenizer.from_pretrained("thoriqfy/indobert-emotion-classification")
                self.emotion_config = BertConfig.from_pretrained("thoriqfy/indobert-emotion-classification")
                self.emotion_model = BertForSequenceClassification.from_pretrained(
                    "thoriqfy/indobert-emotion-classification", 
                    config=self.emotion_config
                )
                self.emotion_model.to(self.device)
                self.emotion_model.eval()
                print("✅ Indonesian emotion classification model loaded successfully")
            except Exception as e:
                print(f"❌ Indonesian emotion classification model failed to load: {e}")
                self.emotion_tokenizer = None
                self.emotion_model = None
            
            # Model Summarization Indonesia T5
            try:
                print("Loading Indonesian T5 summarization model...")
                self.summarization_tokenizer = T5Tokenizer.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
                self.summarization_model = T5ForConditionalGeneration.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
                self.summarization_model.to(self.device)
                self.summarization_model.eval()
                print("✅ Indonesian T5 summarization model loaded successfully")
            except Exception as e:
                print(f"❌ Indonesian T5 summarization model failed to load: {e}")
                self.summarization_tokenizer = None
                self.summarization_model = None
            
            # Model embedding alternatif (jika BERT utama gagal)
            try:
                self.embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
                if torch.cuda.is_available():
                    self.embedding_model = self.embedding_model.to(self.device)
            except Exception as e:
                print(f"SentenceTransformer failed: {e}")
                self.embedding_model = None
            
            # NER dengan model Indonesia
            try:
                from transformers import pipeline
                self.ner_analyzer = pipeline(
                    "ner",
                    model="indolem/indobert-base-uncased-ner",
                    tokenizer="indolem/indobert-base-uncased-ner",
                    aggregation_strategy="simple",
                    device=0 if torch.cuda.is_available() else -1
                )
            except Exception as e:
                print(f"NER analyzer failed to load: {e}")
                self.ner_analyzer = None
            
            # Keyword extraction dengan KeyBERT menggunakan BERT Indonesia
            try:
                # Gunakan BERT Indonesia untuk KeyBERT
                self.keyword_extractor = KeyBERT(model='cahya/bert-base-indonesian-1.5G')
            except Exception as e:
                print(f"KeyBERT with IndoBERT failed: {e}, using default")
                try:
                    self.keyword_extractor = KeyBERT()
                except:
                    self.keyword_extractor = None
            
            self.models_loaded = True
            print("All models loaded successfully!")
            
        except Exception as e:
            print(f"Error loading models: {e}")
            self.models_loaded = False
    
    def clean_text(self, text):
        """Cleaning text dasar"""
        if pd.isna(text):
            return ""
        
        text = str(text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove mentions dan hashtags
        text = re.sub(r'@\w+|#\w+', '', text)
        # Remove karakter khusus
        text = re.sub(r'[^\w\s.,!?]', ' ', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove angka
        text = re.sub(r'\d+', '', text)
        
        return text
    
    def preprocess_text(self, texts):
        """Preprocessing batch text dengan tokenizer BERT Indonesia"""
        cleaned_texts = [self.clean_text(text) for text in texts]
        
        # Tokenization menggunakan BERT tokenizer
        if self.models_loaded and self.tokenizer is not None:
            try:
                tokenized_texts = []
                for text in cleaned_texts:
                    if text.strip():
                        tokens = self.tokenizer.tokenize(text)
                        tokenized_texts.append(" ".join(tokens))
                    else:
                        tokenized_texts.append("")
                return tokenized_texts
            except Exception as e:
                print(f"Tokenization error: {e}")
                return cleaned_texts
        else:
            return cleaned_texts
    
    def create_embeddings(self, texts):
        """Membuat embedding menggunakan BERT Indonesia dengan PyTorch"""
        if not self.models_loaded or self.bert_model is None:
            return None
        
        try:
            # Method 1: Gunakan SentenceTransformer jika tersedia
            if self.embedding_model is not None:
                embeddings = self.embedding_model.encode(texts)
                return embeddings
            
            # Method 2: Gunakan BERT Indonesia langsung dengan PyTorch
            embeddings = []
            
            with torch.no_grad():  # Nonaktifkan gradient calculation untuk inference
                for text in texts:
                    if text.strip():
                        # Tokenize dan encode text
                        encoded_input = self.tokenizer(
                            text, 
                            return_tensors='pt', 
                            max_length=512, 
                            truncation=True,
                            padding=True
                        )
                        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
                        
                        # Dapatkan embeddings dari BERT
                        outputs = self.bert_model(**encoded_input)
                        # Gunakan [CLS] token embedding sebagai representasi dokumen
                        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                        embeddings.append(cls_embedding[0])
                    else:
                        embeddings.append(np.zeros(768))  # Default size untuk BERT base
            
            return np.array(embeddings)
            
        except Exception as e:
            print(f"Error creating embeddings: {e}")
            return None
    
    def analyze_emotion(self, texts):
        """Analisis emosi untuk kumpulan teks menggunakan model Indonesia"""
        if not self.models_loaded or self.emotion_model is None:
            # Fallback emotion analysis menggunakan keyword-based approach
            return self._fallback_emotion_analysis(texts)
        
        try:
            emotions = []
            
            for text in texts:
                if len(text.strip()) > 10:  # Minimal panjang teks
                    try:
                        # Tokenize text
                        inputs = self.emotion_tokenizer(
                            text[:512],  # Truncate untuk model
                            return_tensors='pt',
                            max_length=512,
                            truncation=True,
                            padding=True
                        )
                        inputs = {k: v.to(self.device) for k, v in inputs.items()}
                        
                        # Predict emotion
                        with torch.no_grad():
                            outputs = self.emotion_model(**inputs)
                            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                        
                        # Get predicted emotion and confidence
                        confidence, predicted_class = torch.max(predictions, dim=1)
                        emotion_idx = predicted_class.cpu().item()
                        confidence_score = confidence.cpu().item()
                        
                        # Map to emotion label
                        if emotion_idx < len(self.emotion_labels):
                            emotion_label = self.emotion_labels[emotion_idx]
                        else:
                            emotion_label = 'netral'
                        
                        emotions.append({
                            'emotion': emotion_label,
                            'confidence': round(confidence_score, 3),
                            'all_scores': predictions.cpu().numpy()[0]
                        })
                        
                    except Exception as e:
                        print(f"Error in emotion prediction for text: {e}")
                        emotions.append({
                            'emotion': 'netral',
                            'confidence': 0.0,
                            'all_scores': None
                        })
                else:
                    emotions.append({
                        'emotion': 'netral',
                        'confidence': 0.0,
                        'all_scores': None
                    })
            
            return emotions
            
        except Exception as e:
            print(f"Error in emotion analysis: {e}")
            return self._fallback_emotion_analysis(texts)
    
    def _fallback_emotion_analysis(self, texts):
        """Fallback emotion analysis menggunakan keyword matching"""
        emotion_keywords = {
            'senang': ['senang', 'bahagia', 'gembira', 'suka', 'senang', 'bangga', 'puas', 'legah', 'sukacita'],
            'sedih': ['sedih', 'duka', 'pilu', 'kecewa', 'menyesal', 'haru', 'murung', 'nestapa', 'duka'],
            'marah': ['marah', 'jengkel', 'kesal', 'geram', 'benci', 'dendam', 'gemas', 'berang', 'naik darah'],
            'takut': ['takut', 'khawatir', 'cemas', 'waswas', 'ngeri', 'gentar', 'tremor', 'panik', 'was-was'],
            'jijik': ['jijik', 'muak', 'mual', 'enggan', 'menjijikkan', 'kotor', 'kumuh'],
            'terkejut': ['terkejut', 'kaget', 'heran', 'takjub', 'terperanjat', 'mengagetkan'],
            'netral': ['netral', 'biasa', 'wajar', 'normal', 'lazim', 'umum']
        }
        
        emotions = []
        for text in texts:
            text_lower = text.lower()
            emotion_scores = {emotion: 0 for emotion in emotion_keywords.keys()}
            
            for emotion, keywords in emotion_keywords.items():
                for keyword in keywords:
                    if keyword in text_lower:
                        emotion_scores[emotion] += 1
            
            # Pilih emotion dengan score tertinggi
            max_score = max(emotion_scores.values())
            if max_score > 0:
                dominant_emotion = max(emotion_scores.items(), key=lambda x: x[1])[0]
                confidence = emotion_scores[dominant_emotion] / sum(emotion_scores.values())
            else:
                dominant_emotion = 'netral'
                confidence = 0.0
            
            emotions.append({
                'emotion': dominant_emotion,
                'confidence': round(confidence, 3),
                'all_scores': None
            })
        
        return emotions
    
    def extract_keywords(self, texts, top_n=5):
        """Ekstraksi keyword dari teks menggunakan KeyBERT dengan BERT Indonesia"""
        if not self.models_loaded or self.keyword_extractor is None:
            return self._fallback_keyword_extraction(texts, top_n)
        
        try:
            all_keywords = []
            for text in texts:
                if len(text.strip()) > 20:
                    keywords = self.keyword_extractor.extract_keywords(
                        text, 
                        keyphrase_ngram_range=(1, 2), 
                        stop_words=None,
                        top_n=top_n,
                        diversity=0.5
                    )
                    all_keywords.append([kw[0] for kw in keywords])
                else:
                    all_keywords.append([])
            return all_keywords
        except Exception as e:
            print(f"Error in keyword extraction: {e}")
            return self._fallback_keyword_extraction(texts, top_n)
    
    def _fallback_keyword_extraction(self, texts, top_n=5):
        """Fallback keyword extraction menggunakan TF-IDF sederhana"""
        from collections import Counter
        import re
        
        # Indonesian stopwords
        stopwords = {
            'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'dengan', 'ini', 'itu', 'ada',
            'tidak', 'dalam', 'akan', 'atau', 'juga', 'saya', 'kamu', 'kami', 'kita', 'mereka',
            'adalah', 'sudah', 'belum', 'telah', 'dapat', 'bisa', 'boleh', 'harus', 'perlu',
            'lagi', 'saja', 'hanya', 'sangat', 'sekali', 'lebih', 'paling', 'sementara'
        }
        
        all_keywords = []
        for text in texts:
            if len(text.strip()) > 10:
                # Tokenize sederhana
                words = re.findall(r'\b\w+\b', text.lower())
                # Hapus stopwords
                words = [word for word in words if word not in stopwords and len(word) > 2]
                # Ambil kata paling umum
                word_counts = Counter(words)
                keywords = [word for word, count in word_counts.most_common(top_n)]
                all_keywords.append(keywords)
            else:
                all_keywords.append([])
        return all_keywords
    
    def summarize_text(self, texts, max_length=100, min_length=30):
        """Summarization teks menggunakan model T5 Indonesia"""
        if not self.models_loaded or self.summarization_model is None:
            return self._fallback_summarization(texts, max_length)
        
        try:
            summaries = []
            for text in texts:
                if len(text.strip()) > 100:
                    try:
                        # Preprocess text untuk summarization
                        cleaned_text = self.clean_text(text)
                        
                        # Encode text
                        input_ids = self.summarization_tokenizer.encode(
                            cleaned_text, 
                            return_tensors='pt',
                            max_length=512,
                            truncation=True
                        ).to(self.device)
                        
                        # Generate summary dengan parameter yang dioptimalkan
                        summary_ids = self.summarization_model.generate(
                            input_ids,
                            min_length=min_length,
                            max_length=max_length,
                            num_beams=8,
                            repetition_penalty=2.5,
                            length_penalty=1.0,
                            early_stopping=True,
                            no_repeat_ngram_size=2,
                            use_cache=True,
                            do_sample=True,
                            temperature=0.8,
                            top_k=50,
                            top_p=0.95
                        )
                        
                        # Decode summary
                        summary_text = self.summarization_tokenizer.decode(
                            summary_ids[0], 
                            skip_special_tokens=True
                        )
                        
                        # Post-processing summary
                        if summary_text.strip():
                            # Hilangkan kutipan jika ada
                            summary_text = re.sub(r'^["\']|["\']$', '', summary_text)
                            summaries.append(summary_text.strip())
                        else:
                            summaries.append(self._fallback_summarization([text], max_length)[0])
                            
                    except Exception as e:
                        print(f"Error in T5 summarization: {e}")
                        summaries.append(self._fallback_summarization([text], max_length)[0])
                else:
                    # Untuk teks pendek, gunakan teks asli
                    summaries.append(text[:max_length])
            
            return summaries
            
        except Exception as e:
            print(f"Error in summarization: {e}")
            return self._fallback_summarization(texts, max_length)
    
    def _fallback_summarization(self, texts, max_length=100):
        """Fallback summarization menggunakan ekstraksi kalimat penting"""
        summaries = []
        for text in texts:
            if len(text.strip()) > 100:
                # Split menjadi kalimat
                sentences = re.split(r'[.!?]+', text)
                sentences = [s.strip() for s in sentences if s.strip()]
                
                # Ambil 2-3 kalimat pertama sebagai summary sederhana
                if len(sentences) > 3:
                    summary = ' '.join(sentences[:3])
                else:
                    summary = ' '.join(sentences)
                
                summaries.append(summary[:max_length])
            else:
                summaries.append(text[:max_length])
        return summaries
    
    def perform_ner(self, texts):
        """Named Entity Recognition"""
        if not self.models_loaded or self.ner_analyzer is None:
            return self._fallback_ner(texts)
        
        try:
            all_entities = []
            for text in texts:
                if len(text.strip()) > 10:
                    entities = self.ner_analyzer(text[:512])
                    # Group entities by type
                    entity_dict = {}
                    for entity in entities:
                        entity_type = entity['entity_group']
                        if entity_type not in entity_dict:
                            entity_dict[entity_type] = []
                        entity_dict[entity_type].append(entity['word'])
                    all_entities.append(entity_dict)
                else:
                    all_entities.append({})
            return all_entities
        except Exception as e:
            print(f"Error in NER: {e}")
            return self._fallback_ner(texts)
    
    def _fallback_ner(self, texts):
        """Fallback NER menggunakan pattern matching sederhana"""
        all_entities = []
        for text in texts:
            entities = {}
            
            # Pattern matching sederhana untuk entitas umum
            # Orang (kapital di tengah kalimat)
            people = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
            if people:
                entities['PER'] = list(set(people))
            
            # Organisasi (mengandung kata tertentu)
            org_keywords = ['PT', 'CV', 'Inc', 'Corp', 'Company', 'Perusahaan', 'Universitas', 'Institut']
            organizations = []
            for word in org_keywords:
                orgs = re.findall(rf'\b{word}\.?\s+[A-Za-z]+\b', text)
                organizations.extend(orgs)
            if organizations:
                entities['ORG'] = organizations
            
            # Lokasi (kata kapital yang mungkin lokasi)
            locations = re.findall(r'\b[A-Z][a-z]+(?: [A-Z][a-z]+)*\b', text)
            # Filter yang bukan orang (nama depan + belakang)
            common_locations = ['Indonesia', 'Jakarta', 'Surabaya', 'Bandung', 'Medan', 'Semarang']
            locations = [loc for loc in locations if len(loc.split()) == 1 or loc in common_locations]
            if locations:
                entities['LOC'] = locations
            
            all_entities.append(entities)
        
        return all_entities
    
    def analyze_media(self, df, content_column='content'):
        """Pipeline analisis media lengkap"""
        print("Starting media analysis pipeline...")
        
        # Load models jika belum diload
        if not self.models_loaded:
            self.load_models()
        
        if not self.models_loaded:
            print("Models failed to load, using basic analysis")
            return self._basic_analysis(df, content_column)
        
        # Filter data yang memiliki content
        valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
        analysis_df = df[valid_content].copy()
        
        if len(analysis_df) == 0:
            print("No valid content to analyze")
            return df, pd.DataFrame()
        
        print(f"Analyzing {len(analysis_df)} articles...")
        
        # Pipeline analisis
        texts = analysis_df[content_column].tolist()
        
        # 1. Cleaning dan preprocessing
        print("Step 1: Cleaning and preprocessing text...")
        cleaned_texts = self.preprocess_text(texts)
        
        # 2. Embedding (sample untuk efisiensi jika data banyak)
        print("Step 2: Creating embeddings...")
        if len(cleaned_texts) > 100:
            sample_texts = cleaned_texts[:100]
            embeddings = self.create_embeddings(sample_texts)
        else:
            embeddings = self.create_embeddings(cleaned_texts)
        
        # 3. Emotion Analysis dengan model Indonesia
        print("Step 3: Analyzing emotions with Indonesian model...")
        emotions = self.analyze_emotion(cleaned_texts)
        
        # 4. Keyword Extraction
        print("Step 4: Extracting keywords...")
        keywords = self.extract_keywords(cleaned_texts)
        
        # 5. Summarization dengan model T5 Indonesia
        print("Step 5: Generating summaries with T5 model...")
        long_text_indices = [i for i, text in enumerate(cleaned_texts) if len(text) > 100]
        summaries = [""] * len(cleaned_texts)
        
        if long_text_indices and self.summarization_model:
            print(f"Generating summaries for {len(long_text_indices)} long texts...")
            long_texts = [cleaned_texts[i] for i in long_text_indices]
            try:
                long_summaries = self.summarize_text(long_texts, max_length=80, min_length=20)
                for i, summary in zip(long_text_indices, long_summaries):
                    summaries[i] = summary
                print("✅ T5 summarization completed successfully")
            except Exception as e:
                print(f"❌ T5 summarization failed: {e}")
                # Fallback untuk teks yang gagal
                fallback_summaries = self._fallback_summarization(long_texts, 80)
                for i, summary in zip(long_text_indices, fallback_summaries):
                    summaries[i] = summary
        else:
            print("No long texts found for summarization")
        
        # 6. NER (sample untuk efisiensi jika data banyak)
        print("Step 6: Performing NER...")
        if len(cleaned_texts) > 50:
            ner_texts = cleaned_texts[:50]
            ner_results = self.perform_ner(ner_texts)
            # Extend untuk semua teks
            ner_results.extend([{}] * (len(cleaned_texts) - len(ner_results)))
        else:
            ner_results = self.perform_ner(cleaned_texts)
        
        # Tambahkan hasil analisis ke DataFrame
        analysis_df = analysis_df.reset_index(drop=True)
        
        analysis_df['cleaned_content'] = cleaned_texts
        analysis_df['emotion'] = [e['emotion'] for e in emotions]
        analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]
        analysis_df['keywords'] = keywords
        analysis_df['summary'] = summaries
        analysis_df['entities'] = ner_results
        
        # Hitung statistik tambahan
        analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
        analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()
        analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0
        
        # Buat DataFrame terpisah untuk statistik
        emotion_distribution = pd.Series([e['emotion'] for e in emotions]).value_counts().to_dict()
        
        stats_data = {
            'total_articles': len(analysis_df),
            'articles_analyzed': len(analysis_df),
            'articles_with_emotion': len([e for e in emotions if e['emotion'] != 'netral' and e['confidence'] > 0.3]),
            'articles_with_summary': analysis_df['has_summary'].sum(),
            'avg_emotion_confidence': np.mean([e['confidence'] for e in emotions]),
            'total_keywords': sum(len(kw) for kw in keywords),
            'avg_content_length': np.mean(analysis_df['content_length']),
            'avg_word_count': np.mean(analysis_df['word_count']),
            'emotion_distribution': emotion_distribution,
            'total_entities': sum(len(entities) for entities in ner_results),
            'summarization_model': 'T5-Indonesian' if self.summarization_model else 'Fallback',
            'emotion_model': 'IndoBERT-Emotion' if self.emotion_model else 'Fallback'
        }
        
        stats_df = pd.DataFrame([stats_data])
        
        print("Media analysis completed!")
        print(f"Emotion distribution: {emotion_distribution}")
        return analysis_df, stats_df
    
    def _basic_analysis(self, df, content_column='content'):
        """Analisis dasar jika model NLP gagal load"""
        print("Performing basic analysis...")
        
        valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
        analysis_df = df[valid_content].copy()
        
        if len(analysis_df) == 0:
            return df, pd.DataFrame()
        
        # Basic cleaning
        analysis_df['cleaned_content'] = analysis_df[content_column].apply(self.clean_text)
        analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
        analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()
        
        # Basic emotion analysis fallback
        emotions = self._fallback_emotion_analysis(analysis_df['cleaned_content'].tolist())
        analysis_df['emotion'] = [e['emotion'] for e in emotions]
        analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]
        
        # Basic keyword extraction
        keywords = self._fallback_keyword_extraction(analysis_df['cleaned_content'].tolist())
        analysis_df['keywords'] = keywords
        
        # Basic summarization fallback
        summaries = self._fallback_summarization(analysis_df['cleaned_content'].tolist())
        analysis_df['summary'] = summaries
        analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0
        
        # Basic stats
        stats_data = {
            'total_articles': len(analysis_df),
            'articles_analyzed': len(analysis_df),
            'avg_content_length': analysis_df['content_length'].mean(),
            'avg_word_count': analysis_df['word_count'].mean(),
            'total_sources': analysis_df['source'].nunique(),
            'emotion_distribution': analysis_df['emotion'].value_counts().to_dict(),
            'articles_with_summary': analysis_df['has_summary'].sum(),
            'summarization_model': 'Fallback',
            'emotion_model': 'Fallback'
        }
        
        stats_df = pd.DataFrame([stats_data])
        
        return analysis_df, stats_df

# Shared module-level instance. Constructing it does not load any model;
# analyze_media() calls load_models() on first use.
media_analyzer = MediaAnalyzer()