# analisis_media.py
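"""Media analysis pipeline for Indonesian-language articles.

Wraps Indonesian BERT and T5 models (with keyword- and pattern-based
fallbacks) for emotion classification, keyword extraction, summarization,
and named-entity recognition over a pandas DataFrame of article content.
"""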
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
# NLP / deep-learning libraries
try:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import torch
except ImportError as e:
    # torch and transformers are hard dependencies: MediaAnalyzer.__init__
    # below calls torch.device(), so fail fast with a clear message instead
    # of a confusing NameError later.
    print(f"Error importing NLP libraries: {e}")
    raise
class MediaAnalyzer:
def __init__(self):
self.tokenizer = None
self.bert_model = None
self.embedding_model = None
self.emotion_tokenizer = None
self.emotion_model = None
self.emotion_config = None
self.summarization_tokenizer = None
self.summarization_model = None
self.ner_analyzer = None
self.keyword_extractor = None
self.models_loaded = False
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Emotion labels for the Indonesian emotion model.
        # NOTE: this hard-coded order is assumed to match the label order in
        # the model's config (id2label); if it differs, predictions will be
        # mislabeled.
        self.emotion_labels = [
            'senang', 'sedih', 'marah', 'takut', 'jijik', 'terkejut'
        ]
def load_models(self):
"""Load semua model NLP yang diperlukan"""
try:
print("Loading NLP models...")
print(f"Using device: {self.device}")
            # Indonesian BERT for tokenization and embeddings
model_name = 'cahya/bert-base-indonesian-1.5G'
print(f"Loading BERT model: {model_name}")
self.tokenizer = BertTokenizer.from_pretrained(model_name)
self.bert_model = BertModel.from_pretrained(model_name)
self.bert_model.to(self.device)
            self.bert_model.eval()  # evaluation mode (disables dropout)
            # Indonesian emotion classification model
try:
print("Loading Indonesian emotion classification model...")
self.emotion_tokenizer = BertTokenizer.from_pretrained("thoriqfy/indobert-emotion-classification")
self.emotion_config = BertConfig.from_pretrained("thoriqfy/indobert-emotion-classification")
self.emotion_model = BertForSequenceClassification.from_pretrained(
"thoriqfy/indobert-emotion-classification",
config=self.emotion_config
)
self.emotion_model.to(self.device)
self.emotion_model.eval()
print("βœ… Indonesian emotion classification model loaded successfully")
except Exception as e:
print(f"❌ Indonesian emotion classification model failed to load: {e}")
self.emotion_tokenizer = None
self.emotion_model = None
            # Indonesian T5 summarization model
try:
print("Loading Indonesian T5 summarization model...")
self.summarization_tokenizer = T5Tokenizer.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
self.summarization_model = T5ForConditionalGeneration.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
self.summarization_model.to(self.device)
self.summarization_model.eval()
print("βœ… Indonesian T5 summarization model loaded successfully")
except Exception as e:
print(f"❌ Indonesian T5 summarization model failed to load: {e}")
self.summarization_tokenizer = None
self.summarization_model = None
            # Fallback embedding model (used if the primary BERT path fails)
try:
self.embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
if torch.cuda.is_available():
self.embedding_model = self.embedding_model.to(self.device)
except Exception as e:
print(f"SentenceTransformer failed: {e}")
self.embedding_model = None
            # NER pipeline with an Indonesian model
try:
from transformers import pipeline
self.ner_analyzer = pipeline(
"ner",
model="indolem/indobert-base-uncased-ner",
tokenizer="indolem/indobert-base-uncased-ner",
aggregation_strategy="simple",
device=0 if torch.cuda.is_available() else -1
)
except Exception as e:
print(f"NER analyzer failed to load: {e}")
self.ner_analyzer = None
            # Keyword extraction with KeyBERT, backed by the Indonesian BERT
            try:
                self.keyword_extractor = KeyBERT(model='cahya/bert-base-indonesian-1.5G')
            except Exception as e:
                print(f"KeyBERT with IndoBERT failed: {e}, using default")
                try:
                    self.keyword_extractor = KeyBERT()
                except Exception:
                    self.keyword_extractor = None
self.models_loaded = True
print("All models loaded successfully!")
except Exception as e:
print(f"Error loading models: {e}")
self.models_loaded = False
    def clean_text(self, text):
        """Basic text cleaning: strip URLs, mentions, digits, and noise."""
        if pd.isna(text):
            return ""
        text = str(text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)
        # Remove special characters (keep word chars and basic punctuation)
        text = re.sub(r'[^\w\s.,!?]', ' ', text)
        # Remove digits before collapsing whitespace, so no double spaces remain
        text = re.sub(r'\d+', '', text)
        # Collapse extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text
def preprocess_text(self, texts):
"""Preprocessing batch text dengan tokenizer BERT Indonesia"""
cleaned_texts = [self.clean_text(text) for text in texts]
        # Tokenize with the BERT tokenizer (note: WordPiece pieces keep their '##' prefixes)
if self.models_loaded and self.tokenizer is not None:
try:
tokenized_texts = []
for text in cleaned_texts:
if text.strip():
tokens = self.tokenizer.tokenize(text)
tokenized_texts.append(" ".join(tokens))
else:
tokenized_texts.append("")
return tokenized_texts
except Exception as e:
print(f"Tokenization error: {e}")
return cleaned_texts
else:
return cleaned_texts
def create_embeddings(self, texts):
"""Membuat embedding menggunakan BERT Indonesia dengan PyTorch"""
if not self.models_loaded or self.bert_model is None:
return None
try:
# Method 1: Gunakan SentenceTransformer jika tersedia
if self.embedding_model is not None:
embeddings = self.embedding_model.encode(texts)
return embeddings
# Method 2: Gunakan BERT Indonesia langsung dengan PyTorch
embeddings = []
with torch.no_grad(): # Nonaktifkan gradient calculation untuk inference
for text in texts:
if text.strip():
# Tokenize dan encode text
encoded_input = self.tokenizer(
text,
return_tensors='pt',
max_length=512,
truncation=True,
padding=True
)
encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
# Dapatkan embeddings dari BERT
outputs = self.bert_model(**encoded_input)
# Gunakan [CLS] token embedding sebagai representasi dokumen
cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
embeddings.append(cls_embedding[0])
else:
embeddings.append(np.zeros(768)) # Default size untuk BERT base
return np.array(embeddings)
except Exception as e:
print(f"Error creating embeddings: {e}")
return None
def analyze_emotion(self, texts):
"""Analisis emosi untuk kumpulan teks menggunakan model Indonesia"""
if not self.models_loaded or self.emotion_model is None:
            # Fall back to keyword-based emotion analysis
return self._fallback_emotion_analysis(texts)
try:
emotions = []
for text in texts:
                if len(text.strip()) > 10:  # require a minimum text length
                    try:
                        # Tokenize the text
                        inputs = self.emotion_tokenizer(
                            text[:512],  # truncate to the model's input limit
                            return_tensors='pt',
                            max_length=512,
                            truncation=True,
                            padding=True
                        )
                        inputs = {k: v.to(self.device) for k, v in inputs.items()}
                        # Predict the emotion distribution
                        with torch.no_grad():
                            outputs = self.emotion_model(**inputs)
                            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                        # Get the predicted class and its confidence
                        confidence, predicted_class = torch.max(predictions, dim=1)
                        emotion_idx = predicted_class.cpu().item()
                        confidence_score = confidence.cpu().item()
                        # Map the class index to an emotion label; the order in
                        # self.emotion_labels is assumed to match the model config
                        if emotion_idx < len(self.emotion_labels):
                            emotion_label = self.emotion_labels[emotion_idx]
                        else:
                            emotion_label = 'netral'
emotions.append({
'emotion': emotion_label,
'confidence': round(confidence_score, 3),
'all_scores': predictions.cpu().numpy()[0]
})
except Exception as e:
print(f"Error in emotion prediction for text: {e}")
emotions.append({
'emotion': 'netral',
'confidence': 0.0,
'all_scores': None
})
else:
emotions.append({
'emotion': 'netral',
'confidence': 0.0,
'all_scores': None
})
return emotions
except Exception as e:
print(f"Error in emotion analysis: {e}")
return self._fallback_emotion_analysis(texts)
    def _fallback_emotion_analysis(self, texts):
        """Fallback emotion analysis via Indonesian keyword matching."""
        emotion_keywords = {
            'senang': ['senang', 'bahagia', 'gembira', 'suka', 'bangga', 'puas', 'lega', 'sukacita'],
            'sedih': ['sedih', 'duka', 'pilu', 'kecewa', 'menyesal', 'haru', 'murung', 'nestapa'],
            'marah': ['marah', 'jengkel', 'kesal', 'geram', 'benci', 'dendam', 'gemas', 'berang', 'naik darah'],
            'takut': ['takut', 'khawatir', 'cemas', 'waswas', 'ngeri', 'gentar', 'tremor', 'panik', 'was-was'],
            'jijik': ['jijik', 'muak', 'mual', 'enggan', 'menjijikkan', 'kotor', 'kumuh'],
            'terkejut': ['terkejut', 'kaget', 'heran', 'takjub', 'terperanjat', 'mengagetkan'],
            'netral': ['netral', 'biasa', 'wajar', 'normal', 'lazim', 'umum']
        }
emotions = []
for text in texts:
text_lower = text.lower()
emotion_scores = {emotion: 0 for emotion in emotion_keywords.keys()}
for emotion, keywords in emotion_keywords.items():
for keyword in keywords:
if keyword in text_lower:
emotion_scores[emotion] += 1
            # Pick the emotion with the highest score
max_score = max(emotion_scores.values())
if max_score > 0:
dominant_emotion = max(emotion_scores.items(), key=lambda x: x[1])[0]
confidence = emotion_scores[dominant_emotion] / sum(emotion_scores.values())
else:
dominant_emotion = 'netral'
confidence = 0.0
emotions.append({
'emotion': dominant_emotion,
'confidence': round(confidence, 3),
'all_scores': None
})
return emotions
def extract_keywords(self, texts, top_n=5):
"""Ekstraksi keyword dari teks menggunakan KeyBERT dengan BERT Indonesia"""
if not self.models_loaded or self.keyword_extractor is None:
return self._fallback_keyword_extraction(texts, top_n)
try:
all_keywords = []
for text in texts:
if len(text.strip()) > 20:
                    keywords = self.keyword_extractor.extract_keywords(
                        text,
                        keyphrase_ngram_range=(1, 2),
                        stop_words=None,
                        top_n=top_n,
                        use_mmr=True,  # MMR must be enabled for diversity to take effect
                        diversity=0.5
                    )
all_keywords.append([kw[0] for kw in keywords])
else:
all_keywords.append([])
return all_keywords
except Exception as e:
print(f"Error in keyword extraction: {e}")
return self._fallback_keyword_extraction(texts, top_n)
    def _fallback_keyword_extraction(self, texts, top_n=5):
        """Fallback keyword extraction using simple frequency counts."""
        from collections import Counter
# Indonesian stopwords
stopwords = {
'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'dengan', 'ini', 'itu', 'ada',
'tidak', 'dalam', 'akan', 'atau', 'juga', 'saya', 'kamu', 'kami', 'kita', 'mereka',
'adalah', 'sudah', 'belum', 'telah', 'dapat', 'bisa', 'boleh', 'harus', 'perlu',
'lagi', 'saja', 'hanya', 'sangat', 'sekali', 'lebih', 'paling', 'sementara'
}
all_keywords = []
for text in texts:
if len(text.strip()) > 10:
                # Simple word tokenization
                words = re.findall(r'\b\w+\b', text.lower())
                # Drop stopwords and very short tokens
                words = [word for word in words if word not in stopwords and len(word) > 2]
                # Keep the most frequent words as keywords
                word_counts = Counter(words)
                keywords = [word for word, count in word_counts.most_common(top_n)]
all_keywords.append(keywords)
else:
all_keywords.append([])
return all_keywords
    def summarize_text(self, texts, max_length=100, min_length=30):
        """Summarize texts with the Indonesian T5 model."""
if not self.models_loaded or self.summarization_model is None:
return self._fallback_summarization(texts, max_length)
try:
summaries = []
for text in texts:
if len(text.strip()) > 100:
try:
                        # Clean the text before summarization
cleaned_text = self.clean_text(text)
                        # Encode the input text
input_ids = self.summarization_tokenizer.encode(
cleaned_text,
return_tensors='pt',
max_length=512,
truncation=True
).to(self.device)
                        # Generate the summary; note that do_sample=True combined
                        # with num_beams=8 performs beam-search multinomial sampling
summary_ids = self.summarization_model.generate(
input_ids,
min_length=min_length,
max_length=max_length,
num_beams=8,
repetition_penalty=2.5,
length_penalty=1.0,
early_stopping=True,
no_repeat_ngram_size=2,
use_cache=True,
do_sample=True,
temperature=0.8,
top_k=50,
top_p=0.95
)
# Decode summary
summary_text = self.summarization_tokenizer.decode(
summary_ids[0],
skip_special_tokens=True
)
                        # Post-process the summary
                        if summary_text.strip():
                            # Strip surrounding quote marks, if any
                            summary_text = re.sub(r'^["\']|["\']$', '', summary_text)
summaries.append(summary_text.strip())
else:
summaries.append(self._fallback_summarization([text], max_length)[0])
except Exception as e:
print(f"Error in T5 summarization: {e}")
summaries.append(self._fallback_summarization([text], max_length)[0])
else:
                    # For short texts, keep the original text (truncated)
summaries.append(text[:max_length])
return summaries
except Exception as e:
print(f"Error in summarization: {e}")
return self._fallback_summarization(texts, max_length)
    def _fallback_summarization(self, texts, max_length=100):
        """Fallback summarization: extract the leading sentences."""
summaries = []
for text in texts:
if len(text.strip()) > 100:
                # Split into sentences
                sentences = re.split(r'[.!?]+', text)
                sentences = [s.strip() for s in sentences if s.strip()]
                # Use the first three sentences as a simple extractive summary
if len(sentences) > 3:
summary = ' '.join(sentences[:3])
else:
summary = ' '.join(sentences)
summaries.append(summary[:max_length])
else:
summaries.append(text[:max_length])
return summaries
def perform_ner(self, texts):
"""Named Entity Recognition"""
if not self.models_loaded or self.ner_analyzer is None:
return self._fallback_ner(texts)
try:
all_entities = []
for text in texts:
if len(text.strip()) > 10:
entities = self.ner_analyzer(text[:512])
# Group entities by type
entity_dict = {}
for entity in entities:
entity_type = entity['entity_group']
if entity_type not in entity_dict:
entity_dict[entity_type] = []
entity_dict[entity_type].append(entity['word'])
all_entities.append(entity_dict)
else:
all_entities.append({})
return all_entities
except Exception as e:
print(f"Error in NER: {e}")
return self._fallback_ner(texts)
    def _fallback_ner(self, texts):
        """Fallback NER using simple pattern matching."""
        all_entities = []
        for text in texts:
            entities = {}
            # Simple patterns for common entity types.
            # People: two consecutive capitalized words (first + last name)
people = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
if people:
entities['PER'] = list(set(people))
            # Organizations: phrases containing organizational keywords
org_keywords = ['PT', 'CV', 'Inc', 'Corp', 'Company', 'Perusahaan', 'Universitas', 'Institut']
organizations = []
for word in org_keywords:
orgs = re.findall(rf'\b{word}\.?\s+[A-Za-z]+\b', text)
organizations.extend(orgs)
if organizations:
entities['ORG'] = organizations
            # Locations: capitalized words that may be place names
            locations = re.findall(r'\b[A-Z][a-z]+(?: [A-Z][a-z]+)*\b', text)
            # Filter out likely person names (two capitalized words), keeping
            # single-word matches and known location names
            common_locations = ['Indonesia', 'Jakarta', 'Surabaya', 'Bandung', 'Medan', 'Semarang']
locations = [loc for loc in locations if len(loc.split()) == 1 or loc in common_locations]
if locations:
entities['LOC'] = locations
all_entities.append(entities)
return all_entities
def analyze_media(self, df, content_column='content'):
"""Pipeline analisis media lengkap"""
print("Starting media analysis pipeline...")
        # Load the models if they have not been loaded yet
if not self.models_loaded:
self.load_models()
if not self.models_loaded:
print("Models failed to load, using basic analysis")
return self._basic_analysis(df, content_column)
        # Keep only rows that have usable content
valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
analysis_df = df[valid_content].copy()
if len(analysis_df) == 0:
print("No valid content to analyze")
return df, pd.DataFrame()
print(f"Analyzing {len(analysis_df)} articles...")
        # Analysis pipeline
texts = analysis_df[content_column].tolist()
        # 1. Clean and preprocess the text
print("Step 1: Cleaning and preprocessing text...")
cleaned_texts = self.preprocess_text(texts)
        # 2. Embeddings (sampled for efficiency on large datasets).
        # Note: the embeddings are computed here but not attached to the
        # results DataFrame; they are left available for downstream use
        # such as clustering or similarity search.
        print("Step 2: Creating embeddings...")
        if len(cleaned_texts) > 100:
            sample_texts = cleaned_texts[:100]
            embeddings = self.create_embeddings(sample_texts)
        else:
            embeddings = self.create_embeddings(cleaned_texts)
        # 3. Emotion analysis with the Indonesian model
print("Step 3: Analyzing emotions with Indonesian model...")
emotions = self.analyze_emotion(cleaned_texts)
# 4. Keyword Extraction
print("Step 4: Extracting keywords...")
keywords = self.extract_keywords(cleaned_texts)
        # 5. Summarization with the Indonesian T5 model
print("Step 5: Generating summaries with T5 model...")
long_text_indices = [i for i, text in enumerate(cleaned_texts) if len(text) > 100]
summaries = [""] * len(cleaned_texts)
if long_text_indices and self.summarization_model:
print(f"Generating summaries for {len(long_text_indices)} long texts...")
long_texts = [cleaned_texts[i] for i in long_text_indices]
try:
long_summaries = self.summarize_text(long_texts, max_length=80, min_length=20)
for i, summary in zip(long_text_indices, long_summaries):
summaries[i] = summary
print("βœ… T5 summarization completed successfully")
except Exception as e:
print(f"❌ T5 summarization failed: {e}")
                # Fall back for the batch of texts that failed
fallback_summaries = self._fallback_summarization(long_texts, 80)
for i, summary in zip(long_text_indices, fallback_summaries):
summaries[i] = summary
else:
print("No long texts found for summarization")
        # 6. NER (sampled for efficiency on large datasets)
print("Step 6: Performing NER...")
if len(cleaned_texts) > 50:
ner_texts = cleaned_texts[:50]
ner_results = self.perform_ner(ner_texts)
            # Pad with empty dicts so the list covers all texts
ner_results.extend([{}] * (len(cleaned_texts) - len(ner_results)))
else:
ner_results = self.perform_ner(cleaned_texts)
        # Attach the analysis results to the DataFrame
analysis_df = analysis_df.reset_index(drop=True)
analysis_df['cleaned_content'] = cleaned_texts
analysis_df['emotion'] = [e['emotion'] for e in emotions]
analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]
analysis_df['keywords'] = keywords
analysis_df['summary'] = summaries
analysis_df['entities'] = ner_results
        # Compute additional statistics
analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()
analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0
        # Build a separate DataFrame for aggregate statistics
emotion_distribution = pd.Series([e['emotion'] for e in emotions]).value_counts().to_dict()
stats_data = {
'total_articles': len(analysis_df),
'articles_analyzed': len(analysis_df),
'articles_with_emotion': len([e for e in emotions if e['emotion'] != 'netral' and e['confidence'] > 0.3]),
'articles_with_summary': analysis_df['has_summary'].sum(),
'avg_emotion_confidence': np.mean([e['confidence'] for e in emotions]),
'total_keywords': sum(len(kw) for kw in keywords),
'avg_content_length': np.mean(analysis_df['content_length']),
'avg_word_count': np.mean(analysis_df['word_count']),
'emotion_distribution': emotion_distribution,
            'total_entities': sum(len(ents) for entities in ner_results for ents in entities.values()),
'summarization_model': 'T5-Indonesian' if self.summarization_model else 'Fallback',
'emotion_model': 'IndoBERT-Emotion' if self.emotion_model else 'Fallback'
}
stats_df = pd.DataFrame([stats_data])
print("Media analysis completed!")
print(f"Emotion distribution: {emotion_distribution}")
return analysis_df, stats_df
def _basic_analysis(self, df, content_column='content'):
"""Analisis dasar jika model NLP gagal load"""
print("Performing basic analysis...")
valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
analysis_df = df[valid_content].copy()
if len(analysis_df) == 0:
return df, pd.DataFrame()
# Basic cleaning
analysis_df['cleaned_content'] = analysis_df[content_column].apply(self.clean_text)
analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()
# Basic emotion analysis fallback
emotions = self._fallback_emotion_analysis(analysis_df['cleaned_content'].tolist())
analysis_df['emotion'] = [e['emotion'] for e in emotions]
analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]
# Basic keyword extraction
keywords = self._fallback_keyword_extraction(analysis_df['cleaned_content'].tolist())
analysis_df['keywords'] = keywords
# Basic summarization fallback
summaries = self._fallback_summarization(analysis_df['cleaned_content'].tolist())
analysis_df['summary'] = summaries
analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0
# Basic stats
stats_data = {
'total_articles': len(analysis_df),
'articles_analyzed': len(analysis_df),
'avg_content_length': analysis_df['content_length'].mean(),
'avg_word_count': analysis_df['word_count'].mean(),
            # Guard: the input DataFrame may not have a 'source' column
            'total_sources': analysis_df['source'].nunique() if 'source' in analysis_df.columns else 0,
'emotion_distribution': analysis_df['emotion'].value_counts().to_dict(),
'articles_with_summary': analysis_df['has_summary'].sum(),
'summarization_model': 'Fallback',
'emotion_model': 'Fallback'
}
stats_df = pd.DataFrame([stats_data])
return analysis_df, stats_df
# Singleton instance
media_analyzer = MediaAnalyzer()
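

if __name__ == '__main__':
    # Minimal usage sketch. Assumptions: network access for the first-time
    # model downloads, and the two sample articles below are hypothetical
    # placeholders; any DataFrame with a text column works as long as
    # content_column points at it.
    sample_df = pd.DataFrame({
        'content': [
            "Pemerintah mengumumkan program makan bergizi gratis untuk pelajar di seluruh Indonesia mulai tahun depan.",
            "Warga menyambut gembira pembukaan taman kota baru di pusat Jakarta pada akhir pekan lalu.",
        ],
        'source': ['media_a', 'media_b'],
    })
    results_df, stats_df = media_analyzer.analyze_media(sample_df, content_column='content')
    print(results_df[['emotion', 'emotion_confidence', 'keywords', 'summary']])
    print(stats_df.T)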