# analisis_media.py
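"""Media analysis pipeline for Indonesian-language articles.

Wraps Indonesian BERT and T5 models (with keyword- and pattern-based
fallbacks) for emotion classification, keyword extraction, summarization,
and named-entity recognition over a pandas DataFrame of article content.
"""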
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
# NLP / deep-learning libraries
try:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import torch
except ImportError as e:
    # torch and transformers are hard dependencies: MediaAnalyzer.__init__
    # below calls torch.device(), so fail fast with a clear message instead
    # of a confusing NameError later.
    print(f"Error importing NLP libraries: {e}")
    raise
class MediaAnalyzer:
def __init__(self):
self.tokenizer = None
self.bert_model = None
self.embedding_model = None
self.emotion_tokenizer = None
self.emotion_model = None
self.emotion_config = None
self.summarization_tokenizer = None
self.summarization_model = None
self.ner_analyzer = None
self.keyword_extractor = None
self.models_loaded = False
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Emotion labels for the Indonesian emotion model.
        # NOTE: this hard-coded order is assumed to match the label order in
        # the model's config (id2label); if it differs, predictions will be
        # mislabeled.
        self.emotion_labels = [
            'senang', 'sedih', 'marah', 'takut', 'jijik', 'terkejut'
        ]
def load_models(self):
"""Load semua model NLP yang diperlukan"""
try:
print("Loading NLP models...")
print(f"Using device: {self.device}")
            # Indonesian BERT for tokenization and embeddings
model_name = 'cahya/bert-base-indonesian-1.5G'
print(f"Loading BERT model: {model_name}")
self.tokenizer = BertTokenizer.from_pretrained(model_name)
self.bert_model = BertModel.from_pretrained(model_name)
self.bert_model.to(self.device)
            self.bert_model.eval()  # evaluation mode (disables dropout)
            # Indonesian emotion classification model
try:
print("Loading Indonesian emotion classification model...")
self.emotion_tokenizer = BertTokenizer.from_pretrained("thoriqfy/indobert-emotion-classification")
self.emotion_config = BertConfig.from_pretrained("thoriqfy/indobert-emotion-classification")
self.emotion_model = BertForSequenceClassification.from_pretrained(
"thoriqfy/indobert-emotion-classification",
config=self.emotion_config
)
self.emotion_model.to(self.device)
self.emotion_model.eval()
print("βœ… Indonesian emotion classification model loaded successfully")
except Exception as e:
print(f"❌ Indonesian emotion classification model failed to load: {e}")
self.emotion_tokenizer = None
self.emotion_model = None
            # Indonesian T5 summarization model
try:
print("Loading Indonesian T5 summarization model...")
self.summarization_tokenizer = T5Tokenizer.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
self.summarization_model = T5ForConditionalGeneration.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
self.summarization_model.to(self.device)
self.summarization_model.eval()
print("βœ… Indonesian T5 summarization model loaded successfully")
except Exception as e:
print(f"❌ Indonesian T5 summarization model failed to load: {e}")
self.summarization_tokenizer = None
self.summarization_model = None
            # Fallback embedding model (used if the primary BERT path fails)
try:
self.embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
if torch.cuda.is_available():
self.embedding_model = self.embedding_model.to(self.device)
except Exception as e:
print(f"SentenceTransformer failed: {e}")
self.embedding_model = None
            # NER pipeline with an Indonesian model
try:
from transformers import pipeline
self.ner_analyzer = pipeline(
"ner",
model="indolem/indobert-base-uncased-ner",
tokenizer="indolem/indobert-base-uncased-ner",
aggregation_strategy="simple",
device=0 if torch.cuda.is_available() else -1
)
except Exception as e:
print(f"NER analyzer failed to load: {e}")
self.ner_analyzer = None
            # Keyword extraction with KeyBERT, backed by the Indonesian BERT
            try:
                self.keyword_extractor = KeyBERT(model='cahya/bert-base-indonesian-1.5G')
            except Exception as e:
                print(f"KeyBERT with IndoBERT failed: {e}, using default")
                try:
                    self.keyword_extractor = KeyBERT()
                except Exception:
                    self.keyword_extractor = None
self.models_loaded = True
print("All models loaded successfully!")
except Exception as e:
print(f"Error loading models: {e}")
self.models_loaded = False
    def clean_text(self, text):
        """Basic text cleaning: strip URLs, mentions, digits, and noise."""
        if pd.isna(text):
            return ""
        text = str(text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)
        # Remove special characters (keep word chars and basic punctuation)
        text = re.sub(r'[^\w\s.,!?]', ' ', text)
        # Remove digits before collapsing whitespace, so no double spaces remain
        text = re.sub(r'\d+', '', text)
        # Collapse extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text
def preprocess_text(self, texts):
"""Preprocessing batch text dengan tokenizer BERT Indonesia"""
cleaned_texts = [self.clean_text(text) for text in texts]
        # Tokenize with the BERT tokenizer (note: WordPiece pieces keep their '##' prefixes)
if self.models_loaded and self.tokenizer is not None:
try:
tokenized_texts = []
for text in cleaned_texts:
if text.strip():
tokens = self.tokenizer.tokenize(text)
tokenized_texts.append(" ".join(tokens))
else:
tokenized_texts.append("")
return tokenized_texts
except Exception as e:
print(f"Tokenization error: {e}")
return cleaned_texts
else:
return cleaned_texts
def create_embeddings(self, texts):
"""Membuat embedding menggunakan BERT Indonesia dengan PyTorch"""
if not self.models_loaded or self.bert_model is None:
return None
try:
# Method 1: Gunakan SentenceTransformer jika tersedia
if self.embedding_model is not None:
embeddings = self.embedding_model.encode(texts)
return embeddings
# Method 2: Gunakan BERT Indonesia langsung dengan PyTorch
embeddings = []
with torch.no_grad(): # Nonaktifkan gradient calculation untuk inference
for text in texts:
if text.strip():
# Tokenize dan encode text
encoded_input = self.tokenizer(
text,
return_tensors='pt',
max_length=512,
truncation=True,
padding=True
)
encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
# Dapatkan embeddings dari BERT
outputs = self.bert_model(**encoded_input)
# Gunakan [CLS] token embedding sebagai representasi dokumen
cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
embeddings.append(cls_embedding[0])
else:
embeddings.append(np.zeros(768)) # Default size untuk BERT base
return np.array(embeddings)
except Exception as e:
print(f"Error creating embeddings: {e}")
return None
def analyze_emotion(self, texts):
"""Analisis emosi untuk kumpulan teks menggunakan model Indonesia"""
if not self.models_loaded or self.emotion_model is None:
            # Fall back to keyword-based emotion analysis
return self._fallback_emotion_analysis(texts)
try:
emotions = []
for text in texts:
                if len(text.strip()) > 10:  # require a minimum text length
                    try:
                        # Tokenize the text
                        inputs = self.emotion_tokenizer(
                            text[:512],  # truncate to the model's input limit
                            return_tensors='pt',
                            max_length=512,
                            truncation=True,
                            padding=True
                        )
                        inputs = {k: v.to(self.device) for k, v in inputs.items()}
                        # Predict the emotion distribution
                        with torch.no_grad():
                            outputs = self.emotion_model(**inputs)
                            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                        # Get the predicted class and its confidence
                        confidence, predicted_class = torch.max(predictions, dim=1)
                        emotion_idx = predicted_class.cpu().item()
                        confidence_score = confidence.cpu().item()
                        # Map the class index to an emotion label; the order in
                        # self.emotion_labels is assumed to match the model config
                        if emotion_idx < len(self.emotion_labels):
                            emotion_label = self.emotion_labels[emotion_idx]
                        else:
                            emotion_label = 'netral'
emotions.append({
'emotion': emotion_label,
'confidence': round(confidence_score, 3),
'all_scores': predictions.cpu().numpy()[0]
})
except Exception as e:
print(f"Error in emotion prediction for text: {e}")
emotions.append({
'emotion': 'netral',
'confidence': 0.0,
'all_scores': None
})
else:
emotions.append({
'emotion': 'netral',
'confidence': 0.0,
'all_scores': None
})
return emotions
except Exception as e:
print(f"Error in emotion analysis: {e}")
return self._fallback_emotion_analysis(texts)
    def _fallback_emotion_analysis(self, texts):
        """Fallback emotion analysis via Indonesian keyword matching."""
        emotion_keywords = {
            'senang': ['senang', 'bahagia', 'gembira', 'suka', 'bangga', 'puas', 'lega', 'sukacita'],
            'sedih': ['sedih', 'duka', 'pilu', 'kecewa', 'menyesal', 'haru', 'murung', 'nestapa'],
            'marah': ['marah', 'jengkel', 'kesal', 'geram', 'benci', 'dendam', 'gemas', 'berang', 'naik darah'],
            'takut': ['takut', 'khawatir', 'cemas', 'waswas', 'ngeri', 'gentar', 'tremor', 'panik', 'was-was'],
            'jijik': ['jijik', 'muak', 'mual', 'enggan', 'menjijikkan', 'kotor', 'kumuh'],
            'terkejut': ['terkejut', 'kaget', 'heran', 'takjub', 'terperanjat', 'mengagetkan'],
            'netral': ['netral', 'biasa', 'wajar', 'normal', 'lazim', 'umum']
        }
emotions = []
for text in texts:
text_lower = text.lower()
emotion_scores = {emotion: 0 for emotion in emotion_keywords.keys()}
for emotion, keywords in emotion_keywords.items():
for keyword in keywords:
if keyword in text_lower:
emotion_scores[emotion] += 1
            # Pick the emotion with the highest score
max_score = max(emotion_scores.values())
if max_score > 0:
dominant_emotion = max(emotion_scores.items(), key=lambda x: x[1])[0]
confidence = emotion_scores[dominant_emotion] / sum(emotion_scores.values())
else:
dominant_emotion = 'netral'
confidence = 0.0
emotions.append({
'emotion': dominant_emotion,
'confidence': round(confidence, 3),
'all_scores': None
})
return emotions
def extract_keywords(self, texts, top_n=5):
"""Ekstraksi keyword dari teks menggunakan KeyBERT dengan BERT Indonesia"""
if not self.models_loaded or self.keyword_extractor is None:
return self._fallback_keyword_extraction(texts, top_n)
try:
all_keywords = []
for text in texts:
if len(text.strip()) > 20:
                    keywords = self.keyword_extractor.extract_keywords(
                        text,
                        keyphrase_ngram_range=(1, 2),
                        stop_words=None,
                        top_n=top_n,
                        use_mmr=True,  # MMR must be enabled for diversity to take effect
                        diversity=0.5
                    )
all_keywords.append([kw[0] for kw in keywords])
else:
all_keywords.append([])
return all_keywords
except Exception as e:
print(f"Error in keyword extraction: {e}")
return self._fallback_keyword_extraction(texts, top_n)
    def _fallback_keyword_extraction(self, texts, top_n=5):
        """Fallback keyword extraction using simple frequency counts."""
        from collections import Counter
# Indonesian stopwords
stopwords = {
'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'dengan', 'ini', 'itu', 'ada',
'tidak', 'dalam', 'akan', 'atau', 'juga', 'saya', 'kamu', 'kami', 'kita', 'mereka',
'adalah', 'sudah', 'belum', 'telah', 'dapat', 'bisa', 'boleh', 'harus', 'perlu',
'lagi', 'saja', 'hanya', 'sangat', 'sekali', 'lebih', 'paling', 'sementara'
}
all_keywords = []
for text in texts:
if len(text.strip()) > 10:
                # Simple word tokenization
                words = re.findall(r'\b\w+\b', text.lower())
                # Drop stopwords and very short tokens
                words = [word for word in words if word not in stopwords and len(word) > 2]
                # Keep the most frequent words as keywords
                word_counts = Counter(words)
                keywords = [word for word, count in word_counts.most_common(top_n)]
all_keywords.append(keywords)
else:
all_keywords.append([])
return all_keywords
    def summarize_text(self, texts, max_length=100, min_length=30):
        """Summarize texts with the Indonesian T5 model."""
if not self.models_loaded or self.summarization_model is None:
return self._fallback_summarization(texts, max_length)
try:
summaries = []
for text in texts:
if len(text.strip()) > 100:
try:
                        # Clean the text before summarization
cleaned_text = self.clean_text(text)
                        # Encode the input text
input_ids = self.summarization_tokenizer.encode(
cleaned_text,
return_tensors='pt',
max_length=512,
truncation=True
).to(self.device)
                        # Generate the summary; note that do_sample=True combined
                        # with num_beams=8 performs beam-search multinomial sampling
summary_ids = self.summarization_model.generate(
input_ids,
min_length=min_length,
max_length=max_length,
num_beams=8,
repetition_penalty=2.5,
length_penalty=1.0,
early_stopping=True,
no_repeat_ngram_size=2,
use_cache=True,
do_sample=True,
temperature=0.8,
top_k=50,
top_p=0.95
)
# Decode summary
summary_text = self.summarization_tokenizer.decode(
summary_ids[0],
skip_special_tokens=True
)
                        # Post-process the summary
                        if summary_text.strip():
                            # Strip surrounding quote marks, if any
                            summary_text = re.sub(r'^["\']|["\']$', '', summary_text)
summaries.append(summary_text.strip())
else:
summaries.append(self._fallback_summarization([text], max_length)[0])
except Exception as e:
print(f"Error in T5 summarization: {e}")
summaries.append(self._fallback_summarization([text], max_length)[0])
else:
                    # For short texts, keep the original text (truncated)
summaries.append(text[:max_length])
return summaries
except Exception as e:
print(f"Error in summarization: {e}")
return self._fallback_summarization(texts, max_length)
    def _fallback_summarization(self, texts, max_length=100):
        """Fallback summarization: extract the leading sentences."""
summaries = []
for text in texts:
if len(text.strip()) > 100:
                # Split into sentences
                sentences = re.split(r'[.!?]+', text)
                sentences = [s.strip() for s in sentences if s.strip()]
                # Use the first three sentences as a simple extractive summary
if len(sentences) > 3:
summary = ' '.join(sentences[:3])
else:
summary = ' '.join(sentences)
summaries.append(summary[:max_length])
else:
summaries.append(text[:max_length])
return summaries
def perform_ner(self, texts):
"""Named Entity Recognition"""
if not self.models_loaded or self.ner_analyzer is None:
return self._fallback_ner(texts)
try:
all_entities = []
for text in texts:
if len(text.strip()) > 10:
entities = self.ner_analyzer(text[:512])
# Group entities by type
entity_dict = {}
for entity in entities:
entity_type = entity['entity_group']
if entity_type not in entity_dict:
entity_dict[entity_type] = []
entity_dict[entity_type].append(entity['word'])
all_entities.append(entity_dict)
else:
all_entities.append({})
return all_entities
except Exception as e:
print(f"Error in NER: {e}")
return self._fallback_ner(texts)
    def _fallback_ner(self, texts):
        """Fallback NER using simple pattern matching."""
        all_entities = []
        for text in texts:
            entities = {}
            # Simple patterns for common entity types.
            # People: two consecutive capitalized words (first + last name)
people = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
if people:
entities['PER'] = list(set(people))
            # Organizations: phrases containing organizational keywords
org_keywords = ['PT', 'CV', 'Inc', 'Corp', 'Company', 'Perusahaan', 'Universitas', 'Institut']
organizations = []
for word in org_keywords:
orgs = re.findall(rf'\b{word}\.?\s+[A-Za-z]+\b', text)
organizations.extend(orgs)
if organizations:
entities['ORG'] = organizations
            # Locations: capitalized words that may be place names
            locations = re.findall(r'\b[A-Z][a-z]+(?: [A-Z][a-z]+)*\b', text)
            # Filter out likely person names (two capitalized words), keeping
            # single-word matches and known location names
            common_locations = ['Indonesia', 'Jakarta', 'Surabaya', 'Bandung', 'Medan', 'Semarang']
locations = [loc for loc in locations if len(loc.split()) == 1 or loc in common_locations]
if locations:
entities['LOC'] = locations
all_entities.append(entities)
return all_entities
def analyze_media(self, df, content_column='content'):
"""Pipeline analisis media lengkap"""
print("Starting media analysis pipeline...")
        # Load the models if they have not been loaded yet
if not self.models_loaded:
self.load_models()
if not self.models_loaded:
print("Models failed to load, using basic analysis")
return self._basic_analysis(df, content_column)
        # Keep only rows that have usable content
valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
analysis_df = df[valid_content].copy()
if len(analysis_df) == 0:
print("No valid content to analyze")
return df, pd.DataFrame()
print(f"Analyzing {len(analysis_df)} articles...")
        # Analysis pipeline
texts = analysis_df[content_column].tolist()
        # 1. Clean and preprocess the text
print("Step 1: Cleaning and preprocessing text...")
cleaned_texts = self.preprocess_text(texts)
        # 2. Embeddings (sampled for efficiency on large datasets).
        # Note: the embeddings are computed here but not attached to the
        # results DataFrame; they are left available for downstream use
        # such as clustering or similarity search.
        print("Step 2: Creating embeddings...")
        if len(cleaned_texts) > 100:
            sample_texts = cleaned_texts[:100]
            embeddings = self.create_embeddings(sample_texts)
        else:
            embeddings = self.create_embeddings(cleaned_texts)
        # 3. Emotion analysis with the Indonesian model
print("Step 3: Analyzing emotions with Indonesian model...")
emotions = self.analyze_emotion(cleaned_texts)
# 4. Keyword Extraction
print("Step 4: Extracting keywords...")
keywords = self.extract_keywords(cleaned_texts)
        # 5. Summarization with the Indonesian T5 model
print("Step 5: Generating summaries with T5 model...")
long_text_indices = [i for i, text in enumerate(cleaned_texts) if len(text) > 100]
summaries = [""] * len(cleaned_texts)
if long_text_indices and self.summarization_model:
print(f"Generating summaries for {len(long_text_indices)} long texts...")
long_texts = [cleaned_texts[i] for i in long_text_indices]
try:
long_summaries = self.summarize_text(long_texts, max_length=80, min_length=20)
for i, summary in zip(long_text_indices, long_summaries):
summaries[i] = summary
print("βœ… T5 summarization completed successfully")
except Exception as e:
print(f"❌ T5 summarization failed: {e}")
                # Fall back for the batch of texts that failed
fallback_summaries = self._fallback_summarization(long_texts, 80)
for i, summary in zip(long_text_indices, fallback_summaries):
summaries[i] = summary
else:
print("No long texts found for summarization")
        # 6. NER (sampled for efficiency on large datasets)
print("Step 6: Performing NER...")
if len(cleaned_texts) > 50:
ner_texts = cleaned_texts[:50]
ner_results = self.perform_ner(ner_texts)
            # Pad with empty dicts so the list covers all texts
ner_results.extend([{}] * (len(cleaned_texts) - len(ner_results)))
else:
ner_results = self.perform_ner(cleaned_texts)
        # Attach the analysis results to the DataFrame
analysis_df = analysis_df.reset_index(drop=True)
analysis_df['cleaned_content'] = cleaned_texts
analysis_df['emotion'] = [e['emotion'] for e in emotions]
analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]
analysis_df['keywords'] = keywords
analysis_df['summary'] = summaries
analysis_df['entities'] = ner_results
        # Compute additional statistics
analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()
analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0
        # Build a separate DataFrame for aggregate statistics
emotion_distribution = pd.Series([e['emotion'] for e in emotions]).value_counts().to_dict()
stats_data = {
'total_articles': len(analysis_df),
'articles_analyzed': len(analysis_df),
'articles_with_emotion': len([e for e in emotions if e['emotion'] != 'netral' and e['confidence'] > 0.3]),
'articles_with_summary': analysis_df['has_summary'].sum(),
'avg_emotion_confidence': np.mean([e['confidence'] for e in emotions]),
'total_keywords': sum(len(kw) for kw in keywords),
'avg_content_length': np.mean(analysis_df['content_length']),
'avg_word_count': np.mean(analysis_df['word_count']),
'emotion_distribution': emotion_distribution,
            'total_entities': sum(len(ents) for entities in ner_results for ents in entities.values()),
'summarization_model': 'T5-Indonesian' if self.summarization_model else 'Fallback',
'emotion_model': 'IndoBERT-Emotion' if self.emotion_model else 'Fallback'
}
stats_df = pd.DataFrame([stats_data])
print("Media analysis completed!")
print(f"Emotion distribution: {emotion_distribution}")
return analysis_df, stats_df
def _basic_analysis(self, df, content_column='content'):
"""Analisis dasar jika model NLP gagal load"""
print("Performing basic analysis...")
valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
analysis_df = df[valid_content].copy()
if len(analysis_df) == 0:
return df, pd.DataFrame()
# Basic cleaning
analysis_df['cleaned_content'] = analysis_df[content_column].apply(self.clean_text)
analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()
# Basic emotion analysis fallback
emotions = self._fallback_emotion_analysis(analysis_df['cleaned_content'].tolist())
analysis_df['emotion'] = [e['emotion'] for e in emotions]
analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]
# Basic keyword extraction
keywords = self._fallback_keyword_extraction(analysis_df['cleaned_content'].tolist())
analysis_df['keywords'] = keywords
# Basic summarization fallback
summaries = self._fallback_summarization(analysis_df['cleaned_content'].tolist())
analysis_df['summary'] = summaries
analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0
# Basic stats
stats_data = {
'total_articles': len(analysis_df),
'articles_analyzed': len(analysis_df),
'avg_content_length': analysis_df['content_length'].mean(),
'avg_word_count': analysis_df['word_count'].mean(),
            # Guard: the input DataFrame may not have a 'source' column
            'total_sources': analysis_df['source'].nunique() if 'source' in analysis_df.columns else 0,
'emotion_distribution': analysis_df['emotion'].value_counts().to_dict(),
'articles_with_summary': analysis_df['has_summary'].sum(),
'summarization_model': 'Fallback',
'emotion_model': 'Fallback'
}
stats_df = pd.DataFrame([stats_data])
return analysis_df, stats_df
# Singleton instance
media_analyzer = MediaAnalyzer()
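

if __name__ == '__main__':
    # Minimal usage sketch. Assumptions: network access for the first-time
    # model downloads, and the two sample articles below are hypothetical
    # placeholders; any DataFrame with a text column works as long as
    # content_column points at it.
    sample_df = pd.DataFrame({
        'content': [
            "Pemerintah mengumumkan program makan bergizi gratis untuk pelajar di seluruh Indonesia mulai tahun depan.",
            "Warga menyambut gembira pembukaan taman kota baru di pusat Jakarta pada akhir pekan lalu.",
        ],
        'source': ['media_a', 'media_b'],
    })
    results_df, stats_df = media_analyzer.analyze_media(sample_df, content_column='content')
    print(results_df[['emotion', 'emotion_confidence', 'keywords', 'summary']])
    print(stats_df.T)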