import pandas as pd
import numpy as np
import re
import warnings

warnings.filterwarnings('ignore')

try:
    from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    from sentence_transformers import SentenceTransformer
    from keybert import KeyBERT
    import torch
except ImportError as e:
    print(f"Error importing NLP libraries: {e}")


class MediaAnalyzer:

    def __init__(self):
        self.tokenizer = None
        self.bert_model = None
        self.embedding_model = None
        self.emotion_tokenizer = None
        self.emotion_model = None
        self.emotion_config = None
        self.summarization_tokenizer = None
        self.summarization_model = None
        self.ner_analyzer = None
        self.keyword_extractor = None
        self.models_loaded = False
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Indonesian emotion labels: happy, sad, angry, afraid, disgusted, surprised.
        # NOTE: this order is assumed to match the checkpoint's id2label mapping;
        # consult the model config if predictions look shuffled.
        self.emotion_labels = [
            'senang', 'sedih', 'marah', 'takut', 'jijik', 'terkejut'
        ]

    def load_models(self):
        """Load all required NLP models."""
        try:
            print("Loading NLP models...")
            print(f"Using device: {self.device}")

            # Indonesian BERT for tokenization and embeddings
            model_name = 'cahya/bert-base-indonesian-1.5G'
            print(f"Loading BERT model: {model_name}")
            self.tokenizer = BertTokenizer.from_pretrained(model_name)
            self.bert_model = BertModel.from_pretrained(model_name)
            self.bert_model.to(self.device)
            self.bert_model.eval()

            # Indonesian emotion classification model
            try:
                print("Loading Indonesian emotion classification model...")
                self.emotion_tokenizer = BertTokenizer.from_pretrained("thoriqfy/indobert-emotion-classification")
                self.emotion_config = BertConfig.from_pretrained("thoriqfy/indobert-emotion-classification")
                self.emotion_model = BertForSequenceClassification.from_pretrained(
                    "thoriqfy/indobert-emotion-classification",
                    config=self.emotion_config
                )
                self.emotion_model.to(self.device)
                self.emotion_model.eval()
                print("Indonesian emotion classification model loaded successfully")
            except Exception as e:
                print(f"Indonesian emotion classification model failed to load: {e}")
                self.emotion_tokenizer = None
                self.emotion_model = None

            # Indonesian T5 summarization model
            try:
                print("Loading Indonesian T5 summarization model...")
                self.summarization_tokenizer = T5Tokenizer.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
                self.summarization_model = T5ForConditionalGeneration.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
                self.summarization_model.to(self.device)
                self.summarization_model.eval()
                print("Indonesian T5 summarization model loaded successfully")
            except Exception as e:
                print(f"Indonesian T5 summarization model failed to load: {e}")
                self.summarization_tokenizer = None
                self.summarization_model = None

            # Multilingual sentence-embedding model
            try:
                self.embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
                if torch.cuda.is_available():
                    self.embedding_model = self.embedding_model.to(self.device)
            except Exception as e:
                print(f"SentenceTransformer failed: {e}")
                self.embedding_model = None

            # Indonesian NER pipeline
            try:
                from transformers import pipeline
                self.ner_analyzer = pipeline(
                    "ner",
                    model="indolem/indobert-base-uncased-ner",
                    tokenizer="indolem/indobert-base-uncased-ner",
                    aggregation_strategy="simple",
                    device=0 if torch.cuda.is_available() else -1
                )
            except Exception as e:
                print(f"NER analyzer failed to load: {e}")
                self.ner_analyzer = None

            # Keyword extraction with KeyBERT
            try:
                self.keyword_extractor = KeyBERT(model='cahya/bert-base-indonesian-1.5G')
            except Exception as e:
                print(f"KeyBERT with IndoBERT failed: {e}, using default")
                try:
                    self.keyword_extractor = KeyBERT()
                except Exception:
                    self.keyword_extractor = None

            self.models_loaded = True
            print("All models loaded successfully!")

        except Exception as e:
            print(f"Error loading models: {e}")
            self.models_loaded = False

    def clean_text(self, text):
        """Basic text cleaning."""
        if pd.isna(text):
            return ""

        text = str(text)

        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)

        # Keep only word characters, whitespace, and basic punctuation
        text = re.sub(r'[^\w\s.,!?]', ' ', text)

        # Collapse repeated whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Remove digits
        text = re.sub(r'\d+', '', text)

        return text

    def preprocess_text(self, texts):
        """Preprocess a batch of texts with the Indonesian BERT tokenizer."""
        cleaned_texts = [self.clean_text(text) for text in texts]

        if self.models_loaded and self.tokenizer is not None:
            try:
                tokenized_texts = []
                for text in cleaned_texts:
                    if text.strip():
                        tokens = self.tokenizer.tokenize(text)
                        tokenized_texts.append(" ".join(tokens))
                    else:
                        tokenized_texts.append("")
                return tokenized_texts
            except Exception as e:
                print(f"Tokenization error: {e}")
                return cleaned_texts
        else:
            return cleaned_texts

    def create_embeddings(self, texts):
        """Create embeddings, preferring SentenceTransformer and falling back to Indonesian BERT (PyTorch)."""
        if not self.models_loaded or self.bert_model is None:
            return None

        try:
            # Prefer the multilingual SentenceTransformer when available
            if self.embedding_model is not None:
                embeddings = self.embedding_model.encode(texts)
                return embeddings

            # Otherwise use the [CLS] vector from Indonesian BERT
            embeddings = []

            with torch.no_grad():
                for text in texts:
                    if text.strip():
                        encoded_input = self.tokenizer(
                            text,
                            return_tensors='pt',
                            max_length=512,
                            truncation=True,
                            padding=True
                        )
                        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}

                        outputs = self.bert_model(**encoded_input)

                        # Use the [CLS] token representation as the sentence embedding
                        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                        embeddings.append(cls_embedding[0])
                    else:
                        embeddings.append(np.zeros(768))

            return np.array(embeddings)

        except Exception as e:
            print(f"Error creating embeddings: {e}")
            return None

    def analyze_emotion(self, texts):
        """Emotion analysis for a collection of texts using the Indonesian model."""
        if not self.models_loaded or self.emotion_model is None:
            # Fall back to keyword matching when the model is unavailable
            return self._fallback_emotion_analysis(texts)

        try:
            emotions = []

            for text in texts:
                if len(text.strip()) > 10:
                    try:
                        inputs = self.emotion_tokenizer(
                            text[:512],
                            return_tensors='pt',
                            max_length=512,
                            truncation=True,
                            padding=True
                        )
                        inputs = {k: v.to(self.device) for k, v in inputs.items()}

                        with torch.no_grad():
                            outputs = self.emotion_model(**inputs)
                            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

                        # Take the highest-probability class and its confidence
                        confidence, predicted_class = torch.max(predictions, dim=1)
                        emotion_idx = predicted_class.cpu().item()
                        confidence_score = confidence.cpu().item()

                        if emotion_idx < len(self.emotion_labels):
                            emotion_label = self.emotion_labels[emotion_idx]
                        else:
                            emotion_label = 'netral'

                        emotions.append({
                            'emotion': emotion_label,
                            'confidence': round(confidence_score, 3),
                            'all_scores': predictions.cpu().numpy()[0]
                        })

                    except Exception as e:
                        print(f"Error in emotion prediction for text: {e}")
                        emotions.append({
                            'emotion': 'netral',
                            'confidence': 0.0,
                            'all_scores': None
                        })
                else:
                    emotions.append({
                        'emotion': 'netral',
                        'confidence': 0.0,
                        'all_scores': None
                    })

            return emotions

        except Exception as e:
            print(f"Error in emotion analysis: {e}")
            return self._fallback_emotion_analysis(texts)

    def _fallback_emotion_analysis(self, texts):
        """Fallback emotion analysis using keyword matching."""
        emotion_keywords = {
            'senang': ['senang', 'bahagia', 'gembira', 'suka', 'bangga', 'puas', 'lega', 'sukacita'],
            'sedih': ['sedih', 'duka', 'pilu', 'kecewa', 'menyesal', 'haru', 'murung', 'nestapa'],
            'marah': ['marah', 'jengkel', 'kesal', 'geram', 'benci', 'dendam', 'gemas', 'berang', 'naik darah'],
            'takut': ['takut', 'khawatir', 'cemas', 'waswas', 'ngeri', 'gentar', 'tremor', 'panik', 'was-was'],
            'jijik': ['jijik', 'muak', 'mual', 'enggan', 'menjijikkan', 'kotor', 'kumuh'],
            'terkejut': ['terkejut', 'kaget', 'heran', 'takjub', 'terperanjat', 'mengagetkan'],
            'netral': ['netral', 'biasa', 'wajar', 'normal', 'lazim', 'umum']
        }

        emotions = []
        for text in texts:
            text_lower = text.lower()
            emotion_scores = {emotion: 0 for emotion in emotion_keywords.keys()}

            # Count keyword hits per emotion
            for emotion, keywords in emotion_keywords.items():
                for keyword in keywords:
                    if keyword in text_lower:
                        emotion_scores[emotion] += 1

            max_score = max(emotion_scores.values())
            if max_score > 0:
                dominant_emotion = max(emotion_scores.items(), key=lambda x: x[1])[0]
                confidence = emotion_scores[dominant_emotion] / sum(emotion_scores.values())
            else:
                dominant_emotion = 'netral'
                confidence = 0.0

            emotions.append({
                'emotion': dominant_emotion,
                'confidence': round(confidence, 3),
                'all_scores': None
            })

        return emotions

    def extract_keywords(self, texts, top_n=5):
        """Extract keywords from texts with KeyBERT backed by Indonesian BERT."""
        if not self.models_loaded or self.keyword_extractor is None:
            return self._fallback_keyword_extraction(texts, top_n)

        try:
            all_keywords = []
            for text in texts:
                if len(text.strip()) > 20:
                    keywords = self.keyword_extractor.extract_keywords(
                        text,
                        keyphrase_ngram_range=(1, 2),
                        stop_words=None,
                        top_n=top_n,
                        use_mmr=True,  # MMR must be enabled for the diversity setting to take effect
                        diversity=0.5
                    )
                    all_keywords.append([kw[0] for kw in keywords])
                else:
                    all_keywords.append([])
            return all_keywords
        except Exception as e:
            print(f"Error in keyword extraction: {e}")
            return self._fallback_keyword_extraction(texts, top_n)

    def _fallback_keyword_extraction(self, texts, top_n=5):
        """Fallback keyword extraction using simple word-frequency counts."""
        from collections import Counter

        # Common Indonesian stopwords
        stopwords = {
            'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'dengan', 'ini', 'itu', 'ada',
            'tidak', 'dalam', 'akan', 'atau', 'juga', 'saya', 'kamu', 'kami', 'kita', 'mereka',
            'adalah', 'sudah', 'belum', 'telah', 'dapat', 'bisa', 'boleh', 'harus', 'perlu',
            'lagi', 'saja', 'hanya', 'sangat', 'sekali', 'lebih', 'paling', 'sementara'
        }

        all_keywords = []
        for text in texts:
            if len(text.strip()) > 10:
                words = re.findall(r'\b\w+\b', text.lower())
                words = [word for word in words if word not in stopwords and len(word) > 2]

                word_counts = Counter(words)
                keywords = [word for word, count in word_counts.most_common(top_n)]
                all_keywords.append(keywords)
            else:
                all_keywords.append([])
        return all_keywords

    def summarize_text(self, texts, max_length=100, min_length=30):
        """Summarize texts using the Indonesian T5 model."""
        if not self.models_loaded or self.summarization_model is None:
            return self._fallback_summarization(texts, max_length)

        try:
            summaries = []
            for text in texts:
                if len(text.strip()) > 100:
                    try:
                        cleaned_text = self.clean_text(text)

                        input_ids = self.summarization_tokenizer.encode(
                            cleaned_text,
                            return_tensors='pt',
                            max_length=512,
                            truncation=True
                        ).to(self.device)

                        summary_ids = self.summarization_model.generate(
                            input_ids,
                            min_length=min_length,
                            max_length=max_length,
                            num_beams=8,
                            repetition_penalty=2.5,
                            length_penalty=1.0,
                            early_stopping=True,
                            no_repeat_ngram_size=2,
                            use_cache=True,
                            do_sample=True,
                            temperature=0.8,
                            top_k=50,
                            top_p=0.95
                        )

                        summary_text = self.summarization_tokenizer.decode(
                            summary_ids[0],
                            skip_special_tokens=True
                        )

                        if summary_text.strip():
                            # Strip stray leading/trailing quotes from the generated summary
                            summary_text = re.sub(r'^["\']|["\']$', '', summary_text)
                            summaries.append(summary_text.strip())
                        else:
                            summaries.append(self._fallback_summarization([text], max_length)[0])

                    except Exception as e:
                        print(f"Error in T5 summarization: {e}")
                        summaries.append(self._fallback_summarization([text], max_length)[0])
                else:
                    # Short texts are returned (truncated) as their own summary
                    summaries.append(text[:max_length])

            return summaries

        except Exception as e:
            print(f"Error in summarization: {e}")
            return self._fallback_summarization(texts, max_length)

    def _fallback_summarization(self, texts, max_length=100):
        """Fallback summarization by extracting the leading sentences."""
        summaries = []
        for text in texts:
            if len(text.strip()) > 100:
                sentences = re.split(r'[.!?]+', text)
                sentences = [s.strip() for s in sentences if s.strip()]

                # Keep at most the first three sentences
                if len(sentences) > 3:
                    summary = ' '.join(sentences[:3])
                else:
                    summary = ' '.join(sentences)

                summaries.append(summary[:max_length])
            else:
                summaries.append(text[:max_length])
        return summaries

    def perform_ner(self, texts):
        """Named Entity Recognition"""
        if not self.models_loaded or self.ner_analyzer is None:
            return self._fallback_ner(texts)

        try:
            all_entities = []
            for text in texts:
                if len(text.strip()) > 10:
                    entities = self.ner_analyzer(text[:512])

                    entity_dict = {}
                    for entity in entities:
                        entity_type = entity['entity_group']
                        if entity_type not in entity_dict:
                            entity_dict[entity_type] = []
                        entity_dict[entity_type].append(entity['word'])
                    all_entities.append(entity_dict)
                else:
                    all_entities.append({})
            return all_entities
        except Exception as e:
            print(f"Error in NER: {e}")
            return self._fallback_ner(texts)

    def _fallback_ner(self, texts):
        """Fallback NER using simple pattern matching."""
        all_entities = []
        for text in texts:
            entities = {}

            # People: two consecutive capitalized words
            people = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
            if people:
                entities['PER'] = list(set(people))

            # Organizations: a known prefix followed by a capitalized word
            org_keywords = ['PT', 'CV', 'Inc', 'Corp', 'Company', 'Perusahaan', 'Universitas', 'Institut']
            organizations = []
            for word in org_keywords:
                orgs = re.findall(rf'\b{word}\.?\s+[A-Za-z]+\b', text)
                organizations.extend(orgs)
            if organizations:
                entities['ORG'] = organizations

            # Locations: single capitalized words, or multi-word names from a known list
            locations = re.findall(r'\b[A-Z][a-z]+(?: [A-Z][a-z]+)*\b', text)
            common_locations = ['Indonesia', 'Jakarta', 'Surabaya', 'Bandung', 'Medan', 'Semarang']
            locations = [loc for loc in locations if len(loc.split()) == 1 or loc in common_locations]
            if locations:
                entities['LOC'] = locations

            all_entities.append(entities)

        return all_entities

    def analyze_media(self, df, content_column='content'):
        """Full media analysis pipeline."""
        print("Starting media analysis pipeline...")

        if not self.models_loaded:
            self.load_models()

        if not self.models_loaded:
            print("Models failed to load, using basic analysis")
            return self._basic_analysis(df, content_column)

        # Keep only rows with usable content
        valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
        analysis_df = df[valid_content].copy()

        if len(analysis_df) == 0:
            print("No valid content to analyze")
            return df, pd.DataFrame()

        print(f"Analyzing {len(analysis_df)} articles...")

        texts = analysis_df[content_column].tolist()

        print("Step 1: Cleaning and preprocessing text...")
        cleaned_texts = self.preprocess_text(texts)

        print("Step 2: Creating embeddings...")
        # Limit embedding computation to the first 100 texts to keep runtime bounded
        # (the embeddings are computed here but not attached to the dataframe)
        if len(cleaned_texts) > 100:
            sample_texts = cleaned_texts[:100]
            embeddings = self.create_embeddings(sample_texts)
        else:
            embeddings = self.create_embeddings(cleaned_texts)

        print("Step 3: Analyzing emotions with Indonesian model...")
        emotions = self.analyze_emotion(cleaned_texts)

        print("Step 4: Extracting keywords...")
        keywords = self.extract_keywords(cleaned_texts)

        print("Step 5: Generating summaries with T5 model...")
        long_text_indices = [i for i, text in enumerate(cleaned_texts) if len(text) > 100]
        summaries = [""] * len(cleaned_texts)

        if long_text_indices and self.summarization_model:
            print(f"Generating summaries for {len(long_text_indices)} long texts...")
            long_texts = [cleaned_texts[i] for i in long_text_indices]
            try:
                long_summaries = self.summarize_text(long_texts, max_length=80, min_length=20)
                for i, summary in zip(long_text_indices, long_summaries):
                    summaries[i] = summary
                print("T5 summarization completed successfully")
            except Exception as e:
                print(f"T5 summarization failed: {e}")
                fallback_summaries = self._fallback_summarization(long_texts, 80)
                for i, summary in zip(long_text_indices, fallback_summaries):
                    summaries[i] = summary
        else:
            print("No long texts found for summarization")

        print("Step 6: Performing NER...")
        # Limit NER to the first 50 texts and pad the rest with empty results
        if len(cleaned_texts) > 50:
            ner_texts = cleaned_texts[:50]
            ner_results = self.perform_ner(ner_texts)
            ner_results.extend([{}] * (len(cleaned_texts) - len(ner_results)))
        else:
            ner_results = self.perform_ner(cleaned_texts)

        # Attach analysis results to the dataframe
        analysis_df = analysis_df.reset_index(drop=True)

        analysis_df['cleaned_content'] = cleaned_texts
        analysis_df['emotion'] = [e['emotion'] for e in emotions]
        analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]
        analysis_df['keywords'] = keywords
        analysis_df['summary'] = summaries
        analysis_df['entities'] = ner_results

        analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
        analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()
        analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0

        # Aggregate statistics
        emotion_distribution = pd.Series([e['emotion'] for e in emotions]).value_counts().to_dict()

        stats_data = {
            'total_articles': len(analysis_df),
            'articles_analyzed': len(analysis_df),
            'articles_with_emotion': len([e for e in emotions if e['emotion'] != 'netral' and e['confidence'] > 0.3]),
            'articles_with_summary': analysis_df['has_summary'].sum(),
            'avg_emotion_confidence': np.mean([e['confidence'] for e in emotions]),
            'total_keywords': sum(len(kw) for kw in keywords),
            'avg_content_length': np.mean(analysis_df['content_length']),
            'avg_word_count': np.mean(analysis_df['word_count']),
            'emotion_distribution': emotion_distribution,
            'total_entities': sum(len(entities) for entities in ner_results),
            'summarization_model': 'T5-Indonesian' if self.summarization_model else 'Fallback',
            'emotion_model': 'IndoBERT-Emotion' if self.emotion_model else 'Fallback'
        }

        stats_df = pd.DataFrame([stats_data])

        print("Media analysis completed!")
        print(f"Emotion distribution: {emotion_distribution}")
        return analysis_df, stats_df

    def _basic_analysis(self, df, content_column='content'):
        """Basic analysis used when the NLP models fail to load."""
        print("Performing basic analysis...")

        valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
        analysis_df = df[valid_content].copy()

        if len(analysis_df) == 0:
            return df, pd.DataFrame()

        analysis_df['cleaned_content'] = analysis_df[content_column].apply(self.clean_text)
        analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
        analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()

        # Keyword-matching emotion fallback
        emotions = self._fallback_emotion_analysis(analysis_df['cleaned_content'].tolist())
        analysis_df['emotion'] = [e['emotion'] for e in emotions]
        analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]

        # Frequency-based keyword fallback
        keywords = self._fallback_keyword_extraction(analysis_df['cleaned_content'].tolist())
        analysis_df['keywords'] = keywords

        # Leading-sentence summary fallback
        summaries = self._fallback_summarization(analysis_df['cleaned_content'].tolist())
        analysis_df['summary'] = summaries
        analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0

        stats_data = {
            'total_articles': len(analysis_df),
            'articles_analyzed': len(analysis_df),
            'avg_content_length': analysis_df['content_length'].mean(),
            'avg_word_count': analysis_df['word_count'].mean(),
            'total_sources': analysis_df['source'].nunique() if 'source' in analysis_df.columns else 0,
            'emotion_distribution': analysis_df['emotion'].value_counts().to_dict(),
            'articles_with_summary': analysis_df['has_summary'].sum(),
            'summarization_model': 'Fallback',
            'emotion_model': 'Fallback'
        }

        stats_df = pd.DataFrame([stats_data])

        return analysis_df, stats_df


media_analyzer = MediaAnalyzer()
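
# Minimal usage sketch (illustrative only, not part of the original pipeline):
# the sample articles below are made up, and the 'content' / 'source' column
# names are assumptions about the expected input data. Running this downloads
# several model checkpoints on first use, so it is guarded behind __main__.
if __name__ == "__main__":
    sample_df = pd.DataFrame({
        'content': [
            "Pemerintah mengumumkan kebijakan baru yang disambut gembira oleh warga Jakarta. "
            "Banyak pihak menyatakan puas dan bangga dengan langkah tersebut.",
            "Banjir besar melanda beberapa wilayah dan membuat warga sedih serta khawatir "
            "akan kerusakan yang lebih parah dalam beberapa hari ke depan."
        ],
        'source': ['contoh-media-a', 'contoh-media-b'],
    })

    # Returns the per-article analysis and a one-row statistics dataframe
    results_df, stats_df = media_analyzer.analyze_media(sample_df, content_column='content')
    print(results_df[['emotion', 'emotion_confidence', 'keywords', 'summary']])
    print(stats_df.T)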