# =========================
# File: identity_manager.py
# =========================
from __future__ import annotations

from typing import Any, Dict, List, Optional, TYPE_CHECKING

if TYPE_CHECKING:
    from chromadb.api.models.Collection import Collection


class IdentityManager:
    """
    Encapsulates all identity-assignment logic (faces + voices) and its
    projection onto frames, clips, and SRT subtitles.
    """

    def __init__(self,
                 face_collection: Optional["Collection"] = None,
                 voice_collection: Optional["Collection"] = None):
        self.face_collection = face_collection
        self.voice_collection = voice_collection

    # --------------------------- Faces / Frames ---------------------------
    def assign_faces_to_frames(
        self,
        frames: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """
        `frames` is a list of dicts with at least:
        {image_path, start, end, faces: [{embedding?, bbox?}]}

        Returns the same frames with each entry in `faces` enriched with
        `identity` and `distance` when a face collection is available.
        """
        if self.face_collection is None:
            return frames
        out: List[Dict[str, Any]] = []
        for fr in frames:
            faces = fr.get("faces") or []
            enr: List[Dict[str, Any]] = []
            for f in faces:
                emb = f.get("embedding") or f.get("vector")
                if not emb:
                    enr.append(f)
                    continue
                try:
                    q = self.face_collection.query(
                        query_embeddings=[emb],
                        n_results=1,
                        include=["metadatas", "distances"],
                    )  # type: ignore
                    metas = (q.get("metadatas") or [[]])[0]
                    dists = (q.get("distances") or [[]])[0]
                    if metas:
                        md = metas[0] or {}
                        f = dict(f)
                        f["identity"] = md.get("identity") or md.get("name")
                        if dists:
                            f["distance"] = float(dists[0])
                except Exception:
                    pass  # leave the face un-enriched if the lookup fails
                enr.append(f)
            fr2 = dict(fr)
            fr2["faces"] = enr
            out.append(fr2)
        return out

    # --------------------------- Voices / Segments ------------------------
    def assign_voices_to_segments(
        self,
        audio_segments: List[Dict[str, Any]],
        distance_threshold: Optional[float] = None,
    ) -> List[Dict[str, Any]]:
        """
        Adds `voice_vecinos` (nearest neighbours) and `voice_identity` to each
        segment when a voice collection is available. `voice_identity` is only
        set when the closest named neighbour falls within `distance_threshold`
        (or unconditionally when no threshold is given).
        """
        if self.voice_collection is None:
            return audio_segments
        out: List[Dict[str, Any]] = []
        for a in audio_segments:
            emb = a.get("voice_embedding")
            if not emb:
                out.append(a)
                continue
            try:
                q = self.voice_collection.query(
                    query_embeddings=[emb],
                    n_results=3,
                    include=["metadatas", "distances"],
                )  # type: ignore
                metas = (q.get("metadatas") or [[]])[0]
                dists = (q.get("distances") or [[]])[0]
                vecinos = []
                top_id = None
                top_dist = None
                for m, d in zip(metas, dists):
                    name = (m or {}).get("identity") or (m or {}).get("name")
                    vecinos.append({"identity": name, "distance": float(d)})
                    # Keep the closest neighbour that carries a name as the candidate.
                    if top_id is None:
                        top_id, top_dist = name, float(d)
                a2 = dict(a)
                a2["voice_vecinos"] = vecinos
                if top_id is not None:
                    if distance_threshold is None or (top_dist is not None and top_dist <= distance_threshold):
                        a2["voice_identity"] = top_id
                out.append(a2)
            except Exception:
                out.append(a)  # keep the segment untouched if the lookup fails
        return out

    # --------------------------- Map to SRT/Timelines ---------------------
    @staticmethod
    def map_identities_over_ranges(
        per_second_frames: List[Dict[str, Any]],
        ranges: List[Dict[str, Any]],
        key: str = "faces",
        out_key: str = "persona",
    ) -> List[Dict[str, Any]]:
        """
        For each temporal range (keyframes, audio_segments, etc.), aggregates
        who appears according to the per-second frames that overlap it.
        """
        out: List[Dict[str, Any]] = []
        for rng in ranges:
            s, e = float(rng.get("start", 0.0)), float(rng.get("end", 0.0))
            present: List[str] = []
            for fr in per_second_frames:
                fs, fe = float(fr.get("start", 0.0)), float(fr.get("end", 0.0))
                if fe <= s or fs >= e:
                    continue  # no temporal overlap with this range
                for f in fr.get(key) or []:
                    ident = f.get("identity")
                    if ident and ident not in present:
                        present.append(ident)
            r2 = dict(rng)
            r2[out_key] = present
            out.append(r2)
        return out