# AIPROJECT / nlp_utils.py
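"""Utilities for an automatic meeting-minutes pipeline: faster-whisper
transcription, BART summarization, Flan-T5 action/decision extraction,
and Markdown minutes rendering."""
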
import datetime
import json
import re
from typing import Dict, List
from transformers import pipeline
from faster_whisper import WhisperModel
# --------- Lazy singletons ---------
_SUMMARIZER = None
_EXTRACTOR = None
_WHISPER = None

def get_summarizer():
    """Lazily load the BART-large-CNN summarization pipeline."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
    return _SUMMARIZER

def get_extractor():
    """Flan-T5 used for JSON-style action/decision extraction via text2text pipeline."""
    global _EXTRACTOR
    if _EXTRACTOR is None:
        _EXTRACTOR = pipeline("text2text-generation", model="google/flan-t5-large", max_new_tokens=256)
    return _EXTRACTOR

def get_whisper(device: str = "auto"):
    global _WHISPER
    if _WHISPER is None:
        # Small multilingual model: works for English + French audio
        _WHISPER = WhisperModel("Systran/faster-whisper-small", device=device, compute_type="int8")
    return _WHISPER

# --------- Core ---------
def transcribe_audio(audio_path: str) -> str:
    """Transcribe an audio file to plain text with faster-whisper."""
    model = get_whisper()
    segments, _info = model.transcribe(audio_path, beam_size=1)
    text = " ".join(seg.text.strip() for seg in segments)
    return text.strip()

def _chunk(text: str, max_chars: int) -> List[str]:
    """Greedily pack whole sentences into chunks of roughly max_chars characters."""
    parts, buf, size = [], [], 0
    for sent in re.split(r'(?<=[\.!\?])\s+', text):
        if size + len(sent) > max_chars and buf:
            parts.append(" ".join(buf))
            buf, size = [], 0
        buf.append(sent)
        size += len(sent) + 1
    if buf:
        parts.append(" ".join(buf))
    return parts
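
# Quick illustration of the greedy sentence packing above (hypothetical input):
#   _chunk("First sentence. Second sentence. Third one.", max_chars=35)
#   -> ["First sentence. Second sentence.", "Third one."]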

def summarize(text: str) -> str:
    """Map-reduce summarization: summarize each chunk, then summarize the merged partials."""
    summarizer = get_summarizer()
    chunks = _chunk(text, 2200)
    # truncation=True guards against inputs that still exceed the model's context window
    partials = [summarizer(ch, do_sample=False, truncation=True)[0]["summary_text"] for ch in chunks]
    merged = " ".join(partials)
    final = summarizer(merged, do_sample=False, max_length=200, min_length=60, truncation=True)[0]["summary_text"]
    return final

def extract_actions_decisions(text: str) -> Dict[str, List[str]]:
    """Ask Flan-T5 for action items and decisions as strict JSON, with a keyword fallback."""
    prompt = f"""You are a meeting note-taking assistant.
From the transcript below, extract:
1) a concise list of "Action Items" (who does what, use infinitive verb, include deadline if mentioned)
2) a list of "Decisions" (short statements)
Return strict JSON with this shape:
{{"actions": ["...","..."], "decisions": ["...","..."]}}
Transcript:
{text[:7000]}
"""
    gen = get_extractor()
    out = gen(prompt)[0]["generated_text"]
    try:
        data = json.loads(out)
        actions = [s.strip() for s in data.get("actions", []) if s.strip()]
        decisions = [s.strip() for s in data.get("decisions", []) if s.strip()]
        return {"actions": actions, "decisions": decisions}
    except Exception:
        # Fallback heuristic if JSON parsing fails: scan the transcript for "action:"/"decision:" lines
        actions, decisions = [], []
        for line in text.splitlines():
            if re.search(r"(?i)\b(action|todo|to do):", line):
                actions.append(re.sub(r"(?i)^.*?:\s*", "", line).strip())
            if re.search(r"(?i)\b(decision|decisions):", line):
                decisions.append(re.sub(r"(?i)^.*?:\s*", "", line).strip())
        return {"actions": actions, "decisions": decisions}

def make_minutes_md(title: str, summary: str, actions: List[str], decisions: List[str]) -> str:
    """Render the summary, action items, and decisions as a Markdown minutes document."""
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    lines = [
        f"# {title} — Minutes",
        f"_Generated on {now}_",
        "",
        "## Summary",
        summary.strip() if summary else "—",
        "",
        "## Action Items",
        *[f"- [ ] {a}" for a in (actions or ["—"])],
        "",
        "## Decisions",
        *[f"- {d}" for d in (decisions or ["—"])],
        "",
    ]
    return "\n".join(lines)
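
# Minimal end-to-end sketch, assuming an audio file at the hypothetical path
# "meeting.wav" and a hypothetical meeting title; adapt both before running.
if __name__ == "__main__":
    transcript = transcribe_audio("meeting.wav")
    summary = summarize(transcript)
    extracted = extract_actions_decisions(transcript)
    print(make_minutes_md("Weekly Sync", summary, extracted["actions"], extracted["decisions"]))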