# app/ingest.py
from __future__ import annotations
import json
from pathlib import Path
from typing import Dict, List, Any, Tuple, Optional
import yaml
import numpy as np
from sentence_transformers import SentenceTransformer
from app.paths import DOCSTORE_DIR, INDEX_DIR
from .normalize import normalize # ← central normalizer
import re
import time
import hashlib
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from difflib import SequenceMatcher
from urllib.parse import urljoin
from app.tags import infer_matt25_tags # ← NEW
# -------------------- Config --------------------
def load_config(cfg_path: str) -> Dict:
with open(cfg_path, "r", encoding="utf-8") as f:
return yaml.safe_load(f)
# -------------------- Capacity / Geo Filters (config-driven) --------------------
# controls live in config/sources.yaml:
# filters:
# capacity_only: true
# pa_md_only: false
_INCLUDE_PATTERNS = [re.compile(p, re.I) for p in [
r"\bcapacity(?:[-\s]?building)?\b",
r"\btechnical\s+assistance\b",
r"\bTA\b",
r"\borganizational\s+(capacity|effectiveness|development|readiness|stabilization)\b",
r"\borganization(?:al)?\s+infrastructure\b",
r"\bback[-\s]?office\b|\bbackbone\s+organization\b",
r"\bgovernance\b|\bboard\s+development\b|\bboard\s+training\b",
r"\bpre[-\s]?development\b|\bpredevelopment\b|\bplanning\s+grant\b",
r"\bdata\s+systems?\b|\bCRM\b|\bcase\s+management\b",
r"\b(staff|workforce)\s+capacity\b|\bhire\s+(?:staff|positions?)\b",
r"\bscal(?:e|ing)\s+capacity\b|\bexpand\s+capacity\b",
r"\bnonprofit\b|\bfaith[-\s]?based\b|\bcommunity[-\s]?based\b",
# broaden for transportation / human services
r"\bprovider\s+capacity\b|\bservice\s+capacity\b|\borganizational\s+support\b",
]]
_EXCLUDE_PATTERNS = [re.compile(p, re.I) for p in [
r"\bteaching\s+assistant\b|\bTAs\b",
r"\bbench\s+capacity\b|\bmanufacturing\s+capacity\b(?!.*organiz)",
r"\bclinical\s+trial\b|\blaboratory\s+capacity\b(?!.*community)",
r"\b(postsecondary|university|college)\b(?!.*community\s+partner)",
r"\bconstruction\b(?!.*(admin|organiz|back[-\s]?office|governance|systems))",
]]
_PA_MD_HINTS = re.compile(
r"\b("
r"Pennsylvania|PA\b|Harrisburg|Philadelphia|Allegheny|Montgomery County\b|Pittsburgh|Scranton|Erie|"
r"Maryland|MD\b|Annapolis|Baltimore|Prince\s+George'?s|Howard County\b"
r")\b",
re.I,
)
def _doc_text_from_row(rec: Dict[str, Any]) -> str:
title = rec.get("title") or ""
synopsis = rec.get("synopsis") or rec.get("summary") or ""
agency = rec.get("agency") or ""
eligibility = rec.get("eligibility") or ""
categories = " ".join(rec.get("categories") or []) if isinstance(rec.get("categories"), list) else (rec.get("categories") or "")
geo = rec.get("geo") or ""
return "\n".join([title, synopsis, agency, eligibility, categories, geo]).strip()
def _is_capacity_building_text(text: str) -> bool:
if not text:
return False
if any(p.search(text) for p in _EXCLUDE_PATTERNS):
return False
return any(p.search(text) for p in _INCLUDE_PATTERNS)
def _is_pa_md_text(text: str) -> bool:
if not text:
return False
return bool(_PA_MD_HINTS.search(text))
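# Illustrative filter behavior (a sketch; actual results depend on the full pattern lists above):
#   _is_capacity_building_text("Technical assistance for nonprofit board development")  -> True
#   _is_capacity_building_text("Hiring a teaching assistant for the fall semester")     -> False  (exclude pattern)
#   _is_pa_md_text("Open to organizations serving Harrisburg, Pennsylvania")            -> True
#   _is_pa_md_text("Statewide program in Ohio")                                         -> False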
# -------------------- Deadline parsing helpers --------------------
# Common date shapes: "October 15, 2025", "Oct 15, 2025", "10/15/2025", "2025-10-15"
_MONTHS = {
"january":1, "jan":1, "february":2, "feb":2, "march":3, "mar":3, "april":4, "apr":4,
"may":5, "june":6, "jun":6, "july":7, "jul":7, "august":8, "aug":8, "september":9, "sep":9, "sept":9,
"october":10, "oct":10, "november":11, "nov":11, "december":12, "dec":12
}
# Capture label + date snippets
_DEADLINE_LINES = re.compile(
r"(deadline|applications?\s+due|closing\s+date|due\s+date|submission\s+deadline)\s*[:\-]?\s*(.+)",
re.I
)
# Capture absolute dates
_DATE_ISO = re.compile(r"\b(20\d{2})[-/\.](\d{1,2})[-/\.](\d{1,2})\b") # 2025-10-15 or 2025/10/15
_DATE_US = re.compile(r"\b(\d{1,2})[-/\.](\d{1,2})[-/\.](20\d{2})\b") # 10/15/2025
_DATE_LONG = re.compile(
r"\b([A-Za-z]{3,9})\s+(\d{1,2})(?:st|nd|rd|th)?\,?\s+(20\d{2})\b", re.I) # Oct 15, 2025 / October 15 2025
def _to_iso(y:int, m:int, d:int) -> Optional[str]:
try:
return datetime(y, m, d, tzinfo=timezone.utc).date().isoformat()
except Exception:
return None
def _parse_date_fragment(s: str) -> Optional[str]:
s = (s or "").strip()
# ISO first
m = _DATE_ISO.search(s)
if m:
y, mm, dd = int(m.group(1)), int(m.group(2)), int(m.group(3))
return _to_iso(y, mm, dd)
# US 10/15/2025
m = _DATE_US.search(s)
if m:
mm, dd, y = int(m.group(1)), int(m.group(2)), int(m.group(3))
return _to_iso(y, mm, dd)
# Long months
m = _DATE_LONG.search(s)
if m:
mon = m.group(1).lower()
dd = int(m.group(2))
y = int(m.group(3))
mm = _MONTHS.get(mon)
if mm:
return _to_iso(y, mm, dd)
return None
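# Example inputs/outputs (illustrative):
#   _parse_date_fragment("2025-10-15")          -> "2025-10-15"
#   _parse_date_fragment("10/15/2025")          -> "2025-10-15"
#   _parse_date_fragment("October 15th, 2025")  -> "2025-10-15"
#   _parse_date_fragment("rolling basis")       -> None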
def _extract_deadline(text: str) -> Tuple[Optional[str], Optional[str]]:
"""
Try to locate a deadline-like line, then parse a date.
Returns (deadline_iso, raw_snippet) or (None, None).
"""
if not text:
return None, None
# Look for labeled lines first (strong signal)
for line in text.splitlines():
m = _DEADLINE_LINES.search(line)
if m:
iso = _parse_date_fragment(m.group(2))
if iso:
return iso, line.strip()
# Fallback: scan whole text for any date (first match)
iso = _parse_date_fragment(text)
if iso:
return iso, None
return None, None
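# Example (illustrative): a labeled deadline line wins and its raw snippet is preserved.
#   _extract_deadline("Grant overview\nApplications due: March 1, 2026\nContact us")
#     -> ("2026-03-01", "Applications due: March 1, 2026")
#   _extract_deadline("Posted 2025-09-01. No closing date announced.")
#     -> ("2025-09-01", None)   # fallback scan returns the first date found, no snippet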
def _compute_is_active(deadline_iso: Optional[str]) -> bool:
if not deadline_iso:
return True # treat unknown/TBD as active
try:
        return datetime.fromisoformat(deadline_iso).date() >= datetime.now(timezone.utc).date()
except Exception:
return True
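# Example (illustrative):
#   _compute_is_active(None)          -> True   (unknown/TBD deadlines stay visible)
#   _compute_is_active("2099-12-31")  -> True
#   _compute_is_active("2001-01-01")  -> False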
def _attach_matt25_tags(rec: Dict[str, Any]) -> None:
"""Add Matthew 25 tags by inspecting core text fields; preserve any manual tags."""
blob = " ".join(filter(None, [
rec.get("title", ""),
rec.get("synopsis") or rec.get("summary") or "",
rec.get("eligibility", ""),
rec.get("agency", ""),
]))
manual = rec.get("tags") or []
if not isinstance(manual, list):
manual = [manual]
auto = infer_matt25_tags(blob)
rec["tags"] = sorted(set(manual) | set(auto))
# -------------------- Grants.gov collector --------------------
def _collect_from_grantsgov_api(src: Dict) -> List[Dict[str, Any]]:
"""
Calls the Grants.gov Search2 client and returns a list of RAW dicts
(adapter may already be close to unified; we'll still run normalize()).
"""
from app.sources.grantsgov_api import search_grants # local import to avoid cycles
api = src.get("api", {})
page_size = int(api.get("page_size", src.get("page_size", 100)))
max_pages = int(api.get("max_pages", src.get("max_pages", 5)))
payload = api.get("payload", src.get("payload", {}))
url = src.get("url", "")
out = search_grants(url, payload, page_size=page_size, max_pages=max_pages)
hits = out.get("hits", []) if isinstance(out, dict) else (out or [])
return [h for h in hits if isinstance(h, dict)]
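# Illustrative config/sources.yaml entry for this collector. The endpoint URL and the
# payload keys below are assumptions; use whatever app.sources.grantsgov_api.search_grants expects.
#   - name: grants_gov
#     type: grantsgov_api
#     enabled: true
#     url: https://api.grants.gov/v1/api/search2
#     api:
#       page_size: 100
#       max_pages: 5
#       payload:
#         keyword: "capacity building"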
# -------------------- HTTP helpers --------------------
_HTTP_HEADERS = {
"User-Agent": "grants-rag/1.0 (+https://example.local) requests",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
def _http_get(url: str, timeout: int = 20) -> Optional[requests.Response]:
try:
r = requests.get(url, headers=_HTTP_HEADERS, timeout=timeout)
if r.status_code == 200 and r.content:
return r
except requests.RequestException:
return None
return None
def _soup(html: str) -> BeautifulSoup:
    # prefer lxml for speed/robustness; fall back to the stdlib parser if it isn't installed
    try:
        return BeautifulSoup(html, "lxml")
    except Exception:
        return BeautifulSoup(html, "html.parser")
def _text_from_soup(s: BeautifulSoup, selectors: Optional[List[str]] = None) -> Tuple[str, str]:
"""
Returns (title, text). Uses selectors if provided;
falls back to common content containers.
"""
title = s.title.string.strip() if s.title and s.title.string else ""
nodes = []
if selectors:
for css in selectors:
nodes.extend(s.select(css) or [])
if not nodes:
for css in ("main", "article", "#content", ".content", "[role='main']"):
nodes.extend(s.select(css) or [])
if not nodes:
nodes = [s.body] if s.body else []
parts: List[str] = []
for n in nodes:
if not n:
continue
txt = n.get_text(separator="\n", strip=True)
if txt:
parts.append(txt)
body = "\n\n".join(parts).strip()
return title, body
def _make_id(*fields: str) -> str:
h = hashlib.sha1()
for f in fields:
if f:
h.update(f.encode("utf-8", "ignore"))
h.update(b"|")
return h.hexdigest()
def _normalize_web_record(
source_name: str,
url: str,
title: str,
body: str,
static: Dict[str, Any],
extra: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""
Produce a record shaped like normalize() output so downstream stays unchanged.
Adds parsed deadline + is_active when available.
"""
extra = extra or {}
# If caller didn't pass a deadline, try to parse from body/title here.
deadline_iso = extra.get("deadline")
deadline_text = extra.get("deadline_text")
if not deadline_iso:
# Try parsing from body first, then from title.
        deadline_iso, deadline_text = _extract_deadline(body)
        if not deadline_iso and title:
            deadline_iso, _ = _extract_deadline(title)
rec = {
"id": extra.get("id") or _make_id(url, title or body[:160]),
"title": title or extra.get("title") or url,
"synopsis": (body or "")[:2000], # clip; embeddings use title+synopsis later
"summary": None,
"url": url,
"source": source_name,
"geo": static.get("geo"),
"categories": static.get("categories"),
"agency": extra.get("agency", ""),
"eligibility": extra.get("eligibility", ""),
# Store ISO (YYYY-MM-DD) in 'deadline' for consistency
"deadline": deadline_iso,
"deadline_text": deadline_text, # keep the raw line we matched (if any)
"program_number": extra.get("program_number"),
"posted_date": extra.get("posted_date"),
}
rec["is_active"] = _compute_is_active(rec["deadline"])
# NEW: add Matthew 25 tags based on title/synopsis/eligibility text
_attach_matt25_tags(rec)
return rec
# -------------------- Collectors: http_html / web_page --------------------
def _collect_from_http_html(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Supports types: 'web_page' and 'http_html'
Config keys supported:
- url (str)
- parse: { follow_links: bool, link_selectors: [..], content_selectors: [..] }
- crawl: { schedule: "...", max_depth: int } # max_depth 0/None = only landing
"""
url = entry.get("url")
if not url:
return []
r = _http_get(url)
if not r:
return []
s = _soup(r.text)
parse = entry.get("parse", {}) or entry.get("extract", {}) or {}
content_selectors = parse.get("content_selectors") or []
title, body = _text_from_soup(s, content_selectors)
rows = []
rows.append(_normalize_web_record(source_name, url, title, body, static, extra={"posted_date": None}))
# follow links?
follow = bool(parse.get("follow_links"))
link_selectors = parse.get("link_selectors") or []
crawl = entry.get("crawl", {}) or {}
max_depth = int(crawl.get("max_depth", 0) or 0)
visited = set([url])
def _enq_links(soup: BeautifulSoup) -> List[str]:
if link_selectors:
links = []
for sel in link_selectors:
for a in soup.select(sel) or []:
href = a.get("href")
if href and href.startswith("http"):
links.append(href)
out, seen = [], set()
for h in links:
if h not in seen:
out.append(h)
seen.add(h)
return out[:40] # polite cap
return []
if follow and max_depth > 0:
frontier = _enq_links(s)
depth = 1
while frontier and depth <= max_depth and len(rows) < 200:
next_frontier = []
for link in frontier:
if link in visited:
continue
visited.add(link)
rr = _http_get(link)
if not rr:
continue
ss = _soup(rr.text)
t2, b2 = _text_from_soup(ss, content_selectors)
if b2:
rows.append(_normalize_web_record(source_name, link, t2, b2, static, extra={"posted_date": None}))
if depth < max_depth:
next_frontier.extend(_enq_links(ss))
time.sleep(0.1) # gentle
frontier = next_frontier
depth += 1
return rows
# -------------------- Collector: http_html_js (Playwright) --------------------
def _collect_from_http_html_js(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
JS-rendered pages using Playwright, with per-card extraction and robust scrolling.
entry.options:
- wait_for (css or ms int)
- scroll (bool)
- scroll_selector (css) # NEW: scroll a container div, not the window
- scroll_times (int) # NEW: default 20
- scroll_wait_ms (int) # NEW: default 400
- min_cards (int) # NEW: wait until at least N cards exist
- click_selector (css)
- max_pages (int)
- timeout_ms (int)
- network_idle (bool)
- debug (bool)
entry.selectors: card, title, link, description, meta
"""
try:
from playwright.sync_api import sync_playwright # type: ignore
except Exception:
print(f"[collect][skip] {source_name}: Playwright not installed.")
return []
url = entry.get("url")
if not url:
return []
options = entry.get("options", {}) or {}
parse = entry.get("parse", {}) or entry.get("extract", {}) or {}
selectors = entry.get("selectors", {}) or {}
content_selectors = parse.get("content_selectors") or []
timeout_ms = int(options.get("timeout_ms", 6000))
network_idle = bool(options.get("network_idle", True))
debug = bool(options.get("debug", False))
max_pages = int(options.get("max_pages", 1))
click_sel = options.get("click_selector") or entry.get("next_selector")
wait_for = options.get("wait_for")
rows: List[Dict[str, Any]] = []
def _text_first(soup: BeautifulSoup, css_list: str) -> str:
if not css_list:
return ""
for css in [c.strip() for c in css_list.split(",")]:
el = soup.select_one(css)
if el:
txt = el.get_text(separator=" ", strip=True)
if txt:
return txt
return ""
def _attr_first(soup: BeautifulSoup, css_list: str, attr: str) -> Optional[str]:
if not css_list:
return None
for css in [c.strip() for c in css_list.split(",")]:
el = soup.select_one(css)
if el:
val = el.get(attr)
if val:
return val
return None
def _parse_cards(page_html: str, base_url: str) -> List[Dict[str, Any]]:
s = _soup(page_html)
card_css = selectors.get("card", "")
if not card_css:
title, body = _text_from_soup(s, content_selectors)
return [_normalize_web_record(source_name, base_url, title, body, static, extra={"posted_date": None})]
out: List[Dict[str, Any]] = []
for card in s.select(card_css) or []:
            csoup = _soup(str(card))  # re-parse the card fragment with the shared parser helper
title = _text_first(csoup, selectors.get("title", "h1, h2, h3"))
href = _attr_first(csoup, selectors.get("link", "a"), "href")
link = urljoin(base_url, href) if href else base_url
desc = _text_first(csoup, selectors.get("description", "p, .summary, .excerpt, .card-text"))
meta = _text_first(csoup, selectors.get("meta", ".meta, .tags, .badge, .location"))
body = "\n".join([p for p in (desc, meta) if p]).strip()
if not (title or body):
continue
out.append(_normalize_web_record(source_name, link, title or link, body, static, extra={"posted_date": None}))
return out
def _wait_page_ready(page, *, wait_for, timeout_ms, options, selectors):
# wait_for can be CSS or milliseconds
if isinstance(wait_for, int):
page.wait_for_timeout(wait_for)
elif isinstance(wait_for, str) and wait_for:
page.wait_for_selector(wait_for, timeout=min(timeout_ms, 15000))
# Scroll window or a container div
if options.get("scroll"):
scroll_sel = options.get("scroll_selector")
scroll_times = int(options.get("scroll_times", 20))
scroll_wait = int(options.get("scroll_wait_ms", 400))
if scroll_sel:
                page.evaluate(
                    """([sel, times, wait]) => new Promise(res => {
                        const el = document.querySelector(sel);
                        if (!el) { res(); return; }
                        let i = 0;
                        const t = setInterval(() => {
                            const y = el.scrollTop;
                            el.scrollTop = el.scrollHeight;
                            i++;
                            if (el.scrollTop === y || i >= times) { clearInterval(t); res(); }
                        }, wait);
                    })""",
                    # Playwright's evaluate() takes a single arg; pass the values as one list
                    [scroll_sel, scroll_times, scroll_wait],
                )
else:
                page.evaluate(
                    """([times, wait]) => new Promise(res => {
                        let i = 0;
                        const t = setInterval(() => {
                            const y = window.scrollY;
                            window.scrollBy(0, document.body.scrollHeight);
                            i++;
                            if (window.scrollY === y || i >= times) { clearInterval(t); res(); }
                        }, wait);
                    })""",
                    [scroll_times, scroll_wait],
                )
# Optionally wait for a minimum number of cards (virtualized lists)
min_cards = int(options.get("min_cards", 0))
card_css = (selectors or {}).get("card", "")
if min_cards and card_css:
try:
page.wait_for_function(
"""([sel, target]) => {
const els = document.querySelectorAll(sel);
return els && els.length >= target;
}""",
arg=[card_css, min_cards],
timeout=min(timeout_ms, 15000),
)
except Exception:
pass # best-effort
def _try_once(p):
def _route(route):
r = route.request
if r.resource_type in {"image", "media", "font"}:
return route.abort()
return route.continue_()
browser = p.chromium.launch(headless=not debug)
context = browser.new_context(user_agent=(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
))
context.route("**/*", _route)
page = context.new_page()
try:
page.set_default_timeout(timeout_ms)
page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
if network_idle:
page.wait_for_load_state("networkidle", timeout=min(timeout_ms, 15000))
_wait_page_ready(page, wait_for=wait_for, timeout_ms=timeout_ms, options=options, selectors=selectors)
htmls = [page.content()]
# pagination
for _ in range(max_pages - 1):
sel = click_sel
if not sel or page.locator(sel).count() == 0:
break
page.click(sel)
page.wait_for_load_state("domcontentloaded")
if network_idle:
page.wait_for_load_state("networkidle", timeout=min(timeout_ms, 15000))
_wait_page_ready(page, wait_for=wait_for, timeout_ms=timeout_ms, options=options, selectors=selectors)
htmls.append(page.content())
for html in htmls:
found = _parse_cards(html, url)
rows.extend(found)
print(f"[collect][cards] {source_name}: found {len(found)} cards on this page")
browser.close()
return True
except Exception as e:
if debug:
try:
snap = DOCSTORE_DIR / f"playwright_error_{hashlib.sha1(url.encode()).hexdigest()}.png"
page.screenshot(path=str(snap))
print(f"[collect][debug] Saved screenshot: {snap}")
except Exception:
pass
print(f"[collect][warn] {source_name}: {e.__class__.__name__}: {e}")
try:
browser.close()
except Exception:
pass
return False
with sync_playwright() as p:
ok = _try_once(p)
if not ok:
time.sleep(1.5)
_try_once(p)
if not rows:
print(f"[collect][skip] {source_name}: no content after retries.")
return rows
# -------------------- PDF collector --------------------
def _collect_from_http_pdf(entry: Dict, source_name: str, static: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
type: 'http_pdf'
keys:
- url (single PDF fetch)
"""
url = entry.get("url")
if not url:
return []
try:
from pdfminer.high_level import extract_text # lazy import
except Exception:
return []
rows = []
r = _http_get(url, timeout=40)
if not r:
return rows
tmp = DOCSTORE_DIR / (hashlib.sha1(url.encode("utf-8")).hexdigest() + ".pdf")
try:
DOCSTORE_DIR.mkdir(parents=True, exist_ok=True)
tmp.write_bytes(r.content)
body = extract_text(str(tmp)) or ""
finally:
try:
tmp.unlink(missing_ok=True)
except Exception:
pass
title = entry.get("name") or "PDF Document"
rows.append(_normalize_web_record(source_name, url, title, body, static, extra={"posted_date": None}))
return rows
# -------------------- De-dup helpers (5.2) --------------------
def _norm(text: str) -> str:
t = (text or "").lower()
t = re.sub(r'[^a-z0-9 ]+', ' ', t)
return re.sub(r'\s+', ' ', t).strip()
def _hash_fingerprint(title: str, agency: str, deadline: str) -> str:
base = f"{_norm(title)}|{_norm(agency)}|{deadline or ''}"
return hashlib.sha1(base.encode()).hexdigest()
def _near_duplicate(a: Dict[str, Any], b: Dict[str, Any]) -> bool:
# Deadlines equal or both missing
dates_close = (a.get("deadline") == b.get("deadline")) or (not a.get("deadline") and not b.get("deadline"))
t_sim = SequenceMatcher(None, _norm(a.get("title","")), _norm(b.get("title",""))).ratio()
ag_sim = SequenceMatcher(None, _norm(a.get("agency","")), _norm(b.get("agency",""))).ratio()
return dates_close and (t_sim > 0.88) and (ag_sim > 0.75)
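# Example (illustrative): punctuation/case differences are erased by _norm, so these two match.
#   a = {"title": "Capacity-Building Grant 2026", "agency": "Example Foundation", "deadline": "2026-03-01"}
#   b = {"title": "capacity building grant 2026", "agency": "Example Foundation", "deadline": "2026-03-01"}
#   _near_duplicate(a, b)  -> True  (equal deadlines, title/agency similarity above thresholds)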
def _merge_records(primary: Dict[str, Any], other: Dict[str, Any]) -> Dict[str, Any]:
"""Merge fields while preserving best data and provenance."""
merged = dict(primary)
# Prefer non-empty fields; combine categories; keep earliest posted_date; keep earliest deadline if different.
def choose(a, b):
return a if (a not in (None, "", [], {})) else b
merged["url"] = choose(primary.get("url"), other.get("url"))
merged["title"] = choose(primary.get("title"), other.get("title"))
merged["synopsis"] = choose(primary.get("synopsis"), other.get("synopsis"))
merged["summary"] = choose(primary.get("summary"), other.get("summary"))
merged["agency"] = choose(primary.get("agency"), other.get("agency"))
merged["eligibility"] = choose(primary.get("eligibility"), other.get("eligibility"))
merged["program_number"] = choose(primary.get("program_number"), other.get("program_number"))
merged["geo"] = choose(primary.get("geo"), other.get("geo"))
# categories β†’ union (list)
cats_a = primary.get("categories") or []
cats_b = other.get("categories") or []
if not isinstance(cats_a, list): cats_a = [cats_a]
if not isinstance(cats_b, list): cats_b = [cats_b]
merged["categories"] = sorted(set([c for c in cats_a + cats_b if c]))
# deadline: choose earlier known date (safer to surface sooner one)
da, db = primary.get("deadline"), other.get("deadline")
if da and db:
merged["deadline"] = min(da, db)
else:
merged["deadline"] = da or db
# carry a deadline_text if any
merged["deadline_text"] = choose(primary.get("deadline_text"), other.get("deadline_text"))
merged["is_active"] = _compute_is_active(merged.get("deadline"))
# posted_date: keep earliest if both
pa, pb = primary.get("posted_date"), other.get("posted_date")
merged["posted_date"] = min(pa, pb) if (pa and pb) else (pa or pb)
# provenance: combine sources + urls
prov_sources = set()
for s in (primary.get("source"), other.get("source")):
if not s: continue
if isinstance(s, list): prov_sources.update(s)
else: prov_sources.add(s)
merged["source"] = sorted(prov_sources) if prov_sources else None
prov_urls = set()
for u in (primary.get("url"), other.get("url")):
if u: prov_urls.add(u)
# keep a list of all discovered urls
merged["all_urls"] = sorted(prov_urls.union(set(primary.get("all_urls") or []), set(other.get("all_urls") or [])))
# recompute ID based on merged fingerprint (title/agency/deadline)
merged["id"] = _hash_fingerprint(merged.get("title",""), merged.get("agency",""), merged.get("deadline",""))
return merged
def _dedupe_and_merge(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Exact fingerprint + fuzzy near-dup consolidation across sources."""
uniques: List[Dict[str, Any]] = []
by_fp: Dict[str, int] = {}
for r in rows:
fp = _hash_fingerprint(r.get("title",""), r.get("agency",""), r.get("deadline",""))
if fp in by_fp:
# exact dup: merge into existing
idx = by_fp[fp]
uniques[idx] = _merge_records(uniques[idx], r)
continue
# fuzzy check against current uniques
found_idx = None
for i, u in enumerate(uniques):
if _near_duplicate(r, u):
found_idx = i
break
if found_idx is not None:
uniques[found_idx] = _merge_records(uniques[found_idx], r)
# also index its fingerprint (so later exact matches land here)
new_fp = _hash_fingerprint(uniques[found_idx].get("title",""),
uniques[found_idx].get("agency",""),
uniques[found_idx].get("deadline",""))
by_fp[new_fp] = found_idx
else:
by_fp[fp] = len(uniques)
# initialize provenance
r.setdefault("all_urls", [r.get("url")] if r.get("url") else [])
uniques.append(r)
return uniques
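# Example (illustrative): two rows describing the same opportunity from different sources
# collapse to one record; provenance is kept in 'source' and 'all_urls'.
#   rows = [
#       {"title": "Capacity Grant", "agency": "Example Org", "deadline": "2026-03-01",
#        "source": "grants_gov", "url": "https://example.org/a"},
#       {"title": "Capacity Grant!", "agency": "Example Org", "deadline": "2026-03-01",
#        "source": "foundation_site", "url": "https://example.org/b"},
#   ]
#   merged = _dedupe_and_merge(rows)   # len(merged) == 1
#   merged[0]["source"]                # -> ["foundation_site", "grants_gov"]
#   merged[0]["all_urls"]              # -> ["https://example.org/a", "https://example.org/b"]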
# -------------------- Write docstore & build index --------------------
def _save_docstore(recs: List[Dict[str, Any]]) -> str:
DOCSTORE_DIR.mkdir(parents=True, exist_ok=True)
path = DOCSTORE_DIR / "docstore.jsonl"
with path.open("w", encoding="utf-8") as f:
for r in recs:
f.write(json.dumps(r, ensure_ascii=False) + "\n")
return str(path)
def _build_index_from_docstore() -> int:
ds_path = DOCSTORE_DIR / "docstore.jsonl"
if not ds_path.exists():
raise RuntimeError("Docstore not found. Run ingest first.")
texts: List[str] = []
metas: List[Dict[str, Any]] = []
with ds_path.open("r", encoding="utf-8") as f:
for line in f:
rec = json.loads(line)
title = rec.get("title") or ""
synopsis = rec.get("synopsis") or rec.get("summary") or ""
agency = rec.get("agency") or ""
eligibility = rec.get("eligibility") or ""
txt = "\n".join([title, synopsis, agency, eligibility]).strip()
if not txt:
continue
texts.append(txt)
metas.append({
"id": rec.get("id"),
"title": title,
"url": rec.get("url"),
"source": rec.get("source"),
"tags": rec.get("tags"),
"geo": rec.get("geo"),
"categories": rec.get("categories"),
"agency": agency,
"deadline": rec.get("deadline"), # ISO if available
"deadline_text": rec.get("deadline_text"),
"is_active": rec.get("is_active"),
"program_number": rec.get("program_number"),
"posted_date": rec.get("posted_date"),
"all_urls": rec.get("all_urls"),
})
print(f"[index] Rows loaded from docstore: {len(texts)}")
if not texts:
INDEX_DIR.mkdir(parents=True, exist_ok=True)
(INDEX_DIR / "meta.json").write_text(json.dumps([], ensure_ascii=False))
print("[index] No texts to embed. Wrote empty meta.json.")
return 0
# Embed (CPU default; portable)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model.max_seq_length = 256
batch = max(8, min(32, len(texts)))
emb = model.encode(
texts,
convert_to_numpy=True,
normalize_embeddings=True,
show_progress_bar=True,
batch_size=batch,
).astype(np.float32, copy=False)
# FAISS index (Inner Product for cosine on normalized vectors)
import faiss
dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb)
INDEX_DIR.mkdir(parents=True, exist_ok=True)
faiss.write_index(index, str(INDEX_DIR / "faiss.index"))
(INDEX_DIR / "meta.json").write_text(json.dumps(metas, ensure_ascii=False))
print(f"[index] Wrote FAISS index with {emb.shape[0]} vectors (dim={dim}).")
return len(texts)
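# Downstream query sketch (illustrative; the real retriever lives elsewhere in the app):
#   import json, faiss
#   from sentence_transformers import SentenceTransformer
#   index = faiss.read_index(str(INDEX_DIR / "faiss.index"))
#   metas = json.loads((INDEX_DIR / "meta.json").read_text())
#   model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
#   q = model.encode(["capacity building grants in Pennsylvania"],
#                    convert_to_numpy=True, normalize_embeddings=True).astype("float32")
#   scores, ids = index.search(q, 5)   # inner product == cosine on normalized vectors
#   top = [metas[i] for i in ids[0] if i != -1]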
# -------------------- Public API: ingest --------------------
__all__ = ["ingest"]
def ingest(cfg_path: str = "config/sources.yaml", env: Dict | None = None):
"""
Reads config, fetches from enabled sources via adapters, normalizes to a single schema,
applies filters (capacity / PA-MD), de-dupes, writes docstore, and builds the FAISS index.
Returns (docstore_path, n_indexed).
"""
cfg = load_config(cfg_path)
# ---- Filters from config ----
f_cfg = (cfg or {}).get("filters", {}) or {}
capacity_only = bool(f_cfg.get("capacity_only", True))
pa_md_only = bool(f_cfg.get("pa_md_only", False))
print(f"[filters] capacity_only = {'TRUE' if capacity_only else 'FALSE'}")
print(f"[filters] pa_md_only = {'TRUE' if pa_md_only else 'FALSE'}")
all_rows: List[Dict[str, Any]] = []
for entry in cfg.get("sources", []):
if not entry.get("enabled"):
continue
name = entry.get("name", "<source>")
geo = entry.get("geo") or "US"
cats = entry.get("categories") or []
static = {"geo": geo, "categories": cats}
typ = entry.get("type")
rows: List[Dict[str, Any]] = []
# -------- Collect from each adapter --------
if typ == "grantsgov_api":
raw_hits = _collect_from_grantsgov_api(entry)
rows = [normalize("grants_gov", h, static) for h in raw_hits]
elif typ in ("web_page", "http_html"):
rows = _collect_from_http_html(entry, name, static)
elif typ == "http_html_js":
rows = _collect_from_http_html_js(entry, name, static)
elif typ == "http_pdf":
rows = _collect_from_http_pdf(entry, name, static)
elif typ == "local_sample":
p = Path(entry["path"]).expanduser()
blob = json.loads(p.read_text(encoding="utf-8"))
items = blob.get("opportunities") or []
rows = [normalize("local_sample", op, static) for op in items]
else:
print(f"[collect] {name}: unknown type '{typ}', skipping.")
continue
print(f"[collect] {name}: fetched_rows={len(rows)}")
# ---- Apply capacity / geo filters BEFORE indexing (allow per-source bypass) ----
if rows:
if entry.get("skip_filters"):
print(f"[filter] {name}: skip_filters=true β†’ keeping all {len(rows)}")
else:
pre = len(rows)
filtered = []
for r in rows:
t = _doc_text_from_row(r)
if capacity_only and not _is_capacity_building_text(t):
continue
if pa_md_only and not _is_pa_md_text(t):
continue
filtered.append(r)
print(
f"[filter] {name}: kept {len(filtered)}/{pre} after filters "
f"(capacity_only={capacity_only}, pa_md_only={pa_md_only})"
)
rows = filtered
# >>> GLOBAL MATT 25 TAGGING (safer, one place) <<<
for r in rows:
# If you truly want to leave Grants.gov untouched, keep this guard:
if r.get("source") == "grants_gov":
continue
_attach_matt25_tags(r)
print(f"[collect] {name} β†’ rows_after_filters={len(rows)}")
all_rows.extend(rows)
# ---- Cross-source DEDUPE + MERGE ----
unique = _dedupe_and_merge(all_rows)
print(f"[ingest] Unique records to index: {len(unique)}")
path = _save_docstore(unique)
n = _build_index_from_docstore()
return path, n
# -------------------- CLI --------------------
if __name__ == "__main__":
import argparse
ap = argparse.ArgumentParser()
ap.add_argument("--config", default="config/sources.yaml")
args = ap.parse_args()
p, n = ingest(args.config)
print(f"Ingested {n} records. Docstore at {p}")