grants-rag / config /sources.yaml
michaellupo74's picture
feat(ingest): JS card/grid + scroll container + skip_filters
b53e303
raw
history blame
12.6 kB
# Minimal, valid config — v6.3
filters:
  # When true, keep only capacity-building items.
  capacity_only: false
  # When true, restrict the index to PA/MD.
  pa_md_only: false
sources:
# ---------- FEDERAL: Grants.gov (focused for buses/van/mobility & reentry) ----------
# Grants.gov search2 query for posted opportunities matching "capacity building",
# sorted by open date, descending.
- name: "Grants.gov — Capacity Building (general)"
  type: grantsgov_api
  enabled: true
  url: "https://api.grants.gov/v1/api/search2"
  geo: "US"
  categories: ["capacity_building"]
  api:
    page_size: 100
    max_pages: 5
    payload:
      keyword: "capacity building"
      oppStatuses: "posted"
      sortBy: "openDate|desc"
# Grants.gov search2 query for posted transportation/vehicle opportunities
# (buses, vans, transit), sorted by open date, descending.
- name: "Grants.gov — Capacity + Transportation (buses/vans/transit)"
  type: grantsgov_api
  enabled: true
  url: "https://api.grants.gov/v1/api/search2"
  geo: "US"
  categories: ["capacity_building", "transportation", "vehicle"]
  api:
    page_size: 100
    max_pages: 5
    payload:
      keyword: "transportation OR shuttle OR bus OR van OR transit OR mobility"
      oppStatuses: "posted"
      # NOTE(review): these look like readable names rather than raw Grants.gov codes
      # (agency codes are usually like DOT-FTA / HHS-ACL) — confirm the adapter maps them.
      agencies: "DOT|FTA|DOE|ACL"
      fundingCategories: "TRANSPORTATION|HUMAN_SERVICES|LAW_JUSTICE"
      sortBy: "openDate|desc"
# Grants.gov search2 query scoped to assistance listing 20.513 (FTA Section 5310),
# posted opportunities only, sorted by open date, descending.
- name: "Grants.gov — FTA Section 5310 (Enhanced Mobility for Seniors & Individuals with Disabilities)"
  type: grantsgov_api
  enabled: true
  url: "https://api.grants.gov/v1/api/search2"
  geo: "US"
  categories: ["elderly", "transportation", "vehicle"]
  api:
    page_size: 100
    max_pages: 3
    payload:
      # Quoted so the listing number stays a string instead of becoming a float.
      aln: "20.513"
      # Folded scalar: the line break becomes a single space, and the inner double
      # quotes are literal phrase delimiters (no escaping needed).
      keyword: >-
        "Enhanced Mobility" OR "Section 5310" OR seniors OR elderly OR disabilities
        OR paratransit OR wheelchair OR shuttle OR van OR bus
      oppStatuses: "posted"
      # NOTE(review): these look like readable names rather than raw Grants.gov codes
      # (agency codes are usually like DOT-FTA) — confirm the adapter maps them.
      agencies: "FTA"
      fundingCategories: "TRANSPORTATION|HUMAN_SERVICES|LAW_JUSTICE"
      sortBy: "openDate|desc"
# Grants.gov search2 query for posted reentry / Second Chance opportunities,
# sorted by open date, descending.
- name: "Grants.gov — Reentry Transportation (Second Chance / BJA)"
  type: grantsgov_api
  enabled: true
  url: "https://api.grants.gov/v1/api/search2"
  geo: "US"
  categories: ["reentry", "transportation", "justice"]
  api:
    page_size: 100
    max_pages: 3
    payload:
      # Folded scalar: the line break becomes a single space, and the inner double
      # quotes are literal phrase delimiters (no escaping needed).
      keyword: >-
        reentry OR "second chance" OR returning citizens OR offender transition
        OR justice-involved employment OR transportation
      oppStatuses: "posted"
      # NOTE(review): these look like readable names rather than raw Grants.gov codes
      # (agency codes are usually like USDOJ-OJP-BJA) — confirm the adapter maps them.
      agencies: "BJA"
      fundingCategories: "TRANSPORTATION|HUMAN_SERVICES|LAW_JUSTICE"
      sortBy: "openDate|desc"
# Grants.gov search2 query for posted ACL opportunities on senior transport and access,
# sorted by open date, descending.
- name: "Grants.gov — Aging & Mobility (ACL/HHS for seniors transport & access)"
  type: grantsgov_api
  enabled: true
  url: "https://api.grants.gov/v1/api/search2"
  geo: "US"
  categories: ["elderly", "transportation", "human_services"]
  api:
    page_size: 100
    max_pages: 3
    payload:
      keyword: "transportation OR mobility OR access OR paratransit OR shuttle OR rideshare"
      oppStatuses: "posted"
      # NOTE(review): these look like readable names rather than raw Grants.gov codes
      # (agency codes are usually like HHS-ACL) — confirm the adapter maps them.
      agencies: "ACL"
      fundingCategories: "TRANSPORTATION|HUMAN_SERVICES|LAW_JUSTICE"
      sortBy: "openDate|desc"
# ---------- FEDERAL: Federal Register (broad NOFO scanning) ----------
# Federal Register documents API queried for funding/NOFO terms.
- name: "Federal Register — Funding/NOFO keywords (API)"
  type: http_json
  enabled: true
  url: "https://www.federalregister.gov/api/v1/documents.json"
  geo: "US"
  categories: ["capacity_building", "notices"]
  api:
    payload:
      # Key is quoted because it contains the flow indicators [ and ].
      "conditions[term]": "funding opportunity OR cooperative agreement OR NOFO"
      per_page: 50
      order: "newest"
  # Field mapping for each entry under item_path (presumably response field names;
  # confirm against the http_json adapter).
  parse:
    item_path: "results[]"
    title: "title"
    link: "html_url"
    published_at: "publication_date"
    body: "abstract"
# ---------- STATE & METRO PASS-THROUGHS (FTA 5310 etc.) ----------
# Maryland MTA grants page (includes Section 5310); weekly crawl, max_depth 1.
- name: "Maryland MTA — Grants (incl. 5310)"
  type: web_page
  enabled: true
  url: "https://www.mta.maryland.gov/grants"
  geo: "MD"
  # "5310" stays quoted so it is read as a string, not an integer.
  categories: ["transportation", "elderly", "disabilities", "5310"]
  crawl:
    schedule: "weekly"
    max_depth: 1
  extract:
    mode: "article"
    keep_links: true
# PennDOT Section 5310 program details page; weekly crawl.
- name: "Pennsylvania PennDOT — 5310 Program (details)"
  type: web_page
  enabled: true
  url: "https://www.pa.gov/grants/search/grant-details.penndot13"
  geo: "PA"
  # "5310" stays quoted so it is read as a string, not an integer.
  categories: ["transportation", "elderly", "disabilities", "5310"]
  crawl:
    schedule: "weekly"
  extract:
    mode: "article"
    keep_links: true
# PennDOT Section 5310 guidelines PDF; weekly crawl, extracted with the pdf_text mode.
- name: "Pennsylvania PennDOT — 5310 Guidelines (PDF)"
  type: web_page
  enabled: true
  url: "https://www.pa.gov/content/dam/copapwp-pagov/en/penndot/documents/programs-and-doing-business/transit/capital-acquisitions-and-requirements/federal-section-5310-guidelines-and-policies.pdf"
  geo: "PA"
  # "5310" stays quoted so it is read as a string, not an integer.
  categories: ["transportation", "elderly", "disabilities", "5310", "guidance"]
  crawl:
    schedule: "weekly"
  extract:
    mode: "pdf_text"
# Virginia DRPT Human Services Grant Program (5310) page; weekly crawl.
- name: "Virginia DRPT — Human Services Grant Program (5310)"
  type: web_page
  enabled: true
  url: "https://drpt.virginia.gov/our-grant-programs/human-services-grant-program/"
  geo: "VA"
  # "5310" stays quoted so it is read as a string, not an integer.
  categories: ["transportation", "elderly", "disabilities", "5310"]
  crawl:
    schedule: "weekly"
  extract:
    mode: "article"
    keep_links: true
# Virginia DRPT Coordinated Human Service Mobility Plan requirements page; monthly crawl.
- name: "Virginia DRPT — Coordinated Human Service Mobility Plan (req.)"
  type: web_page
  enabled: true
  url: "https://drpt.virginia.gov/guidelines-and-requirements/coordinated-human-service-mobility-plan/"
  geo: "VA"
  # "5310" stays quoted so it is read as a string, not an integer.
  categories: ["transportation", "planning", "compliance", "5310"]
  crawl:
    schedule: "monthly"
  extract:
    mode: "article"
    keep_links: true
# MWCOG / TPB Enhanced Mobility program page (DC-region 5310); weekly crawl.
- name: "MWCOG / TPB — Enhanced Mobility (DC region 5310)"
  type: web_page
  enabled: true
  url: "https://www.mwcog.org/transportation/programs/enhanced-mobility/"
  geo: "DC-Region"
  # "5310" stays quoted so it is read as a string, not an integer.
  categories: ["transportation", "elderly", "disabilities", "5310"]
  crawl:
    schedule: "weekly"
  extract:
    mode: "article"
    keep_links: true
# --- Pennsylvania: PA Creative Industries (PCA) ---
# PA Creative Industries capacity-building landing page; follows capacity,
# strategies-for-success, and PDF links.
- name: 'PA Creative Industries – Capacity Building (landing)'
  type: http_html
  enabled: true
  url: 'https://www.pa.gov/agencies/coa/grants-and-loans/capacity-building-programs.html'
  geo: 'PA'
  categories:
    - 'capacity_building'
  parse:
    follow_links: true
    # Double quotes kept here because the selectors contain single quotes.
    link_selectors:
      - "a[href*='capacity']"
      - "a[href*='strategies-for-success']"
      - "a[href$='.pdf']"
    content_selectors:
      - 'main'
      - 'article'
      - '.content'
# PA Creative Industries Creative Sector Flex Fund page; follows PDF, guidelines,
# and apply links.
- name: 'PA Creative Industries – Creative Sector Flex Fund'
  type: http_html
  enabled: true
  url: 'https://www.pa.gov/agencies/coa/grants-and-loans/creative-sector-flex-fund.html'
  geo: 'PA'
  categories:
    - 'capacity_building'
  parse:
    follow_links: true
    # Double quotes kept here because the selectors contain single quotes.
    link_selectors:
      - "a[href$='.pdf']"
      - "a[href*='guidelines']"
      - "a[href*='apply']"
    content_selectors:
      - 'main'
      - 'article'
      - '.content'
# --- Pennsylvania: PCCD (eGrants announcements & PDFs) ---
# PCCD funding announcements index; follows announcement, PDF, and
# program-specific (CJAB / VIP / CCVI / BOOST) links.
- name: 'PCCD – Funding Announcements (eGrants)'
  type: http_html
  enabled: true
  url: 'https://www.pccd.pa.gov/Funding/Pages/default.aspx'
  geo: 'PA'
  categories:
    - 'capacity_building'
    - 'public_safety'
    - 'youth'
  parse:
    follow_links: true
    # Double quotes kept here because the selectors contain single quotes.
    link_selectors:
      - "a[href*='Funding-Announcement']"
      - "a[href$='.pdf']"
      - "a[href*='CJAB']"
      - "a[href*='VIP']"
      - "a[href*='CCVI']"
      - "a[href*='BOOST']"
    content_selectors:
      - 'main'
      - 'article'
      - '.ms-rtestate-field'
# PCCD PDFs, selected by URL pattern instead of a single page URL.
- name: 'PCCD – PDFs (deep fetch)'
  type: http_pdf
  enabled: true
  url_patterns:
    - 'https://www.pccd.pa.gov/*/*.pdf'
  geo: 'PA'
  categories:
    - 'capacity_building'
# --- Maryland: OneStop (JS-rendered search) ---
# Maryland OneStop search for "capacity"; JS-rendered, fetched via the Playwright adapter.
- name: 'Maryland OneStop – Capacity search (JS)'
  type: http_html_js
  enabled: true
  url: 'https://onestop.md.gov/search?query=capacity'
  geo: 'MD'
  categories:
    - 'capacity_building'
  options:
    wait_for: "[role='main']"
    scroll: true
    max_pages: 3
    # Longer timeout for the SPA.
    timeout_ms: 180000
    # Wait for background XHR/fetch to settle.
    network_idle: true
    # Optional: screenshot on failure.
    # debug: true
    # Uncomment if pagination controls appear.
    # click_selector: "a[aria-label='Next']"
  parse:
    follow_links: true
    # Double quotes kept here because the selectors contain single quotes.
    link_selectors:
      - "a[href*='/forms/']"
      - "a[href*='/search/']"
    content_selectors:
      - "[role='main']"
      - "main"
      - "article"
# --- Maryland: DHCD (housing/community programs & press) ---
# MD DHCD grants and loans program index; follows Programs, PDF, and Trust links.
- name: 'MD DHCD – Programs (grants & loans index)'
  type: http_html
  enabled: true
  url: 'https://dhcd.maryland.gov/Pages/Programs.aspx'
  geo: 'MD'
  categories:
    - 'capacity_building'
    - 'housing'
    - 'community_development'
  parse:
    follow_links: true
    # Double quotes kept here because the selectors contain single quotes.
    link_selectors:
      - "a[href*='Programs']"
      - "a[href$='.pdf']"
      - "a[href*='Trust']"
    content_selectors:
      - '#content'
      - 'main'
      - 'article'
# MD DHCD press releases, watched for NOFOs; follows PDF, Notice, and Funding links.
- name: 'MD DHCD – Press/Notices (watch for NOFOs)'
  type: http_html
  enabled: true
  url: 'https://dhcd.maryland.gov/Pages/PressReleases.aspx'
  geo: 'MD'
  categories:
    - 'capacity_building'
  parse:
    follow_links: true
    # Double quotes kept here because the selectors contain single quotes.
    link_selectors:
      - "a[href$='.pdf']"
      - "a[href*='Notice']"
      - "a[href*='Funding']"
    content_selectors:
      - '#content'
      - 'main'
      - 'article'
# --- Maryland: Chesapeake Bay Trust (recurring capacity-building RFPs) ---
# Chesapeake Bay Trust Capacity Building Initiative page; follows PDF and RFP links.
- name: 'Chesapeake Bay Trust – Capacity Building Initiative (CBI)'
  type: http_html
  enabled: true
  url: 'https://cbtrust.org/grants/capacity-building/'
  geo: 'MD'
  categories:
    - 'capacity_building'
    - 'environment'
    - 'community_health'
  parse:
    follow_links: true
    # Double quotes kept here because the selectors contain single quotes.
    link_selectors:
      - "a[href$='.pdf']"
      - "a[href*='Request-for-Proposals']"
      - "a[href*='RFP']"
    content_selectors:
      - 'main'
      - 'article'
      - '.entry-content'
# Chesapeake Bay Trust PDFs, selected by URL pattern instead of a single page URL.
- name: 'CB Trust – PDFs (deep fetch)'
  type: http_pdf
  enabled: true
  url_patterns:
    - 'https://cbtrust.org/*/*.pdf'
  geo: 'MD'
  categories:
    - 'capacity_building'
# --- Pennsylvania: DCED (Programs index; JS-rendered) ---
# PA DCED programs index; JS-rendered, so it uses the http_html_js source type.
- name: "PA DCED — Programs (JS)"
  type: http_html_js
  enabled: true
  url: "https://dced.pa.gov/programs/"
  geo: "PA"
  categories: ["capacity_building", "community_development", "economic_development"]
  options:
    wait_for: "main"
    scroll: true
    max_pages: 5
    timeout_ms: 180000
    network_idle: true
    # click_selector: ".pagination a.next"
    # debug: true
  parse:
    # The trailing .content presumably acts as the fallback item container — confirm.
    item_selector: ".program-listing .program, .content"
    title: ".program-title, h1, h2"
    link: ".program-title a@href, a@href"
    body: ".program-summary, .entry-content, main"
    deadline_selector: ".deadline, .key-dates"
    eligibility_selector: ".eligibility, .who-eligible"
# ---------- OPTIONAL: Curated JSON (enable after you generate it) ----------
# Curated static JSON of state 5310 listings; kept disabled until the file exists.
- name: 'State 5310 Listings (curated JSON)'
  type: json_static
  # Set to true once the file below has been generated.
  enabled: false
  file: 'data/state_5310_listings.json'
  geo: 'PA|MD|VA|DC'
  categories:
    - 'transportation'
    - 'elderly'
    - 'disabilities'
    - '5310'
    - 'deadlines'
# Template for a JS-rendered card/grid listing. The url is an example.org placeholder and
# scroll_selector still needs the real scrolling container, so the source is disabled
# until both are filled in.
- name: "Faith-based Foundations — Card/Grid (JS)"
  type: http_html_js
  enabled: false  # set to true once url and scroll_selector point at the real site
  skip_filters: true
  url: "https://example.org/foundations/maryland/religion-related"
  geo: "MD|PA|DE|NJ|VA"
  categories: ["foundation_private", "faith_based", "capacity_building"]
  options:
    wait_for: "[role='main']"  # or the results container CSS
    scroll: true
    scroll_selector: ".results-pane"  # ← replace with the REAL scrolling DIV
    scroll_times: 40
    scroll_wait_ms: 250
    min_cards: 20
    timeout_ms: 30000
    network_idle: false
    # click_selector: ".pagination a.next"  # only if the page has a Next button
  # Comma-separated CSS alternatives for each card field.
  selectors:
    card: ".result-card, .card, article, .search-result"
    title: "h2 a, h3 a, .card-title a, .result-title a, h2, h3, .card-title"
    link: "h2 a, h3 a, .card-title a, .result-title a, a"
    description: ".summary, .card-text, .excerpt, p"
    meta: ".meta, .tags, .badge, .location"