"""
PDF generation utilities for StudyBuddy
"""
import os
import tempfile
import markdown
import re
from datetime import datetime
from typing import List, Dict
from fastapi import HTTPException
from utils.logger import get_logger
from helpers.setup import gemini_rotator, nvidia_rotator
logger = get_logger("PDF", __name__)
def _escape_code_for_pdf(code_lines) -> str:
    """Join code lines and make them safe for ReportLab's XML-based markup."""
    raw_code = '\n'.join(code_lines)
    raw_code = raw_code.replace('\r\n', '\n').replace('\r', '\n')
    # Expand tabs to four spaces so indentation survives in the fixed-width block.
    raw_code = raw_code.replace('\t', '    ')
    # Drop everything outside printable ASCII (plus newline): the built-in
    # Courier font renders such characters as black squares.
    raw_code = re.sub(r'[^\x09\x0A\x20-\x7E]', '', raw_code)
    # '&' must be escaped first so the entities added below are not re-escaped.
    return raw_code.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def _is_markdown_block_start(stripped_line: str) -> bool:
    """Return True when a stripped line opens a non-paragraph markdown block."""
    if stripped_line.startswith(('#', '```', '- ', '* ', '> ', '---', '***')):
        return True
    return re.match(r'^\d+\.\s', stripped_line) is not None


async def _code_block_flowables(code_lines, language: str, code_style, user_id: str) -> list:
    """Build the flowables for one fenced code block (or Mermaid diagram)."""
    from reportlab.lib.units import inch
    from reportlab.platypus import Image, Paragraph, Spacer, XPreformatted

    escaped = _escape_code_for_pdf(code_lines)

    if language.lower() == 'mermaid':
        try:
            # Imported lazily so PDF generation still works without the diagram helper.
            from helpers.diagram import _render_mermaid_with_retry
            img_bytes = await _render_mermaid_with_retry('\n'.join(code_lines), user_id=user_id)
            if img_bytes:
                import io
                img = Image(io.BytesIO(img_bytes))
                # Fit within page width (~6 inches after margins), keeping aspect ratio.
                max_width = 6.0 * inch
                if img.drawWidth > max_width:
                    scale = max_width / float(img.drawWidth)
                    img.drawWidth = max_width
                    img.drawHeight = img.drawHeight * scale
                return [img, Spacer(1, 12)]
            logger.warning("[PDF] Mermaid render returned empty image after retries, falling back to code block")
        except Exception as me:
            # Best effort: any rendering problem degrades to showing the diagram source.
            logger.warning(f"[PDF] Mermaid render failed after retries, falling back to code block: {me}")
        return [
            Paragraph("[MERMAID DIAGRAM]", code_style),
            XPreformatted(escaped, code_style),
        ]

    highlighted = _apply_syntax_highlight(escaped, language)
    # The language label comes from user text, so it is escaped like the code itself.
    label = _escape_code_for_pdf([language.upper()])
    return [
        Paragraph(f"[{label}]", code_style),
        XPreformatted(highlighted, code_style),
    ]


async def _parse_markdown_content(content: str, heading1_style, heading2_style, heading3_style,
                                  normal_style, code_style, user_id: str = "system"):
    """
    Convert markdown text into a list of ReportLab flowables.

    Handles headers, fenced code blocks (Mermaid fences are rendered as images
    when possible), bullet and numbered lists, blockquotes, horizontal rules and
    multi-line paragraphs. Inline formatting is delegated to
    ``_format_inline_markdown``.

    Args:
        content: Markdown source text.
        heading1_style, heading2_style, heading3_style: Styles for ``#`` to ``###``
            headers; deeper headers use ``normal_style``.
        normal_style: Style for paragraphs, list items and blockquotes.
        code_style: Style for code blocks and their language label.
        user_id: Forwarded to the Mermaid renderer. Defaults to ``"system"`` so
            existing callers that do not pass it keep working.

    Returns:
        The list of flowables, in document order.
    """
    from reportlab.platypus import Paragraph, Spacer

    heading_styles = {1: heading1_style, 2: heading2_style, 3: heading3_style}
    story = []
    lines = (content or '').split('\n')
    i = 0
    while i < len(lines):
        raw_line = lines[i]
        line = raw_line.strip()

        if not line:
            story.append(Spacer(1, 6))
            i += 1
            continue

        # Nesting depth must be measured on the raw line; `line` is already stripped.
        expanded = raw_line.expandtabs(4)
        indent_level = len(expanded) - len(expanded.lstrip())
        # Non-breaking spaces: ordinary leading spaces are collapsed by Paragraph.
        indent = '&nbsp;' * (4 * (indent_level // 2))
        numbered = re.match(r'^(\d+)\.\s+(.*)$', line)

        # Headers
        if line.startswith('#'):
            level = len(line) - len(line.lstrip('#'))
            header_text = _format_inline_markdown(line.lstrip('# ').strip())
            story.append(Paragraph(header_text, heading_styles.get(level, normal_style)))

        # Code blocks with language detection
        elif line.startswith('```'):
            language = line[3:].strip() or 'text'
            if language == 'text':
                language = _detect_language_from_content(lines, i)
            code_lines = []
            i += 1
            while i < len(lines) and not lines[i].strip().startswith('```'):
                code_lines.append(lines[i])
                i += 1
            if code_lines:
                story.extend(await _code_block_flowables(code_lines, language, code_style, user_id))
            # `i` now sits on the closing fence (or past the end); the shared
            # increment at the bottom of the loop steps over it.

        # Bullet lists (including nested)
        elif line.startswith(('- ', '* ')):
            list_text = _format_inline_markdown(line[2:].strip())
            story.append(Paragraph(f"{indent}• {list_text}", normal_style))

        # Numbered lists (including nested); the original ordinal is kept
        elif numbered:
            list_text = _format_inline_markdown(numbered.group(2))
            story.append(Paragraph(f"{indent}{numbered.group(1)}. {list_text}", normal_style))

        # Blockquotes
        elif line.startswith('> '):
            story.append(Paragraph(_format_inline_markdown(line[2:].strip()), normal_style))

        # Horizontal rules
        elif line.startswith('---') or line.startswith('***'):
            story.append(Spacer(1, 12))
            story.append(Paragraph("_" * 50, normal_style))
            story.append(Spacer(1, 12))

        # Regular paragraphs: gather continuation lines into one paragraph
        else:
            paragraph_lines = [line]
            i += 1
            while i < len(lines):
                next_line = lines[i].strip()
                if not next_line or _is_markdown_block_start(next_line):
                    break
                paragraph_lines.append(next_line)
                i += 1
            story.append(Paragraph(_format_inline_markdown(' '.join(paragraph_lines)), normal_style))
            continue  # `i` already points at the first unconsumed line

        i += 1
    return story
def _detect_language_from_content(lines: list, start_index: int) -> str:
    """
    Guess the language of a fenced code block from its first few lines.

    Args:
        lines: All lines of the markdown document.
        start_index: Index of the opening fence; sampling starts on the next
            line and stops at the closing fence or after nine lines.

    Returns:
        One of ``'java'``, ``'python'``, ``'javascript'``, ``'json'``, ``'xml'``,
        ``'sql'``, ``'yaml'``, ``'bash'``, or ``'text'`` when nothing matches.
    """
    sample_lines = []
    for idx in range(start_index + 1, min(start_index + 10, len(lines))):
        if lines[idx].strip().startswith('```'):
            break
        sample_lines.append(lines[idx])
    sample_text = '\n'.join(sample_lines)

    # Ordered (language, patterns, flags) table; the first entry with any hit wins.
    signatures = (
        # Unambiguous Java markers are tested first: `public class Foo` and
        # `import java.util` would otherwise be claimed by the Python patterns.
        ('java', (r'\bpublic\s+class', r'\bSystem\.out\.print', r'\bimport\s+java\.'), 0),
        # `from` must be followed by `import`, so lowercase SQL
        # ("select x from t") is no longer mistaken for Python.
        ('python', (r'\bdef\s+\w+', r'\bclass\s+\w+', r'\bimport\s+\w+',
                    r'\bfrom\s+[\w.]+\s+import\b'), 0),
        ('javascript', (r'\bfunction\s+\w+', r'\bvar\s+\w+', r'\blet\s+\w+',
                        r'\bconst\s+\w+', r'=>'), 0),
        # Weaker Java hint, kept at its original priority.
        ('java', (r'\bprivate\s+\w+',), 0),
        ('json', (r'^\s*[{}]', r'"[^"]*"\s*:', r'\btrue\b|\bfalse\b|\bnull\b'), 0),
        ('xml', (r'<[^>]+>',), 0),
        ('sql', (r'\bSELECT\b', r'\bFROM\b', r'\bWHERE\b', r'\bINSERT\b'), re.IGNORECASE),
        ('yaml', (r'^\s*\w+:', r'^\s*-\s+'), 0),
        ('bash', (r'^\s*#!', r'\$\w+', r'^\s*\w+.*\|'), 0),
    )
    for language, patterns, flags in signatures:
        if any(re.search(pattern, sample_text, flags) for pattern in patterns):
            return language
    return 'text'
def _format_code_block(code_text: str, language: str) -> str:
    """
    Deprecated: code blocks are now rendered with preformatted flowables to
    avoid paragraph parser errors. Kept for compatibility if referenced elsewhere.

    Args:
        code_text: Raw code text.
        language: Unused; retained so existing call sites keep working.

    Returns:
        ``code_text`` with ``&``, ``<`` and ``>`` replaced by XML entities.
    """
    # '&' is escaped first so the entities produced below are not escaped again.
    return (
        code_text.replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
    )
def _highlight_python(code: str) -> str:
    """
    Run the Python token patterns (strings, comments, keywords, builtins,
    numbers) over ``code`` and return the result.

    NOTE(review): every replacement template is ``\\1`` (the matched text
    itself), so as written the returned string equals the input. Presumably
    markup was meant to wrap each token; confirm before relying on this helper.
    """
    reserved_words = (
        'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try', 'except', 'finally',
        'import', 'from', 'as', 'with', 'return', 'yield', 'lambda', 'and', 'or', 'not',
        'in', 'is', 'True', 'False', 'None', 'pass', 'break', 'continue', 'raise', 'assert',
    )
    builtin_names = (
        'print', 'len', 'str', 'int', 'float', 'list', 'dict', 'tuple', 'set', 'range',
        'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed', 'open', 'input',
    )
    # (pattern, flags) pairs, applied in this order: string literals, then comments.
    literal_passes = (
        (r'("""[\s\S]*?""")', 0),       # triple double quotes
        (r'(".*?")', 0),                # double quotes
        (r"('''[\s\S]*?''')", 0),       # triple single quotes
        (r"('.*?')", 0),                # single quotes
        (r'(#.*?)$', re.MULTILINE),     # comments
    )

    result = code
    for pattern, flags in literal_passes:
        result = re.sub(pattern, r'\1', result, flags=flags)
    # Keywords first, then builtins, one whole-word pass per name.
    for word in reserved_words + builtin_names:
        result = re.sub(r'\b(' + word + r')\b', r'\1', result)
    # Numbers
    result = re.sub(r'\b(\d+\.?\d*)\b', r'\1', result)
    return f"{result}"
def _highlight_json(code: str) -> str:
    """
    Run the JSON token patterns (strings, numbers, literals, punctuation) over
    ``code`` and return the result.

    NOTE(review): every replacement template is ``\\1`` (the matched text
    itself), so as written the returned string equals the input. Presumably
    markup was meant to wrap each token; confirm before relying on this helper.
    """
    token_patterns = (
        r'(".*?")',                  # strings
        r'\b(\d+\.?\d*)\b',          # numbers
        r'\b(true|false|null)\b',    # keywords
        r'([{}[\]])',                # braces and brackets
        r'([,])',                    # commas
    )
    result = code
    for pattern in token_patterns:
        result = re.sub(pattern, r'\1', result)
    return f"{result}"
def _highlight_xml(code: str) -> str:
    """
    Run the XML/HTML token patterns (tags, attributes, attribute values,
    comments) over ``code`` and return the result.

    NOTE(review): every replacement template re-emits exactly what was
    matched, so as written the returned string equals the input. Presumably
    markup was meant to wrap each token; confirm before relying on this helper.
    """
    # (pattern, replacement) pairs in application order.
    substitutions = (
        (r'(<[^>]*>)', r'\1'),              # tags
        (r'(\w+)=', r'\1='),                # attribute names
        (r'="([^"]*)"', r'="\1"'),          # attribute values
        (r'(<!--[\s\S]*?-->)', r'\1'),      # comments
    )
    result = code
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return f"{result}"
def _highlight_java(code: str) -> str:
    """
    Run the Java token patterns (strings, comments, keywords, numbers) over
    ``code`` and return the result.

    NOTE(review): every replacement template is ``\\1`` (the matched text
    itself), so as written the returned string equals the input. Presumably
    markup was meant to wrap each token; confirm before relying on this helper.
    """
    reserved_words = (
        'public', 'private', 'protected', 'static', 'final', 'class', 'interface', 'extends',
        'implements', 'if', 'else', 'for', 'while', 'do', 'switch', 'case', 'break', 'continue',
        'return', 'try', 'catch', 'finally', 'throw', 'throws', 'new', 'this', 'super', 'import',
        'package', 'void', 'int', 'long', 'float', 'double', 'boolean', 'char', 'byte', 'short',
        'true', 'false', 'null',
    )
    # (pattern, flags) pairs: string literals first, then comments.
    literal_passes = (
        (r'(".*?")', 0),
        (r"('.*?')", 0),
        (r'(//.*?)$', re.MULTILINE),
        (r'(/\*[\s\S]*?\*/)', 0),
    )

    result = code
    for pattern, flags in literal_passes:
        result = re.sub(pattern, r'\1', result, flags=flags)
    for word in reserved_words:
        result = re.sub(r'\b(' + word + r')\b', r'\1', result)
    # Numbers, including an optional f/d/l type suffix.
    result = re.sub(r'\b(\d+\.?\d*[fFdDlL]?)\b', r'\1', result)
    return f"{result}"
def _highlight_javascript(code: str) -> str:
    """
    Run the JavaScript token patterns (strings, template literals, comments,
    keywords, numbers) over ``code`` and return the result.

    NOTE(review): every replacement template is ``\\1`` (the matched text
    itself), so as written the returned string equals the input. Presumably
    markup was meant to wrap each token; confirm before relying on this helper.
    """
    reserved_words = (
        'function', 'var', 'let', 'const', 'if', 'else', 'for', 'while', 'do', 'switch',
        'case', 'break', 'continue', 'return', 'try', 'catch', 'finally', 'throw', 'new',
        'this', 'typeof', 'instanceof', 'true', 'false', 'null', 'undefined', 'async', 'await',
    )
    # (pattern, flags) pairs: string literals first, then comments.
    literal_passes = (
        (r'(".*?")', 0),
        (r"('.*?')", 0),
        (r'(`.*?`)', 0),                 # template literals
        (r'(//.*?)$', re.MULTILINE),
        (r'(/\*[\s\S]*?\*/)', 0),
    )

    result = code
    for pattern, flags in literal_passes:
        result = re.sub(pattern, r'\1', result, flags=flags)
    for word in reserved_words:
        result = re.sub(r'\b(' + word + r')\b', r'\1', result)
    # Numbers
    result = re.sub(r'\b(\d+\.?\d*)\b', r'\1', result)
    return f"{result}"
def _highlight_sql(code: str) -> str:
    """
    Run the SQL token patterns (strings, comments, case-insensitive keywords,
    numbers) over ``code`` and return the result.

    NOTE(review): every replacement template is ``\\1`` (the matched text
    itself), so as written the returned string equals the input. Presumably
    markup was meant to wrap each token; confirm before relying on this helper.
    """
    reserved_words = (
        'SELECT', 'FROM', 'WHERE', 'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'ALTER',
        'TABLE', 'INDEX', 'VIEW', 'DATABASE', 'SCHEMA', 'JOIN', 'LEFT', 'RIGHT', 'INNER', 'OUTER',
        'ON', 'GROUP', 'BY', 'ORDER', 'HAVING', 'UNION', 'DISTINCT', 'COUNT', 'SUM', 'AVG', 'MAX', 'MIN',
        'AND', 'OR', 'NOT', 'IN', 'BETWEEN', 'LIKE', 'IS', 'NULL', 'ASC', 'DESC', 'LIMIT', 'OFFSET',
    )
    # (pattern, flags) pairs: string literals first, then comments.
    literal_passes = (
        (r"('.*?')", 0),
        (r'(--.*?)$', re.MULTILINE),
        (r'(/\*[\s\S]*?\*/)', 0),
    )

    result = code
    for pattern, flags in literal_passes:
        result = re.sub(pattern, r'\1', result, flags=flags)
    # Keywords are matched regardless of case.
    for word in reserved_words:
        result = re.sub(r'\b(' + word + r')\b', r'\1', result, flags=re.IGNORECASE)
    # Numbers
    result = re.sub(r'\b(\d+\.?\d*)\b', r'\1', result)
    return f"{result}"
def _highlight_yaml(code: str) -> str:
    """
    Run the YAML token patterns (keys, strings, numbers, booleans, comments)
    over ``code`` and return the result.

    NOTE(review): every replacement template re-emits exactly what was
    matched, so as written the returned string equals the input. Presumably
    markup was meant to wrap each token; confirm before relying on this helper.
    """
    # (pattern, replacement, flags) triples in application order.
    substitutions = (
        (r'^(\s*)([^:]+):', r'\1\2:', re.MULTILINE),               # keys
        (r'(".*?")', r'\1', 0),                                    # double-quoted values
        (r"('.*?')", r'\1', 0),                                    # single-quoted values
        (r'\b(\d+\.?\d*)\b', r'\1', 0),                            # numbers
        (r'\b(true|false|yes|no|on|off)\b', r'\1', 0),             # booleans
        (r'(#.*?)$', r'\1', re.MULTILINE),                         # comments
    )
    result = code
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)
    return f"{result}"
def _highlight_bash(code: str) -> str:
    """
    Run the Bash/shell token patterns (comments, leading command word,
    variables, strings, redirections and pipes) over ``code`` and return the
    result.

    NOTE(review): every replacement template re-emits exactly what was
    matched, so as written the returned string equals the input. Presumably
    markup was meant to wrap each token; confirm before relying on this helper.
    """
    # (pattern, replacement, flags) triples in application order.
    substitutions = (
        (r'(#.*?)$', r'\1', re.MULTILINE),                              # comments
        (r'^(\s*)([a-zA-Z_][a-zA-Z0-9_]*)', r'\1\2', re.MULTILINE),     # first word on a line
        (r'(\$[a-zA-Z_][a-zA-Z0-9_]*)', r'\1', 0),                      # $VAR
        (r'(\$\{[^}]+\})', r'\1', 0),                                   # ${VAR}
        (r'(".*?")', r'\1', 0),                                         # double-quoted strings
        (r"('.*?')", r'\1', 0),                                         # single-quoted strings
        (r'([<>|&])', r'\1', 0),                                        # redirections and pipes
    )
    result = code
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)
    return f"{result}"
def _format_inline_markdown(text: str) -> str:
    """
    Convert inline markdown to ReportLab paragraph markup.

    The text is XML-escaped first, then processed in precedence order:
    inline code, links, bold, italic, strikethrough, line breaks. Code spans
    and link URLs are swapped for placeholders while emphasis is processed, so
    ``*`` and ``_`` inside them are never treated as formatting.

    Args:
        text: One line or paragraph of markdown.

    Returns:
        Markup using ``<b>``, ``<i>``, ``<strike>``, ``<font>``, ``<link>`` and
        ``<br/>`` tags; an empty string for empty or ``None`` input.
    """
    if not text:
        return ''

    # NUL delimits the placeholders below, so remove any that arrive in the input.
    text = text.replace('\x00', '')
    # Escape XML-significant characters; '&' first so entities are not re-escaped.
    text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    held = []

    def _hold(fragment: str) -> str:
        """Park a finished fragment and return the placeholder that stands in for it."""
        held.append(fragment)
        return f'\x00{len(held) - 1}\x00'

    # 1. Inline code (`code`): highest precedence, no nested formatting.
    text = re.sub(
        r'`([^`]+)`',
        lambda m: _hold(f'<font face="Courier">{m.group(1)}</font>'),
        text,
    )

    # 2. Links [text](url): only the URL is parked, so the label still gets
    #    bold/italic formatting from the passes below.
    def _link(match):
        href = match.group(2).strip().replace('"', '&quot;')
        return f'<link href="{_hold(href)}" color="blue">{match.group(1)}</link>'

    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', _link, text)

    # 3. Bold (**text** or __text__). Delimiters must hug the text, which keeps
    #    arithmetic such as "2 * 3 * 4" and identifiers such as snake_case intact.
    text = re.sub(r'(?<!\*)\*\*(?!\s)([^*]+?)(?<!\s)\*\*(?!\*)', r'<b>\1</b>', text)
    text = re.sub(r'(?<!\w)__(?!\s)([^_]+?)(?<!\s)__(?!\w)', r'<b>\1</b>', text)

    # 4. Italic (*text* or _text_).
    text = re.sub(r'(?<!\*)\*(?!\s)([^*]+?)(?<!\s)\*(?!\*)', r'<i>\1</i>', text)
    text = re.sub(r'(?<!\w)_(?!\s)([^_]+?)(?<!\s)_(?!\w)', r'<i>\1</i>', text)

    # 5. Strikethrough (~~text~~).
    text = re.sub(r'~~([^~]+)~~', r'<strike>\1</strike>', text)

    # 6. Line breaks.
    text = text.replace('\n', '<br/>')

    # Restore parked fragments. Two passes, because a parked URL may itself
    # contain the placeholder of a code span that was parked earlier.
    for _ in range(2):
        text = re.sub(r'\x00(\d+)\x00', lambda m: held[int(m.group(1))], text)
    return text
# Colour per token kind, used in <font color="..."> markup.
_HIGHLIGHT_COLORS = {
    'comment': '#6a737d',   # gray
    'string': '#22863a',    # green
    'keyword': '#6f42c1',   # purple
    'builtin': '#005cc5',   # blue
    'number': '#e36209',    # orange
}

# Fence labels that share another language's rules.
_HIGHLIGHT_ALIASES = {
    'py': 'python',
    'js': 'javascript',
    'typescript': 'javascript',
    'ts': 'javascript',
    'sh': 'bash',
    'shell': 'bash',
    'yml': 'yaml',
    'html': 'xml',
}

_HIGHLIGHT_LANGUAGES = (
    'python', 'javascript', 'json', 'bash', 'yaml', 'sql', 'java', 'css', 'xml',
)

# Compiled (regex, kinds) pairs keyed by canonical language name.
_HIGHLIGHTER_CACHE = {}


def _syntax_rules(lang: str) -> list:
    """Return the ordered (token kind, regex) rules for a canonical language name."""
    strings = [
        # Quote entities are covered too, in case the caller escaped quotes.
        ('string', r'&quot;.*?&quot;'),
        ('string', r'&#39;.*?&#39;'),
        ('string', r'"(?:\\.|[^"\\\n])*"'),
        ('string', r"'(?:\\.|[^'\\\n])*'"),
        ('string', r'`[^`\n]*`'),
    ]
    # The lookbehind keeps the digits of numeric entities (&#39;) from matching.
    numbers = [('number', r'(?<!&#)\b\d+\.?\d*\b')]
    # '#' that is part of an entity (&#39;) or of "$#" is not a comment.
    hash_comment = [('comment', r'(?<![&$])#[^\n]*')]
    c_comments = [('comment', r'//[^\n]*'), ('comment', r'/\*[\s\S]*?\*/')]

    def words(kind, alternatives, ignore_case=False):
        body = rf'\b(?:{alternatives})\b'
        return (kind, f'(?i:{body})' if ignore_case else body)

    if lang == 'python':
        specific = hash_comment + [
            ('string', r'"""[\s\S]*?"""'),
            ('string', r"'''[\s\S]*?'''"),
            words('keyword', 'def|class|if|else|elif|for|while|try|except|finally|import|from|as|with|return|yield|lambda|and|or|not|in|is|True|False|None|pass|break|continue|raise|assert|global|nonlocal'),
            words('builtin', 'print|len|str|int|float|list|dict|tuple|set|range|enumerate|zip|map|filter|sorted|reversed|open|input|type|isinstance|hasattr|getattr|setattr|delattr'),
        ]
    elif lang == 'javascript':
        specific = c_comments + [
            words('keyword', 'function|var|let|const|if|else|for|while|do|switch|case|break|continue|return|try|catch|finally|throw|new|this|typeof|instanceof|true|false|null|undefined|async|await|class|extends|implements|interface|type|namespace|module|export|import|default|public|private|protected|static|abstract|readonly'),
            words('builtin', 'console|window|document|Array|Object|String|Number|Boolean|Date|Math|JSON|Promise|Set|Map|WeakSet|WeakMap|Symbol|Proxy|Reflect'),
        ]
    elif lang == 'java':
        specific = c_comments + [
            words('keyword', 'public|private|protected|static|final|class|interface|extends|implements|if|else|for|while|do|switch|case|break|continue|return|try|catch|finally|throw|throws|new|this|super|import|package|void|int|long|float|double|boolean|char|byte|short|true|false|null|abstract|native|synchronized|volatile|transient|strictfp'),
            words('builtin', 'String|Object|Integer|Long|Float|Double|Boolean|Character|Byte|Short|System|Math|ArrayList|HashMap|HashSet|LinkedList|Vector|Collections|Arrays'),
        ]
    elif lang == 'json':
        specific = [
            # A string directly followed by ':' is a key.
            ('keyword', r'"(?:\\.|[^"\\\n])*"(?=\s*:)'),
            ('keyword', r'&quot;[^&\n]*?&quot;(?=\s*:)'),
            words('builtin', 'true|false|null'),
        ]
    elif lang == 'bash':
        specific = hash_comment + [
            ('builtin', r'\$\{[^}\n]+\}'),
            ('builtin', r'\$[a-zA-Z_][a-zA-Z0-9_]*'),
            # First word on a line is the command.
            ('keyword', r'^[ \t]*[a-zA-Z_][a-zA-Z0-9_-]*'),
        ]
    elif lang == 'yaml':
        specific = [
            # Everything before the first ':' on a line is the key.
            ('keyword', r'^[ \t]*[^:\n]+(?=:)'),
            words('builtin', 'true|false|yes|no|on|off', ignore_case=True),
        ]
    elif lang == 'sql':
        specific = [
            words('keyword', 'SELECT|FROM|WHERE|INSERT|UPDATE|DELETE|CREATE|DROP|ALTER|TABLE|INDEX|VIEW|DATABASE|SCHEMA|JOIN|LEFT|RIGHT|INNER|OUTER|ON|GROUP|BY|ORDER|HAVING|UNION|DISTINCT|COUNT|SUM|AVG|MAX|MIN|AND|OR|NOT|IN|BETWEEN|LIKE|IS|NULL|ASC|DESC|LIMIT|OFFSET|CASE|WHEN|THEN|ELSE|END|EXISTS|ALL|ANY|SOME', ignore_case=True),
        ]
    elif lang == 'css':
        specific = [
            ('keyword', r'[.#]?[a-zA-Z][a-zA-Z0-9_-]*(?=\s*\{)'),   # selectors
            ('builtin', r'[a-zA-Z-]+(?=\s*:)'),                     # properties
            ('string', r'(?<=:)[ \t]*[^;{}\n]+(?=;)'),              # values
        ]
    elif lang == 'xml':
        # The input is XML-escaped, so tag delimiters arrive as &lt; and &gt;.
        specific = [
            ('comment', r'&lt;!--[\s\S]*?--&gt;'),
            ('keyword', r'&lt;[/?!]?[\w:.-]+'),
            ('keyword', r'[/?]?&gt;'),
            ('builtin', r'[\w:.-]+(?==)'),                          # attribute names
        ]
    else:
        specific = []
    return specific + strings + numbers


def _compiled_highlighter(lang: str):
    """Return the cached (compiled regex, token kinds) pair for a language."""
    cached = _HIGHLIGHTER_CACHE.get(lang)
    if cached is None:
        rules = _syntax_rules(lang)
        # One named group per rule; rule patterns contain no capturing groups
        # of their own, so `lastgroup` always identifies the rule that matched.
        pattern = '|'.join(f'(?P<t{n}>{rx})' for n, (_, rx) in enumerate(rules))
        cached = (re.compile(pattern, re.MULTILINE), [kind for kind, _ in rules])
        _HIGHLIGHTER_CACHE[lang] = cached
    return cached


def _apply_syntax_highlight(escaped_code: str, language: str) -> str:
    """
    Apply IDE-like syntax colouring to XML-escaped code.

    The text is tokenised in a single left-to-right pass, so each character
    belongs to at most one token: keywords or numbers inside a comment or a
    string are not coloured again, and no markup is inserted inside markup.

    Args:
        escaped_code: Code whose ``&``, ``<`` and ``>`` are already XML entities.
        language: Fence language label (case-insensitive). Unknown or empty
            labels get only string and number colouring.

    Returns:
        The code with tokens wrapped in ``<font color="...">`` tags; an empty
        string for empty or ``None`` input.
    """
    if not escaped_code:
        return ''

    lang = (language or 'text').lower()
    lang = _HIGHLIGHT_ALIASES.get(lang, lang)
    if lang not in _HIGHLIGHT_LANGUAGES:
        lang = 'text'   # one shared cache entry for every unknown label
    regex, kinds = _compiled_highlighter(lang)

    def _paint(match):
        token = match.group(0)
        # Line-anchored rules (YAML keys, shell commands, CSS values) include
        # leading whitespace in the match; keep it outside the coloured span.
        body = token.lstrip(' \t')
        if not body:
            return token
        lead = token[:len(token) - len(body)]
        color = _HIGHLIGHT_COLORS[kinds[int(match.lastgroup[1:])]]
        return f'{lead}<font color="{color}">{body}</font>'

    return regex.sub(_paint, escaped_code)
# Keywords that may open a Mermaid diagram. Mermaid spells the git diagram
# "gitGraph"; the lowercase form is kept for backward compatibility.
_MERMAID_DIAGRAM_PREFIXES = (
    'graph', 'flowchart', 'sequenceDiagram', 'classDiagram', 'stateDiagram',
    'erDiagram', 'journey', 'gantt', 'pie', 'gitgraph', 'gitGraph',
    'mindmap', 'timeline', 'requirementDiagram',
)


def _render_mermaid_png(mermaid_text: str) -> bytes:
    """
    Render Mermaid code to PNG via the Kroki service (no local mermaid-cli dependency).

    Args:
        mermaid_text: Mermaid diagram source.

    Returns:
        PNG bytes, or ``b""`` when the source is empty, does not start with a
        known diagram keyword, or the request fails. Failures are logged as
        warnings and never raised.
    """
    import json
    import urllib.error
    import urllib.request

    try:
        cleaned_text = mermaid_text.strip() if mermaid_text else ""
        if not cleaned_text:
            logger.warning("[PDF] Empty mermaid content")
            return b""

        # Validate against the first real statement: "%%" comments and
        # "%%{init: ...}%%" directives may precede the diagram keyword.
        first_statement = next(
            (
                candidate.strip()
                for candidate in cleaned_text.split('\n')
                if candidate.strip() and not candidate.strip().startswith('%%')
            ),
            "",
        )
        if not first_statement.startswith(_MERMAID_DIAGRAM_PREFIXES):
            logger.warning(f"[PDF] Invalid mermaid diagram type: {cleaned_text[:50]}...")
            return b""

        # Kroki POST API for mermaid -> png
        data = json.dumps({"diagram_source": cleaned_text}).encode("utf-8")
        req = urllib.request.Request(
            url="https://kroki.io/mermaid/png",
            data=data,
            headers={"Content-Type": "application/json"},
            method="POST"
        )
        with urllib.request.urlopen(req, timeout=15) as resp:
            if resp.status == 200:
                return resp.read()
            logger.warning(f"[PDF] Kroki returned status {resp.status}")
            return b""
    except urllib.error.HTTPError as e:
        # Must be handled before URLError: HTTPError is a subclass of it.
        if e.code == 400:
            logger.warning(f"[PDF] Kroki mermaid syntax error (400): {e.reason}")
        else:
            logger.warning(f"[PDF] Kroki HTTP error {e.code}: {e.reason}")
    except urllib.error.URLError as e:
        logger.warning(f"[PDF] Kroki connection error: {e.reason}")
    except Exception as e:
        logger.warning(f"[PDF] Kroki mermaid render error: {e}")
    return b""
def _basic_ieee_references(sources: List[Dict]) -> List[str]:
    """Build simple IEEE-style reference strings locally, without calling a model."""
    references = []
    for i, source in enumerate(sources, 1):
        if source.get("kind") == "web":
            ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}."
        else:
            ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}."
        references.append(ref)
    return references


async def _format_references_ieee(sources: List[Dict]) -> List[str]:
    """
    Format sources in IEEE citation style using the NVIDIA API.

    Falls back to locally built references when the NVIDIA rotator is
    unavailable, the model returns no usable lines, or the call fails.

    Args:
        sources: Source dicts; the keys ``filename``, ``url``, ``topic_name``
            and ``kind`` are read, each with a default when missing.

    Returns:
        One formatted reference string per line kept, or an empty list when
        ``sources`` is empty or ``None``.
    """
    if not sources:
        return []

    try:
        from utils.api.router import generate_answer_with_model
        from helpers.setup import nvidia_rotator

        if not nvidia_rotator:
            # Without a model, still list the sources rather than returning nothing.
            return _basic_ieee_references(sources)

        # Prepare source data for formatting
        source_data = [
            {
                "number": i,
                "filename": source.get("filename", "Unknown"),
                "url": source.get("url", ""),
                "topic_name": source.get("topic_name", ""),
                "kind": source.get("kind", "document"),
            }
            for i, source in enumerate(sources, 1)
        ]

        sys_prompt = (
            "You are an expert at formatting academic references in IEEE style.\n"
            "Format the provided sources as IEEE-style references. Each reference should be numbered and formatted according to IEEE standards.\n"
            "For web sources: [1] Author/Organization, \"Title,\" Website Name, URL, accessed: Date.\n"
            "For documents: [1] Author, \"Title,\" Document Type, Filename, Year.\n"
            "Return only the formatted references, one per line, numbered sequentially."
        )
        user_prompt = f"Format these sources in IEEE style:\n\n{source_data}"

        selection = {"provider": "nvidia", "model": os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")}
        response = await generate_answer_with_model(selection, sys_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id="system", context="pdf_citation")

        # Keep only lines that look like numbered references ("[1] ...").
        references = [
            candidate.strip()
            for candidate in (response or "").split('\n')
            if candidate.strip().startswith('[')
        ]
        return references or _basic_ieee_references(sources)
    except Exception as e:
        # Best effort: reference formatting must never fail the whole PDF.
        logger.warning(f"[PDF] IEEE reference formatting failed: {e}")
        return _basic_ieee_references(sources)
async def generate_report_pdf(report_content: str, user_id: str, project_id: str, sources: List[Dict] = None) -> bytes:
    """
    Generate a PDF from report content using reportlab

    Args:
        report_content: Markdown content of the report
        user_id: User ID for logging
        project_id: Project ID for logging
        sources: Optional source dicts; when given, a "References" page is
            appended after the report body

    Returns:
        PDF content as bytes

    Raises:
        HTTPException: If PDF generation fails (status 500, generic detail)
    """
    try:
        from io import BytesIO
        from xml.sax.saxutils import escape

        from reportlab.lib import colors
        from reportlab.lib.pagesizes import A4
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak

        logger.info(f"[PDF] Generating PDF for user {user_id}, project {project_id}")

        styles = getSampleStyleSheet()
        dark_text = colors.HexColor('#2c3e50')

        def _heading_style(name, parent, font_size, space_after, space_before):
            """Create one of the three report heading styles."""
            return ParagraphStyle(
                name,
                parent=styles[parent],
                fontSize=font_size,
                spaceAfter=space_after,
                spaceBefore=space_before,
                textColor=dark_text,
            )

        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=24,
            spaceAfter=30,
            textColor=dark_text,
            borderWidth=1,
            borderColor=colors.HexColor('#3498db'),
            borderPadding=10
        )
        heading1_style = _heading_style('CustomHeading1', 'Heading1', 18, 12, 20)
        heading2_style = _heading_style('CustomHeading2', 'Heading2', 16, 10, 16)
        heading3_style = _heading_style('CustomHeading3', 'Heading3', 14, 8, 12)
        normal_style = ParagraphStyle(
            'CustomNormal',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=6,
            leading=14
        )
        # Professional IDE-like code styling with no background
        base_code_parent = styles['Code'] if 'Code' in styles.byName else styles['Normal']
        code_style = ParagraphStyle(
            'Code',
            parent=base_code_parent,
            fontSize=9,
            fontName='Courier',
            textColor=dark_text,  # Dark text on white background
            backColor=None,  # No background color
            borderColor=colors.HexColor('#e1e8ed'),
            borderWidth=1,
            borderPadding=8,
            leftIndent=12,
            rightIndent=12,
            spaceBefore=6,
            spaceAfter=6,
            leading=11
        )

        # Title block
        story = [
            Paragraph("StudyBuddy Report", title_style),
            Paragraph(f"Generated on {datetime.now().strftime('%B %d, %Y at %I:%M %p')}", normal_style),
            Spacer(1, 20),
        ]

        # Report body
        story.extend(await _parse_markdown_content(report_content, heading1_style, heading2_style, heading3_style, normal_style, code_style))

        # References section, only when sources were provided
        if sources:
            story.append(PageBreak())
            story.append(Paragraph("References", heading1_style))
            story.append(Spacer(1, 12))

            # Format references in IEEE style using NVIDIA API
            try:
                ieee_references = await _format_references_ieee(sources)
            except Exception as _ie:
                logger.warning(f"[PDF] Reference formatting failed, falling back: {_ie}")
                ieee_references = []
                for i, source in enumerate(sources, 1):
                    if source.get("kind") == "web":
                        ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}."
                    else:
                        ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}."
                    ieee_references.append(ref)

            for ref in ieee_references:
                # References are model output or raw source metadata: escape
                # them so '&' in URLs or stray '<' cannot break Paragraph's
                # XML parser and fail the whole document.
                story.append(Paragraph(escape(ref), normal_style))
                story.append(Spacer(1, 6))

        # Build the PDF into an in-memory buffer that is always released.
        with BytesIO() as buffer:
            doc = SimpleDocTemplate(
                buffer,
                pagesize=A4,
                rightMargin=72,
                leftMargin=72,
                topMargin=72,
                bottomMargin=18
            )
            doc.build(story)
            pdf_content = buffer.getvalue()

        logger.info(f"[PDF] Successfully generated PDF ({len(pdf_content)} bytes) for user {user_id}, project {project_id}")
        return pdf_content

    except ImportError:
        logger.error("[PDF] reportlab not installed. Install with: pip install reportlab")
        raise HTTPException(500, detail="PDF generation not available. Please install reportlab.")
    except Exception as e:
        logger.error(f"[PDF] Failed to generate PDF: {e}")
        # Keep error generic for client; avoid leaking internals
        raise HTTPException(500, detail="Failed to generate PDF")