""" PDF generation utilities for StudyBuddy """ import os import tempfile import markdown import re from datetime import datetime from typing import List, Dict from fastapi import HTTPException from utils.logger import get_logger from helpers.setup import gemini_rotator, nvidia_rotator logger = get_logger("PDF", __name__) async def _parse_markdown_content(content: str, heading1_style, heading2_style, heading3_style, normal_style, code_style): """ Enhanced markdown parser that properly handles bold/italic formatting """ from reportlab.platypus import Paragraph, Spacer from reportlab.lib.units import inch story = [] lines = content.split('\n') i = 0 while i < len(lines): line = lines[i].strip() if not line: story.append(Spacer(1, 6)) i += 1 continue # Headers if line.startswith('#'): level = len(line) - len(line.lstrip('#')) header_text = line.lstrip('# ').strip() header_text = _format_inline_markdown(header_text) if level == 1: story.append(Paragraph(header_text, heading1_style)) elif level == 2: story.append(Paragraph(header_text, heading2_style)) elif level == 3: story.append(Paragraph(header_text, heading3_style)) else: story.append(Paragraph(header_text, normal_style)) # Code blocks with language detection elif line.startswith('```'): # Extract language if specified language = line[3:].strip() if len(line) > 3 else 'text' # Auto-detect language if not specified if language == 'text': language = _detect_language_from_content(lines, i) code_lines = [] i += 1 while i < len(lines) and not lines[i].strip().startswith('```'): code_lines.append(lines[i]) i += 1 if code_lines: # Mermaid diagrams → render via Kroki PNG for PDF with retry logic if language.lower() == 'mermaid': try: from reportlab.platypus import Image, Spacer mermaid_code = '\n'.join(code_lines) # Use retry logic from diagram.py from helpers.diagram import _render_mermaid_with_retry img_bytes = await _render_mermaid_with_retry(mermaid_code, user_id=user_id) if img_bytes and len(img_bytes) > 0: import io img = Image(io.BytesIO(img_bytes)) # Fit within page width (~6 inches after margins) max_width = 6.0 * inch if img.drawWidth > max_width: scale = max_width / float(img.drawWidth) img.drawWidth = max_width img.drawHeight = img.drawHeight * scale story.append(img) story.append(Spacer(1, 12)) i += 1 continue else: logger.warning("[PDF] Mermaid render returned empty image after retries, falling back to code block") except Exception as me: logger.warning(f"[PDF] Mermaid render failed after retries, falling back to code block: {me}") # Fallback: render as code block with mermaid syntax from reportlab.platypus import XPreformatted, Paragraph raw_code = '\n'.join(code_lines) raw_code = raw_code.replace('\t', ' ') raw_code = raw_code.replace('\r\n', '\n').replace('\r', '\n') raw_code = re.sub(r'[^\x09\x0A\x20-\x7E]', '', raw_code) escaped = raw_code.replace('&', '&').replace('<', '<').replace('>', '>') lang_header = f"[MERMAID DIAGRAM]" story.append(Paragraph(lang_header, code_style)) story.append(XPreformatted(escaped, code_style)) i += 1 continue from reportlab.platypus import XPreformatted, Paragraph # Join and sanitize code content: expand tabs, remove control chars that render as squares raw_code = '\n'.join(code_lines) raw_code = raw_code.replace('\t', ' ') raw_code = raw_code.replace('\r\n', '\n').replace('\r', '\n') # Strip non-printable except tab/newline raw_code = re.sub(r'[^\x09\x0A\x20-\x7E]', '', raw_code) # Escape for XML and apply lightweight syntax highlighting escaped = raw_code.replace('&', '&').replace('<', '<').replace('>', '>') highlighted = _apply_syntax_highlight(escaped, language) # Add a small language header, then render highlighted code with XPreformatted to preserve spacing lang_header = f"[{language.upper()}]" story.append(Paragraph(lang_header, code_style)) story.append(XPreformatted(highlighted, code_style)) # Lists (including nested) elif line.startswith('- ') or line.startswith('* '): # Count indentation level indent_level = len(line) - len(line.lstrip()) list_text = line[2:].strip() list_text = _format_inline_markdown(list_text) # Add indentation based on level indent = " " * (indent_level // 2) if indent_level > 0 else "" story.append(Paragraph(f"{indent}• {list_text}", normal_style)) # Numbered lists (including nested) elif re.match(r'^\d+\.\s', line): # Count indentation level indent_level = len(line) - len(line.lstrip()) list_text = re.sub(r'^\d+\.\s', '', line) list_text = _format_inline_markdown(list_text) # Add indentation based on level indent = " " * (indent_level // 2) if indent_level > 0 else "" story.append(Paragraph(f"{indent}• {list_text}", normal_style)) # Blockquotes elif line.startswith('> '): quote_text = line[2:].strip() quote_text = _format_inline_markdown(quote_text) story.append(Paragraph(f"{quote_text}", normal_style)) # Horizontal rules elif line.startswith('---') or line.startswith('***'): story.append(Spacer(1, 12)) story.append(Paragraph("_" * 50, normal_style)) story.append(Spacer(1, 12)) # Regular paragraphs - collect multi-line paragraphs else: paragraph_lines = [line] i += 1 # Collect continuation lines until we hit a blank line or another block type while i < len(lines): next_line = lines[i].strip() # Stop if we hit a blank line if not next_line: break # Stop if we hit a new block type if (next_line.startswith('#') or next_line.startswith('```') or next_line.startswith('- ') or next_line.startswith('* ') or re.match(r'^\d+\.\s', next_line) or next_line.startswith('> ') or next_line.startswith('---') or next_line.startswith('***')): break paragraph_lines.append(next_line) i += 1 # Process the complete paragraph paragraph_text = ' '.join(paragraph_lines) formatted_text = _format_inline_markdown(paragraph_text) story.append(Paragraph(formatted_text, normal_style)) continue # Don't increment i again since we already did it in the loop i += 1 return story def _detect_language_from_content(lines: list, start_index: int) -> str: """ Auto-detect programming language from code content """ # Look at the next few lines to detect language sample_lines = [] for i in range(start_index + 1, min(start_index + 10, len(lines))): if lines[i].strip().startswith('```'): break sample_lines.append(lines[i]) sample_text = '\n'.join(sample_lines) # Python detection if (re.search(r'\bdef\s+\w+', sample_text) or re.search(r'\bclass\s+\w+', sample_text) or re.search(r'\bimport\s+\w+', sample_text) or re.search(r'\bfrom\s+\w+', sample_text)): return 'python' # JavaScript detection if (re.search(r'\bfunction\s+\w+', sample_text) or re.search(r'\bvar\s+\w+', sample_text) or re.search(r'\blet\s+\w+', sample_text) or re.search(r'\bconst\s+\w+', sample_text) or re.search(r'=>', sample_text)): return 'javascript' # Java detection if (re.search(r'\bpublic\s+class', sample_text) or re.search(r'\bprivate\s+\w+', sample_text) or re.search(r'\bSystem\.out\.print', sample_text) or re.search(r'\bimport\s+java\.', sample_text)): return 'java' # JSON detection if (re.search(r'^\s*[{}]', sample_text) or re.search(r'"[^"]*"\s*:', sample_text) or re.search(r'\btrue\b|\bfalse\b|\bnull\b', sample_text)): return 'json' # XML/HTML detection if (re.search(r'<[^>]+>', sample_text) or re.search(r'<[^>]+>', sample_text)): return 'xml' # SQL detection if (re.search(r'\bSELECT\b', sample_text, re.IGNORECASE) or re.search(r'\bFROM\b', sample_text, re.IGNORECASE) or re.search(r'\bWHERE\b', sample_text, re.IGNORECASE) or re.search(r'\bINSERT\b', sample_text, re.IGNORECASE)): return 'sql' # YAML detection if (re.search(r'^\s*\w+:', sample_text) or re.search(r'^\s*-\s+', sample_text)): return 'yaml' # Bash detection if (re.search(r'^\s*#!', sample_text) or re.search(r'\$\w+', sample_text) or re.search(r'^\s*\w+.*\|', sample_text)): return 'bash' return 'text' def _format_code_block(code_text: str, language: str) -> str: """ Deprecated: We now render code blocks with Preformatted to avoid paragraph parser errors. Kept for compatibility if referenced elsewhere; returns escaped plain text. """ code_text = code_text.replace('&', '&').replace('<', '<').replace('>', '>') return f"{code_text}" def _highlight_python(code: str) -> str: """Python syntax highlighting""" # Keywords keywords = ['def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try', 'except', 'finally', 'import', 'from', 'as', 'with', 'return', 'yield', 'lambda', 'and', 'or', 'not', 'in', 'is', 'True', 'False', 'None', 'pass', 'break', 'continue', 'raise', 'assert'] # Built-in functions builtins = ['print', 'len', 'str', 'int', 'float', 'list', 'dict', 'tuple', 'set', 'range', 'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed', 'open', 'input'] # String literals code = re.sub(r'("""[\s\S]*?""")', r'\1', code) # Triple quotes code = re.sub(r'(".*?")', r'\1', code) # Double quotes code = re.sub(r"('''[\s\S]*?''')", r'\1', code) # Triple single quotes code = re.sub(r"('.*?')", r'\1', code) # Single quotes # Comments code = re.sub(r'(#.*?)$', r'\1', code, flags=re.MULTILINE) # Keywords for keyword in keywords: code = re.sub(r'\b(' + keyword + r')\b', r'\1', code) # Built-in functions for builtin in builtins: code = re.sub(r'\b(' + builtin + r')\b', r'\1', code) # Numbers code = re.sub(r'\b(\d+\.?\d*)\b', r'\1', code) return f"{code}" def _highlight_json(code: str) -> str: """JSON syntax highlighting""" # Strings code = re.sub(r'(".*?")', r'\1', code) # Numbers code = re.sub(r'\b(\d+\.?\d*)\b', r'\1', code) # Keywords code = re.sub(r'\b(true|false|null)\b', r'\1', code) # Punctuation code = re.sub(r'([{}[\]])', r'\1', code) code = re.sub(r'([,])', r'\1', code) return f"{code}" def _highlight_xml(code: str) -> str: """XML/HTML syntax highlighting""" # Tags code = re.sub(r'(<[^>]*>)', r'\1', code) # Attributes code = re.sub(r'(\w+)=', r'\1=', code) # Attribute values code = re.sub(r'="([^"]*)"', r'="\1"', code) # Comments code = re.sub(r'(<!--[\s\S]*?-->)', r'\1', code) return f"{code}" def _highlight_java(code: str) -> str: """Java syntax highlighting""" # Keywords keywords = ['public', 'private', 'protected', 'static', 'final', 'class', 'interface', 'extends', 'implements', 'if', 'else', 'for', 'while', 'do', 'switch', 'case', 'break', 'continue', 'return', 'try', 'catch', 'finally', 'throw', 'throws', 'new', 'this', 'super', 'import', 'package', 'void', 'int', 'long', 'float', 'double', 'boolean', 'char', 'byte', 'short', 'true', 'false', 'null'] # String literals code = re.sub(r'(".*?")', r'\1', code) code = re.sub(r"('.*?')", r'\1', code) # Comments code = re.sub(r'(//.*?)$', r'\1', code, flags=re.MULTILINE) code = re.sub(r'(/\*[\s\S]*?\*/)', r'\1', code) # Keywords for keyword in keywords: code = re.sub(r'\b(' + keyword + r')\b', r'\1', code) # Numbers code = re.sub(r'\b(\d+\.?\d*[fFdDlL]?)\b', r'\1', code) return f"{code}" def _highlight_javascript(code: str) -> str: """JavaScript syntax highlighting""" # Keywords keywords = ['function', 'var', 'let', 'const', 'if', 'else', 'for', 'while', 'do', 'switch', 'case', 'break', 'continue', 'return', 'try', 'catch', 'finally', 'throw', 'new', 'this', 'typeof', 'instanceof', 'true', 'false', 'null', 'undefined', 'async', 'await'] # String literals code = re.sub(r'(".*?")', r'\1', code) code = re.sub(r"('.*?')", r'\1', code) code = re.sub(r'(`.*?`)', r'\1', code) # Template literals # Comments code = re.sub(r'(//.*?)$', r'\1', code, flags=re.MULTILINE) code = re.sub(r'(/\*[\s\S]*?\*/)', r'\1', code) # Keywords for keyword in keywords: code = re.sub(r'\b(' + keyword + r')\b', r'\1', code) # Numbers code = re.sub(r'\b(\d+\.?\d*)\b', r'\1', code) return f"{code}" def _highlight_sql(code: str) -> str: """SQL syntax highlighting""" # Keywords keywords = ['SELECT', 'FROM', 'WHERE', 'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'ALTER', 'TABLE', 'INDEX', 'VIEW', 'DATABASE', 'SCHEMA', 'JOIN', 'LEFT', 'RIGHT', 'INNER', 'OUTER', 'ON', 'GROUP', 'BY', 'ORDER', 'HAVING', 'UNION', 'DISTINCT', 'COUNT', 'SUM', 'AVG', 'MAX', 'MIN', 'AND', 'OR', 'NOT', 'IN', 'BETWEEN', 'LIKE', 'IS', 'NULL', 'ASC', 'DESC', 'LIMIT', 'OFFSET'] # String literals code = re.sub(r"('.*?')", r'\1', code) # Comments code = re.sub(r'(--.*?)$', r'\1', code, flags=re.MULTILINE) code = re.sub(r'(/\*[\s\S]*?\*/)', r'\1', code) # Keywords (case insensitive) for keyword in keywords: code = re.sub(r'\b(' + keyword + r')\b', r'\1', code, flags=re.IGNORECASE) # Numbers code = re.sub(r'\b(\d+\.?\d*)\b', r'\1', code) return f"{code}" def _highlight_yaml(code: str) -> str: """YAML syntax highlighting""" # Keys code = re.sub(r'^(\s*)([^:]+):', r'\1\2:', code, flags=re.MULTILINE) # String values code = re.sub(r'(".*?")', r'\1', code) code = re.sub(r"('.*?')", r'\1', code) # Numbers code = re.sub(r'\b(\d+\.?\d*)\b', r'\1', code) # Booleans code = re.sub(r'\b(true|false|yes|no|on|off)\b', r'\1', code) # Comments code = re.sub(r'(#.*?)$', r'\1', code, flags=re.MULTILINE) return f"{code}" def _highlight_bash(code: str) -> str: """Bash/Shell syntax highlighting""" # Comments code = re.sub(r'(#.*?)$', r'\1', code, flags=re.MULTILINE) # Commands (first word on line) code = re.sub(r'^(\s*)([a-zA-Z_][a-zA-Z0-9_]*)', r'\1\2', code, flags=re.MULTILINE) # Variables code = re.sub(r'(\$[a-zA-Z_][a-zA-Z0-9_]*)', r'\1', code) code = re.sub(r'(\$\{[^}]+\})', r'\1', code) # Strings code = re.sub(r'(".*?")', r'\1', code) code = re.sub(r"('.*?')", r'\1', code) # Redirections and pipes code = re.sub(r'([<>|&])', r'\1', code) return f"{code}" def _format_inline_markdown(text: str) -> str: """ Format inline markdown elements (bold, italic, code, links) """ # Escape HTML characters first text = text.replace('&', '&') text = text.replace('<', '<') text = text.replace('>', '>') # Process in order of precedence to avoid nested tag conflicts # 1. Inline code (`code`) - highest precedence, no nested formatting text = re.sub(r'`([^`]+)`', r'\1', text) # 2. Bold text (**text** or __text__) - but not inside code blocks text = re.sub(r'(?\1', text) text = re.sub(r'(?\1', text) # 3. Italic text (*text* or _text_) - but not inside code blocks or bold text = re.sub(r'(?\1', text) text = re.sub(r'(?\1', text) # 4. Strikethrough (~~text~~) - but not inside other formatting text = re.sub(r'~~([^~]+)~~', r'\1', text) # 5. Links [text](url) - convert to clickable text text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text) # 6. Line breaks text = text.replace('\n', '
') return text def _apply_syntax_highlight(escaped_code: str, language: str) -> str: """ Apply professional IDE-like syntax highlighting on XML-escaped code text. Works with escaped entities (< > &), so regexes should not rely on raw quotes. """ def sub_outside_tags(pattern, repl, text, flags=0): parts = re.split(r'(]+>)', text) for idx in range(0, len(parts)): if idx % 2 == 0: # outside tags parts[idx] = re.sub(pattern, repl, parts[idx], flags=flags) return ''.join(parts) out = escaped_code lang = (language or 'text').lower() if lang in ('python', 'py'): # Comments first (gray) out = sub_outside_tags(r"(#[^\n]*)", r"\1", out) # Docstrings (green) out = sub_outside_tags(r'("""[\s\S]*?""")', r"\1", out) out = sub_outside_tags(r"('''[\s\S]*?''')", r"\1", out) # Keywords (purple) keywords = ( 'def|class|if|else|elif|for|while|try|except|finally|import|from|as|with|return|yield|lambda|and|or|not|in|is|True|False|None|pass|break|continue|raise|assert|global|nonlocal' ) out = sub_outside_tags(rf"\b({keywords})\b", r"\1", out) # Built-in functions (blue) builtins = ( 'print|len|str|int|float|list|dict|tuple|set|range|enumerate|zip|map|filter|sorted|reversed|open|input|type|isinstance|hasattr|getattr|setattr|delattr' ) out = sub_outside_tags(rf"\b({builtins})\b", r"\1", out) elif lang in ('javascript', 'js', 'typescript', 'ts'): # Comments (gray) out = sub_outside_tags(r"(//[^\n]*)", r"\1", out) out = sub_outside_tags(r"/\*[\s\S]*?\*/", lambda m: f"{m.group(0)}", out) # Keywords (purple) keywords = ( 'function|var|let|const|if|else|for|while|do|switch|case|break|continue|return|try|catch|finally|throw|new|this|typeof|instanceof|true|false|null|undefined|async|await|class|extends|implements|interface|type|namespace|module|export|import|default|public|private|protected|static|abstract|readonly' ) out = sub_outside_tags(rf"\b({keywords})\b", r"\1", out) # Built-in objects (blue) builtins = ( 'console|window|document|Array|Object|String|Number|Boolean|Date|Math|JSON|Promise|Set|Map|WeakSet|WeakMap|Symbol|Proxy|Reflect' ) out = sub_outside_tags(rf"\b({builtins})\b", r"\1", out) elif lang in ('json',): # Boolean and null values (blue) out = sub_outside_tags(r"\b(true|false|null)\b", r"\1", out) # Keys (purple) out = sub_outside_tags(r"("[^&]*?")(\s*:)", r"\1\2", out) elif lang in ('bash', 'sh', 'shell'): # Comments (gray) out = sub_outside_tags(r"(#[^\n]*)", r"\1", out) # Commands (purple) out = sub_outside_tags(r"(^|\n)(\s*)([a-zA-Z_][a-zA-Z0-9_-]*)", r"\1\2\3", out) # Variables (blue) out = sub_outside_tags(r"(\$[a-zA-Z_][a-zA-Z0-9_]*)", r"\1", out) out = sub_outside_tags(r"(\$\{[^}]+\})", r"\1", out) elif lang in ('yaml', 'yml'): # Keys (purple) out = sub_outside_tags(r"(^|\n)(\s*)([^:\n]+)(:)", r"\1\2\3\4", out) # Boolean values (blue) out = sub_outside_tags(r"\b(true|false|yes|no|on|off)\b", r"\1", out, flags=re.IGNORECASE) elif lang in ('sql',): # Keywords (purple) keywords = ( 'SELECT|FROM|WHERE|INSERT|UPDATE|DELETE|CREATE|DROP|ALTER|TABLE|INDEX|VIEW|DATABASE|SCHEMA|JOIN|LEFT|RIGHT|INNER|OUTER|ON|GROUP|BY|ORDER|HAVING|UNION|DISTINCT|COUNT|SUM|AVG|MAX|MIN|AND|OR|NOT|IN|BETWEEN|LIKE|IS|NULL|ASC|DESC|LIMIT|OFFSET|CASE|WHEN|THEN|ELSE|END|EXISTS|ALL|ANY|SOME' ) out = sub_outside_tags(rf"\b({keywords})\b", r"\1", out, flags=re.IGNORECASE) elif lang in ('java',): # Comments (gray) out = sub_outside_tags(r"(//[^\n]*)", r"\1", out) out = sub_outside_tags(r"/\*[\s\S]*?\*/", lambda m: f"{m.group(0)}", out) # Keywords (purple) keywords = ( 'public|private|protected|static|final|class|interface|extends|implements|if|else|for|while|do|switch|case|break|continue|return|try|catch|finally|throw|throws|new|this|super|import|package|void|int|long|float|double|boolean|char|byte|short|true|false|null|abstract|native|synchronized|volatile|transient|strictfp' ) out = sub_outside_tags(rf"\b({keywords})\b", r"\1", out) # Built-in classes (blue) builtins = ( 'String|Object|Integer|Long|Float|Double|Boolean|Character|Byte|Short|System|Math|ArrayList|HashMap|HashSet|LinkedList|Vector|Collections|Arrays' ) out = sub_outside_tags(rf"\b({builtins})\b", r"\1", out) elif lang in ('css',): # Selectors (purple) out = sub_outside_tags(r"([.#]?[a-zA-Z][a-zA-Z0-9_-]*)(\s*\{)", r"\1\2", out) # Properties (blue) out = sub_outside_tags(r"([a-zA-Z-]+)(\s*:)", r"\1\2", out) # Values (green) out = sub_outside_tags(r"(\s*:\s*)([^;]+)(;)", r"\1\2\3", out) elif lang in ('html', 'xml'): # Tags (purple) out = sub_outside_tags(r"(<[^>]*>)", r"\1", out) # Attributes (blue) out = sub_outside_tags(r"(\w+)=("[^&]*?")", r"\1=\2", out) # Strings (green) - apply to all languages out = sub_outside_tags(r"(".*?")", r"\1", out) out = sub_outside_tags(r"('.*?')", r"\1", out) out = sub_outside_tags(r"(`.*?`)", r"\1", out) # Numbers (orange) - apply to all languages out = sub_outside_tags(r"\b(\d+\.?\d*)\b", r"\1", out) return out def _render_mermaid_png(mermaid_text: str) -> bytes: """ Render mermaid code to PNG via Kroki service (no local mermaid-cli dependency). Falls back to returning empty bytes on failure. """ try: import base64 import json import urllib.request import urllib.error # Validate and clean mermaid content if not mermaid_text or not mermaid_text.strip(): logger.warning("[PDF] Empty mermaid content") return b"" # Clean the mermaid text - remove any potential issues cleaned_text = mermaid_text.strip() # Basic mermaid syntax validation if not cleaned_text.startswith(('graph', 'flowchart', 'sequenceDiagram', 'classDiagram', 'stateDiagram', 'erDiagram', 'journey', 'gantt', 'pie', 'gitgraph')): logger.warning(f"[PDF] Invalid mermaid diagram type: {cleaned_text[:50]}...") return b"" # Kroki POST API for mermaid -> png data = json.dumps({"diagram_source": cleaned_text}).encode("utf-8") req = urllib.request.Request( url="https://kroki.io/mermaid/png", data=data, headers={"Content-Type": "application/json"}, method="POST" ) with urllib.request.urlopen(req, timeout=15) as resp: if resp.status == 200: return resp.read() else: logger.warning(f"[PDF] Kroki returned status {resp.status}") return b"" except urllib.error.HTTPError as e: if e.code == 400: logger.warning(f"[PDF] Kroki mermaid syntax error (400): {e.reason}") else: logger.warning(f"[PDF] Kroki HTTP error {e.code}: {e.reason}") except urllib.error.URLError as e: logger.warning(f"[PDF] Kroki connection error: {e.reason}") except Exception as e: logger.warning(f"[PDF] Kroki mermaid render error: {e}") return b"" async def _format_references_ieee(sources: List[Dict]) -> List[str]: """Format sources in IEEE citation style using NVIDIA API.""" try: from utils.api.router import generate_answer_with_model from helpers.setup import nvidia_rotator if not sources or not nvidia_rotator: return [] # Prepare source data for formatting source_data = [] for i, source in enumerate(sources, 1): source_info = { "number": i, "filename": source.get("filename", "Unknown"), "url": source.get("url", ""), "topic_name": source.get("topic_name", ""), "kind": source.get("kind", "document") } source_data.append(source_info) sys_prompt = """You are an expert at formatting academic references in IEEE style. Format the provided sources as IEEE-style references. Each reference should be numbered and formatted according to IEEE standards. For web sources: [1] Author/Organization, "Title," Website Name, URL, accessed: Date. For documents: [1] Author, "Title," Document Type, Filename, Year. Return only the formatted references, one per line, numbered sequentially.""" user_prompt = f"Format these sources in IEEE style:\n\n{source_data}" selection = {"provider": "nvidia", "model": os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")} response = await generate_answer_with_model(selection, sys_prompt, user_prompt, gemini_rotator, nvidia_rotator, user_id="system", context="pdf_citation") # Parse the response into individual references references = [line.strip() for line in response.split('\n') if line.strip() and line.strip().startswith('[')] # If NVIDIA formatting fails, create basic IEEE format if not references: references = [] for i, source in enumerate(sources, 1): if source.get("kind") == "web": ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}." else: ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}." references.append(ref) return references except Exception as e: logger.warning(f"[PDF] IEEE reference formatting failed: {e}") # Fallback to basic formatting references = [] for i, source in enumerate(sources, 1): if source.get("kind") == "web": ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}." else: ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}." references.append(ref) return references async def generate_report_pdf(report_content: str, user_id: str, project_id: str, sources: List[Dict] = None) -> bytes: """ Generate a PDF from report content using reportlab Args: report_content: Markdown content of the report user_id: User ID for logging project_id: Project ID for logging Returns: PDF content as bytes Raises: HTTPException: If PDF generation fails """ try: from reportlab.lib.pagesizes import letter, A4 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.units import inch from reportlab.lib import colors from io import BytesIO logger.info(f"[PDF] Generating PDF for user {user_id}, project {project_id}") # Create a BytesIO buffer for the PDF buffer = BytesIO() # Create the PDF document doc = SimpleDocTemplate( buffer, pagesize=A4, rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=18 ) # Get styles styles = getSampleStyleSheet() # Create custom styles title_style = ParagraphStyle( 'CustomTitle', parent=styles['Heading1'], fontSize=24, spaceAfter=30, textColor=colors.HexColor('#2c3e50'), borderWidth=1, borderColor=colors.HexColor('#3498db'), borderPadding=10 ) heading1_style = ParagraphStyle( 'CustomHeading1', parent=styles['Heading1'], fontSize=18, spaceAfter=12, spaceBefore=20, textColor=colors.HexColor('#2c3e50') ) heading2_style = ParagraphStyle( 'CustomHeading2', parent=styles['Heading2'], fontSize=16, spaceAfter=10, spaceBefore=16, textColor=colors.HexColor('#2c3e50') ) heading3_style = ParagraphStyle( 'CustomHeading3', parent=styles['Heading3'], fontSize=14, spaceAfter=8, spaceBefore=12, textColor=colors.HexColor('#2c3e50') ) normal_style = ParagraphStyle( 'CustomNormal', parent=styles['Normal'], fontSize=11, spaceAfter=6, leading=14 ) # Professional IDE-like code styling with no background base_code_parent = styles['Code'] if 'Code' in styles.byName else styles['Normal'] code_style = ParagraphStyle( 'Code', parent=base_code_parent, fontSize=9, fontName='Courier', textColor=colors.HexColor('#2c3e50'), # Dark text on white background backColor=None, # No background color borderColor=colors.HexColor('#e1e8ed'), borderWidth=1, borderPadding=8, leftIndent=12, rightIndent=12, spaceBefore=6, spaceAfter=6, leading=11 ) # Parse markdown content story = [] # Add title story.append(Paragraph("StudyBuddy Report", title_style)) story.append(Paragraph(f"Generated on {datetime.now().strftime('%B %d, %Y at %I:%M %p')}", normal_style)) story.append(Spacer(1, 20)) # Enhanced markdown parser with proper formatting story.extend(await _parse_markdown_content(report_content, heading1_style, heading2_style, heading3_style, normal_style, code_style)) # Add references section if sources provided if sources: story.append(PageBreak()) story.append(Paragraph("References", heading1_style)) story.append(Spacer(1, 12)) # Format references in IEEE style using NVIDIA API try: ieee_references = await _format_references_ieee(sources) except Exception as _ie: logger.warning(f"[PDF] Reference formatting failed, falling back: {_ie}") ieee_references = [] for i, source in enumerate(sources, 1): if source.get("kind") == "web": ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}." else: ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}." ieee_references.append(ref) for ref in ieee_references: story.append(Paragraph(ref, normal_style)) story.append(Spacer(1, 6)) # Build PDF doc.build(story) # Get PDF content pdf_content = buffer.getvalue() buffer.close() logger.info(f"[PDF] Successfully generated PDF ({len(pdf_content)} bytes) for user {user_id}, project {project_id}") return pdf_content except ImportError: logger.error("[PDF] reportlab not installed. Install with: pip install reportlab") raise HTTPException(500, detail="PDF generation not available. Please install reportlab.") except Exception as e: logger.error(f"[PDF] Failed to generate PDF: {e}") # Keep error generic for client; avoid leaking internals raise HTTPException(500, detail="Failed to generate PDF")