Spaces:

vijayvizag
/

code-to-doc-streamlit

Runtime error

App Files Files Community

vijayvizag committed on Apr 21

Commit

f360edc

verified ·

1 Parent(s): a02d9ee

Upload 2 files

Browse files

Files changed (2) hide show

app.py +78 -354
codet5_summarizer.py +183 -0

app.py CHANGED Viewed

@@ -1,360 +1,84 @@
 import streamlit as st
-import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-import re
-import time
-# Model constants
-CODET5_MODEL = "Salesforce/codet5-base-multi-sum"
-class CodeT5Summarizer:
-    def __init__(self, device=None):
-        """Initialize CodeT5 summarization model."""
-        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
-        # Initialize model and tokenizer
-        with st.spinner("Loading CodeT5 model... this may take a minute..."):
-            self.tokenizer = AutoTokenizer.from_pretrained(CODET5_MODEL)
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(CODET5_MODEL).to(self.device)
-    def preprocess_code(self, code):
-        """Clean and preprocess the Python code."""
-        # Remove empty lines
-        code = re.sub(r'\n\s*\n', '\n', code)
-        # Remove excessive comments (keeping docstrings)
-        code_lines = []
-        in_docstring = False
-        docstring_delimiter = None
-        for line in code.split('\n'):
-            # Check for docstring delimiters
-            if '"""' in line or "'''" in line:
-                delimiter = '"""' if '"""' in line else "'''"
-                if not in_docstring:
-                    in_docstring = True
-                    docstring_delimiter = delimiter
-                elif docstring_delimiter == delimiter:
-                    in_docstring = False
-                    docstring_delimiter = None
-            # Keep docstrings and non-comment lines
-            if in_docstring or not line.strip().startswith('#'):
-                code_lines.append(line)
-        processed_code = '\n'.join(code_lines)
-        # Normalize whitespace
-        processed_code = re.sub(r' +', ' ', processed_code)
-        return processed_code
-    def extract_functions(self, code):
-        """Extract individual functions for summarization"""
-        # Simple regex to find function definitions
-        function_pattern = r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(.*?\).*?:'
-        function_matches = re.finditer(function_pattern, code, re.DOTALL)
-        functions = []
-        for match in function_matches:
-            start_pos = match.start()
-            # Find the function body
-            function_name = match.group(1)
-            lines = code[start_pos:].split('\n')
-            # Skip the function definition line
-            body_start = 1
-            while body_start < len(lines) and not lines[body_start].strip():
-                body_start += 1
-            if body_start < len(lines):
-                # Get the indentation of the function body
-                body_indent = len(lines[body_start]) - len(lines[body_start].lstrip())
-                # Gather all lines with at least this indentation
-                function_body = [lines[0]]  # The function definition
-                i = 1
-                while i < len(lines):
-                    line = lines[i]
-                    if line.strip() and (len(line) - len(line.lstrip())) < body_indent and not line.strip().startswith('#'):
-                        break
-                    function_body.append(line)
-                    i += 1
-                function_code = '\n'.join(function_body)
-                functions.append((function_name, function_code))
-        # Simple regex to find class methods
-        class_pattern = r'class\s+([a-zA-Z_][a-zA-Z0-9_]*)'
-        class_matches = re.finditer(class_pattern, code, re.DOTALL)
-        for match in class_matches:
-            class_name = match.group(1)
-            start_pos = match.start()
-            # Find class methods using the function pattern
-            class_code = code[start_pos:]
-            method_matches = re.finditer(function_pattern, class_code, re.DOTALL)
-            for method_match in method_matches:
-                method_name = method_match.group(1)
-                # Skip if this is not a method (i.e., it's a function outside the class)
-                if method_match.start() > 200:  # Simple heuristic to check if method is within class scope
-                    break
-                # Get the full method code
-                method_start = method_match.start()
-                method_lines = class_code[method_start:].split('\n')
-                # Skip the method definition line
-                body_start = 1
-                while body_start < len(method_lines) and not method_lines[body_start].strip():
-                    body_start += 1
-                if body_start < len(method_lines):
-                    # Get the indentation of the method body
-                    body_indent = len(method_lines[body_start]) - len(method_lines[body_start].lstrip())
-                    # Gather all lines with at least this indentation
-                    method_body = [method_lines[0]]  # The method definition
-                    i = 1
-                    while i < len(method_lines):
-                        line = method_lines[i]
-                        if line.strip() and (len(line) - len(line.lstrip())) < body_indent and not line.strip().startswith('#'):
-                            break
-                        method_body.append(line)
-                        i += 1
-                    method_code = '\n'.join(method_body)
-                    functions.append((f"{class_name}.{method_name}", method_code))
-        return functions
-    def extract_classes(self, code):
-        """Extract class definitions for summarization"""
-        class_pattern = r'class\s+([a-zA-Z_][a-zA-Z0-9_]*)'
-        class_matches = re.finditer(class_pattern, code, re.DOTALL)
-        classes = []
-        for match in class_matches:
-            class_name = match.group(1)
-            start_pos = match.start()
-            # Extract class body
-            class_lines = code[start_pos:].split('\n')
-            # Skip the class definition line
-            body_start = 1
-            while body_start < len(class_lines) and not class_lines[body_start].strip():
-                body_start += 1
-            if body_start < len(class_lines):
-                # Get the indentation of the class body
-                body_indent = len(class_lines[body_start]) - len(class_lines[body_start].lstrip())
-                # Gather all lines with at least this indentation
-                class_body = [class_lines[0]]  # The class definition
-                i = 1
-                while i < len(class_lines):
-                    line = class_lines[i]
-                    if line.strip() and (len(line) - len(line.lstrip())) < body_indent:
-                        break
-                    class_body.append(line)
-                    i += 1
-                class_code = '\n'.join(class_body)
-                classes.append((class_name, class_code))
-        return classes
-    def summarize(self, code, max_length=50):
-        """Generate summary using CodeT5."""
-        # Truncate input if needed
-        max_input_length = 512  # CodeT5 typically accepts up to 512 tokens
-        tokenized_code = self.tokenizer(code, truncation=True, max_length=max_input_length, return_tensors="pt").to(self.device)
-        with torch.no_grad():
-            generated_ids = self.model.generate(
-                tokenized_code["input_ids"],
-                max_length=max_length,
-                num_beams=4,
-                early_stopping=True
-            )
-        summary = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-        return summary
-    def summarize_code(self, code, summarize_functions=True, summarize_classes=True):
-        """
-        Generate full file summary and optionally function/class level summaries.
-        Returns a dictionary with summaries.
-        """
-        preprocessed_code = self.preprocess_code(code)
-        results = {
-            "file_summary": None,
-            "function_summaries": {},
-            "class_summaries": {}
-        }
-        # Generate file-level summary
-        try:
-            file_summary = self.summarize(preprocessed_code)
-            results["file_summary"] = file_summary
-        except Exception as e:
-            results["file_summary"] = f"Error generating file summary: {str(e)}"
-        # Generate function-level summaries if requested
-        if summarize_functions:
-            functions = self.extract_functions(preprocessed_code)
-            for function_name, function_code in functions:
-                try:
-                    summary = self.summarize(function_code)
-                    results["function_summaries"][function_name] = summary
-                except Exception as e:
-                    results["function_summaries"][function_name] = f"Error: {str(e)}"
-        # Generate class-level summaries if requested
-        if summarize_classes:
-            classes = self.extract_classes(preprocessed_code)
-            for class_name, class_code in classes:
-                try:
-                    summary = self.summarize(class_code)
-                    results["class_summaries"][class_name] = summary
-                except Exception as e:
-                    results["class_summaries"][class_name] = f"Error: {str(e)}"
-        return results
-def main():
-    st.set_page_config(
-        page_title="Python Code Summarizer",
-        page_icon="📝",
-        layout="wide"
-    )
-    st.title("📝 Python Code Summarizer using CodeT5")
-    st.markdown("""
-    Upload a Python file or paste code directly to generate summaries.
-    This app uses CodeT5, a pretrained model for code understanding and generation.
-    """)
-    # Initialize session state
-    if 'summarizer' not in st.session_state:
-        st.session_state.summarizer = None
-    # Load model if not already loaded
-    if st.session_state.summarizer is None:
-        st.session_state.summarizer = CodeT5Summarizer()
-    # Create tabs for different input methods
-    tab1, tab2 = st.tabs(["Upload Python File", "Paste Code"])
-    with tab1:
-        uploaded_file = st.file_uploader("Choose a Python file", type=['py'])
-        if uploaded_file is not None:
-            code = uploaded_file.getvalue().decode('utf-8')
-            with st.expander("View Uploaded Code", expanded=False):
-                st.code(code, language='python')
-            # Add summarization options
-            st.subheader("Summarization Options")
-            col1, col2 = st.columns(2)
-            with col1:
-                summarize_functions = st.checkbox("Generate function summaries", value=True)
-            with col2:
-                summarize_classes = st.checkbox("Generate class summaries", value=True)
-            if st.button("Summarize Code", key="summarize_file"):
-                with st.spinner("Generating summaries..."):
-                    start_time = time.time()
-                    summaries = st.session_state.summarizer.summarize_code(
-                        code,
-                        summarize_functions=summarize_functions,
-                        summarize_classes=summarize_classes
-                    )
-                    end_time = time.time()
-                    # Display summaries
-                    st.success(f"Summarization completed in {end_time - start_time:.2f} seconds!")
-                    # File summary
-                    st.subheader("File Summary")
-                    st.write(summaries["file_summary"])
-                    # Function summaries
-                    if summarize_functions and summaries["function_summaries"]:
-                        st.subheader("Function Summaries")
-                        for func_name, summary in summaries["function_summaries"].items():
-                            with st.expander(f"Function: {func_name}"):
-                                st.write(summary)
-                    # Class summaries
-                    if summarize_classes and summaries["class_summaries"]:
-                        st.subheader("Class Summaries")
-                        for class_name, summary in summaries["class_summaries"].items():
-                            with st.expander(f"Class: {class_name}"):
-                                st.write(summary)
-    with tab2:
-        code = st.text_area("Paste Python code here", height=300)
-        if code:
-            # Add summarization options
-            st.subheader("Summarization Options")
-            col1, col2 = st.columns(2)
-            with col1:
-                summarize_functions = st.checkbox("Generate function summaries", value=True, key="func_paste")
-            with col2:
-                summarize_classes = st.checkbox("Generate class summaries", value=True, key="class_paste")
-            if st.button("Summarize Code", key="summarize_paste"):
-                with st.spinner("Generating summaries..."):
-                    start_time = time.time()
-                    summaries = st.session_state.summarizer.summarize_code(
-                        code,
-                        summarize_functions=summarize_functions,
-                        summarize_classes=summarize_classes
-                    )
-                    end_time = time.time()
-                    # Display summaries
-                    st.success(f"Summarization completed in {end_time - start_time:.2f} seconds!")
-                    # File summary
-                    st.subheader("File Summary")
-                    st.write(summaries["file_summary"])
-                    # Function summaries
-                    if summarize_functions and summaries["function_summaries"]:
-                        st.subheader("Function Summaries")
-                        for func_name, summary in summaries["function_summaries"].items():
-                            with st.expander(f"Function: {func_name}"):
-                                st.write(summary)
-                    # Class summaries
-                    if summarize_classes and summaries["class_summaries"]:
-                        st.subheader("Class Summaries")
-                        for class_name, summary in summaries["class_summaries"].items():
-                            with st.expander(f"Class: {class_name}"):
-                                st.write(summary)
     st.markdown("---")
-    st.markdown("### About")
-    st.markdown("""
-    This app uses the CodeT5 model to generate summaries of Python code. The model is trained on a large corpus of code and documentation.
-    **Features:**
-    - File-level summaries
-    - Function-level summaries
-    - Class-level summaries
-    **Limitations:**
-    - Summaries may not always be accurate
-    - Long files may be truncated
-    - Complex code structures might not be properly understood
-    """)
-if __name__ == "__main__":
-    main()

 import streamlit as st
+from codet5_summarizer import CodeT5Summarizer, MODEL_OPTIONS
+import textwrap
+import os
+import base64
+# Local import: needed to escape report text and file names embedded in HTML.
+import html
+st.set_page_config(page_title="Code Summarizer & Report Generator", layout="wide")
+st.title("📄 Code Summarizer & Report Generator")
+st.markdown("""
+Upload a Python code file to get a high-level summary and a report structure with editable sections.
+You can choose from various models including Mistral, CodeT5, and Gemini.
+""")
+@st.cache_resource
+def load_summarizer(model_name):
+    """Build the summarizer for ``model_name`` once and reuse it across reruns.
+
+    Streamlit re-executes this script on every widget interaction; without
+    caching, the tokenizer and model would be loaded again each time.
+    """
+    return CodeT5Summarizer(model_name=model_name)
+def generate_download_link(content, filename, mime="text/plain"):
+    """Return an HTML anchor that downloads ``content`` as ``filename``.
+
+    The payload is embedded as a base64 ``data:`` URI with the given MIME type.
+    The file name is HTML-escaped because it originates from the uploaded file.
+    """
+    b64 = base64.b64encode(content.encode("utf-8")).decode("ascii")
+    safe_name = html.escape(filename, quote=True)
+    return f'<a href="data:{mime};charset=utf-8;base64,{b64}" download="{safe_name}">📥 Download {safe_name}</a>'
+# Model selection
+model_label = st.selectbox("Select Model", list(MODEL_OPTIONS.keys()), index=0)
+try:
+    summarizer = load_summarizer(MODEL_OPTIONS[model_label])
+except ValueError as exc:
+    # CodeT5Summarizer raises ValueError when HF_TOKEN is not configured.
+    st.error(str(exc))
+    st.stop()
+# Upload code file
+uploaded_file = st.file_uploader("Upload a .py file", type="py")
+if uploaded_file:
+    # getvalue() is independent of the stream position, so it stays valid on reruns.
+    code = uploaded_file.getvalue().decode("utf-8")
+    st.code(code, language="python")
+    st.markdown("---")
+    st.subheader("🔍 Generating Summary...")
+    # Re-run the model only when the selected model or the uploaded code changes,
+    # not on every edit of the report text areas below.
+    summary_key = (model_label, code)
+    if st.session_state.get("summary_key") != summary_key:
+        if "Mistral" in model_label or "Gemini" in model_label:
+            summary = summarizer.summarize(code)
+            function_summaries = None
+            class_summaries = None
+        else:
+            results = summarizer.summarize_code(code)
+            summary = results["file_summary"]
+            function_summaries = results["function_summaries"]
+            class_summaries = results["class_summaries"]
+        st.session_state["summary_key"] = summary_key
+        st.session_state["summary_result"] = (summary, function_summaries, class_summaries)
+    summary, function_summaries, class_summaries = st.session_state["summary_result"]
+    st.text_area("Summary", summary, height=200)
+    if function_summaries:
+        st.subheader("🧩 Function Summaries")
+        for func, summ in function_summaries.items():
+            st.text_area(f"Function: {func}", summ, height=100)
+    if class_summaries:
+        st.subheader("🏗️ Class Summaries")
+        for cls, summ in class_summaries.items():
+            st.text_area(f"Class: {cls}", summ, height=100)
+    # Report generation section
     st.markdown("---")
+    st.subheader("📘 Generate Report")
+    default_sections = [
+        "Abstract", "Introduction", "Literature Review", "Methodology",
+        "Modules", "Software & Hardware Requirements", "Architecture & UML Diagrams",
+        "References", "Conclusion"
+    ]
+    sections = st.multiselect("Select Sections", default_sections, default=default_sections)
+    report_parts = []
+    for section in sections:
+        content = st.text_area(f"✏️ {section} Content", value=f"{section} description goes here...", height=150)
+        report_parts.append(f"\n## {section}\n\n{textwrap.dedent(content)}\n")
+    report = "".join(report_parts)
+    # Export format
+    st.markdown("---")
+    st.subheader("📤 Export Report")
+    # Export label -> (file extension, MIME type used in the data URI).
+    export_targets = {
+        "Markdown": ("md", "text/markdown"),
+        "Text": ("txt", "text/plain"),
+        "HTML": ("html", "text/html"),
+    }
+    export_format = st.radio("Select Export Format", list(export_targets))
+    if st.button("Generate Export File"):
+        # Strip only the final extension; str.replace would remove every ".py" in the name.
+        filename = os.path.splitext(uploaded_file.name)[0]
+        extension, mime = export_targets[export_format]
+        if export_format == "HTML":
+            # Built outside the f-string: a backslash inside an f-string
+            # expression is a SyntaxError before Python 3.12.
+            body = html.escape(report).replace("\n", "<br>")
+            payload = f"<html><body>{body}</body></html>"
+        else:
+            payload = report
+        st.markdown(generate_download_link(payload, f"{filename}_report.{extension}", mime), unsafe_allow_html=True)

codet5_summarizer.py ADDED Viewed

	@@ -0,0 +1,183 @@

+# =============================
+# 📄 codet5_summarizer.py (Updated)
+# =============================
+import torch
+import re
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
+import os
+# Maps a human-readable model label to the Hugging Face model id that is passed
+# to ``from_pretrained``. The first entry is the default used by
+# ``CodeT5Summarizer.__init__`` when no model name is given.
+# NOTE(review): app.py branches on the substrings "Mistral" and "Gemini" in
+# these labels, so renaming a label changes which summarization path is used.
+MODEL_OPTIONS = {
+    "CodeT5 Base (multi-sum)": "Salesforce/codet5-base-multi-sum",
+    "CodeT5 Base": "Salesforce/codet5-base",
+    "CodeT5 Small (Python-specific)": "stmnk/codet5-small-code-summarization-python",
+    "Gemini (describeai)": "describeai/gemini",
+    "Mistral 7B Instruct (v0.2)": "mistralai/Mistral-7B-Instruct-v0.2",
+}
+class CodeT5Summarizer:
+    def __init__(self, model_name=None):
+        """Load the tokenizer and model for ``model_name``.
+
+        Args:
+            model_name: Hugging Face model id. Falls back to
+                ``MODEL_OPTIONS["CodeT5 Base (multi-sum)"]`` when falsy.
+
+        Raises:
+            ValueError: If the ``HF_TOKEN`` environment variable is not set.
+        """
+        model_name = model_name or MODEL_OPTIONS["CodeT5 Base (multi-sum)"]
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        hf_token = os.getenv('HF_TOKEN')
+        if hf_token is None:
+            raise ValueError("Hugging Face token must be set in the environment variable 'HF_TOKEN'.")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+        # Try the encoder-decoder head first and fall back to a causal LM for
+        # decoder-only checkpoints (e.g., Mistral). Only the "unsupported
+        # configuration" ValueError triggers the fallback; download, memory and
+        # device errors propagate instead of being masked by a second load.
+        try:
+            model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=hf_token)
+        except ValueError:
+            model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)
+        self.model = model.to(self.device)
+        self.is_encoder_decoder = getattr(self.model.config, "is_encoder_decoder", False)
+    def preprocess_code(self, code):
+        """Strip blank lines and comment-only lines from ``code``.
+
+        Lines inside a multi-line docstring are kept even when they start with
+        ``#``. Runs of spaces are collapsed to one space, except leading
+        indentation, which is preserved because the extraction methods rely on
+        it to find block boundaries.
+
+        Args:
+            code: Python source text.
+
+        Returns:
+            The cleaned source text.
+        """
+        code = re.sub(r'\n\s*\n', '\n', code)
+        kept = []
+        open_delim = None  # Delimiter of the docstring currently open, if any.
+        for line in code.split('\n'):
+            if open_delim is None:
+                # An odd number of delimiters opens a docstring; a one-line
+                # docstring has an even count and must not change the state.
+                open_delim = next(
+                    (delim for delim in ('"""', "'''") if line.count(delim) % 2 == 1),
+                    None,
+                )
+            elif line.count(open_delim) % 2 == 1:
+                open_delim = None
+            if open_delim is not None or not line.strip().startswith('#'):
+                kept.append(line)
+        # The lookbehind restricts collapsing to spaces that follow a
+        # non-whitespace character, leaving leading indentation untouched.
+        return re.sub(r'(?<=\S) {2,}', ' ', '\n'.join(kept))
+    def extract_functions(self, code):
+        """Return ``(name, source)`` pairs for the ``def`` statements in ``code``.
+
+        Two regex passes are made. The first records every ``def`` under its
+        bare name. The second revisits each ``class`` and records the ``def``
+        statements that start within the first 200 characters after the class
+        keyword under ``"ClassName.method"``.
+
+        NOTE(review): the 200-character cutoff is a heuristic, and methods found
+        in the second pass were already recorded by the first pass, so they
+        appear twice in the result. Confirm whether callers want both entries.
+
+        Args:
+            code: Python source text.
+
+        Returns:
+            A list of ``(name, source)`` tuples in discovery order.
+        """
+        function_pattern = r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(.*?\).*?:'
+        class_pattern = r'class\s+([a-zA-Z_][a-zA-Z0-9_]*)'
+        class_scope_chars = 200  # Only defs this close to the class keyword count as methods.
+        def indented_block(text):
+            """Return the definition starting at ``text`` plus its indented body, or None if no body follows."""
+            header, *rest = text.split('\n')
+            first_body = next((ln for ln in rest if ln.strip()), None)
+            if first_body is None:
+                return None
+            body_indent = len(first_body) - len(first_body.lstrip())
+            block = [header]
+            for ln in rest:
+                stripped = ln.strip()
+                # Stop at the first dedented line; blank lines and dedented
+                # comment lines do not end the block.
+                if stripped and (len(ln) - len(ln.lstrip())) < body_indent and not stripped.startswith('#'):
+                    break
+                block.append(ln)
+            return '\n'.join(block)
+        functions = []
+        for match in re.finditer(function_pattern, code, re.DOTALL):
+            function_code = indented_block(code[match.start():])
+            if function_code is not None:
+                functions.append((match.group(1), function_code))
+        # Class method detection
+        for class_match in re.finditer(class_pattern, code, re.DOTALL):
+            class_name = class_match.group(1)
+            class_code = code[class_match.start():]
+            for method_match in re.finditer(function_pattern, class_code, re.DOTALL):
+                if method_match.start() > class_scope_chars:
+                    break
+                method_code = indented_block(class_code[method_match.start():])
+                if method_code is not None:
+                    functions.append((f"{class_name}.{method_match.group(1)}", method_code))
+        return functions
+    def extract_classes(self, code):
+        """Return ``(class_name, source)`` pairs for each ``class`` found in ``code``.
+
+        The source of a class is its header line plus every following line up
+        to the first non-blank line indented less than the first body line.
+        A class with no non-blank line after its header is skipped.
+        """
+        found = []
+        for hit in re.finditer(r'class\s+([a-zA-Z_][a-zA-Z0-9_]*)', code, re.DOTALL):
+            header, *rest = code[hit.start():].split('\n')
+            first_body = next((ln for ln in rest if ln.strip()), None)
+            if first_body is None:
+                continue
+            min_indent = len(first_body) - len(first_body.lstrip())
+            kept = [header]
+            for ln in rest:
+                if ln.strip() and len(ln) - len(ln.lstrip()) < min_indent:
+                    break
+                kept.append(ln)
+            found.append((hit.group(1), '\n'.join(kept)))
+        return found
+    def summarize(self, code, max_length=512):
+        """Generate a summary of ``code`` with the loaded model.
+
+        Args:
+            code: Source text to summarize; tokenized with truncation at 512 tokens.
+            max_length: Maximum number of new tokens to generate.
+
+        Returns:
+            The decoded model output with special tokens removed. For
+            decoder-only models the echoed prompt tokens are dropped so that
+            only the generated continuation is returned.
+        """
+        inputs = self.tokenizer(code, return_tensors="pt", truncation=True, max_length=512).to(self.device)
+        input_ids = inputs["input_ids"]
+        generate_kwargs = {
+            "attention_mask": inputs["attention_mask"],
+            "max_new_tokens": max_length,
+            "num_beams": 4,
+            "early_stopping": True,
+        }
+        if not self.is_encoder_decoder:
+            generate_kwargs["do_sample"] = False
+            generate_kwargs["pad_token_id"] = self.tokenizer.eos_token_id
+        with torch.no_grad():
+            output = self.model.generate(input_ids, **generate_kwargs)
+        generated = output[0]
+        if not self.is_encoder_decoder:
+            # Decoder-only generate() returns prompt + continuation; without
+            # this slice the "summary" would start with the input code itself.
+            generated = generated[input_ids.shape[-1]:]
+        return self.tokenizer.decode(generated, skip_special_tokens=True)
+    def summarize_code(self, code, summarize_functions=True, summarize_classes=True):
+        """Summarize a whole file and, optionally, each function and class in it.
+
+        The code is preprocessed once. A failure while summarizing any single
+        unit is recorded as an error string in place of that unit's summary.
+
+        Returns:
+            A dict with keys ``"file_summary"``, ``"function_summaries"`` and
+            ``"class_summaries"``; the latter two map names to summaries.
+        """
+        cleaned = self.preprocess_code(code)
+        def attempt(snippet, error_prefix):
+            """Summarize ``snippet``, returning a prefixed error message on failure."""
+            try:
+                return self.summarize(snippet)
+            except Exception as exc:
+                return f"{error_prefix}{str(exc)}"
+        file_summary = attempt(cleaned, "Error generating file summary: ")
+        function_summaries = {}
+        if summarize_functions:
+            for name, snippet in self.extract_functions(cleaned):
+                function_summaries[name] = attempt(snippet, "Error: ")
+        class_summaries = {}
+        if summarize_classes:
+            for name, snippet in self.extract_classes(cleaned):
+                class_summaries[name] = attempt(snippet, "Error: ")
+        return {
+            "file_summary": file_summary,
+            "function_summaries": function_summaries,
+            "class_summaries": class_summaries,
+        }