hmnshudhmn24 committed 8 days ago

Commit

67f450b

verified ·

1 Parent(s): c40f5d3

Upload 35 files

Browse files

Files changed (35) hide show

.gitignore +9 -0
LICENSE +7 -0
README.md +17 -3
data/samples/sample.jsonl +1 -0
deployment/Dockerfile +7 -0
deployment/api_service.py +3 -0
deployment/gradio_service.py +3 -0
deployment/huggingface_spaces/app.py +18 -0
deployment/requirements_docker.txt +2 -0
models/README.md +4 -0
notebooks/01_data_exploration.ipynb +1 -0
notebooks/02_preprocessing.ipynb +1 -0
notebooks/03_model_finetuning.ipynb +1 -0
notebooks/04_evaluation.ipynb +1 -0
notebooks/05_inference_demo.ipynb +1 -0
requirements.txt +12 -0
setup.py +8 -0
src/__init__.py +1 -0
src/app/gradio_demo.py +19 -0
src/app/main.py +19 -0
src/config/model_config.yaml +11 -0
src/config/training_args.yaml +9 -0
src/data/data_loader.py +21 -0
src/data/preprocessing.py +15 -0
src/model/evaluate_model.py +11 -0
src/model/inference.py +22 -0
src/model/model_builder.py +12 -0
src/model/train_model.py +14 -0
src/utils/helper.py +3 -0
src/utils/logger.py +4 -0
src/utils/metrics.py +6 -0
tests/test_api.py +15 -0
tests/test_data.py +7 -0
tests/test_inference.py +8 -0
tests/test_model.py +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,9 @@

+__pycache__/
+.venv/
+.env
+*.pyc
+*.pkl
+*.pt
+experiments/
+models/
+.DS_Store

LICENSE ADDED Viewed

	@@ -0,0 +1,7 @@

+MIT License
+Copyright (c) 2025 AutoCodeFix
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
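+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.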

README.md CHANGED Viewed

@@ -1,3 +1,17 @@
----
-license: apache-2.0
----

+# AutoCodeFix (Lightweight Functional Version)
+This is a lightweight, fully-runnable scaffold of the AutoCodeFix project. It contains example code,
+dummy model logic (simulated), minimal FastAPI and Gradio interfaces, and tests so you can run locally
+and upload to Hugging Face Spaces. The model folders are intentionally left empty for you to add real
+model weights later.
+## Quickstart (local)
+```bash
+python -m venv .venv
+source .venv/bin/activate   # on Windows use: .venv\Scripts\activate
+pip install -r requirements.txt
+# Run API
+uvicorn src.app.main:app --reload
+# Or run Gradio demo
+python -m src.app.gradio_demo   # run as a module from the repo root so `src.*` imports resolve
+```

data/samples/sample.jsonl ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"id": "1", "language": "python", "buggy": "def add(a, b)\n return a + b", "fixed": "def add(a, b):\n return a + b", "error_log": "SyntaxError: invalid syntax"}

deployment/Dockerfile ADDED Viewed

	@@ -0,0 +1,7 @@

+# Image for the FastAPI service (src.app.main:app) served by uvicorn on port 8000.
+# `COPY . /app` copies the whole build context and the next steps expect
+# requirements.txt and src/ at its top level, so build with the repository root
+# as context, e.g. `docker build -f deployment/Dockerfile .`.
+FROM python:3.10-slim
+WORKDIR /app
+COPY . /app
+RUN pip install --upgrade pip
+RUN pip install -r requirements.txt
+EXPOSE 8000
+CMD ["uvicorn", "src.app.main:app", "--host", "0.0.0.0", "--port", "8000"]

deployment/api_service.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from uvicorn import run
+# Script entry point: serve the ASGI app referenced by the import string
+# 'src.app.main:app' on all interfaces, port 8000.
+if __name__ == '__main__':
+    run('src.app.main:app', host='0.0.0.0', port=8000)

deployment/gradio_service.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from src.app.gradio_demo import iface
+# Script entry point: launch the Gradio interface imported from
+# src.app.gradio_demo on all interfaces, port 7860.
+if __name__ == '__main__':
+    iface.launch(server_name='0.0.0.0', server_port=7860)

deployment/huggingface_spaces/app.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import gradio as gr
+from src.model.inference import AutoCodeFixer
+# Single fixer instance shared by every request handled by this app.
+fixer = AutoCodeFixer(model_dir=None)
+
+
+def predict(buggy, error_log):
+    """Run the shared fixer on ``buggy`` and return ``(fixed_code, explanation)``."""
+    repaired_code, note = fixer.repair(buggy, error_log)
+    return repaired_code, note
+
+
+# Two text inputs (code, optional error log) mapped to two text outputs.
+iface = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Textbox(lines=15, placeholder='Buggy code'),
+        gr.Textbox(lines=4, placeholder='Error log (optional)'),
+    ],
+    outputs=[
+        gr.Textbox(lines=15),
+        gr.Textbox(lines=6),
+    ],
+    title='AutoCodeFix (Spaces - Lightweight)',
+)
+
+if __name__ == '__main__':
+    iface.launch()

deployment/requirements_docker.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ -r ../requirements.txt
2	+ uvicorn[standard]

models/README.md ADDED Viewed

	@@ -0,0 +1,4 @@

+This folder contains placeholders for real model artifacts.
+- models/starcoder2/  <-- leave empty until you upload model weights
+- models/codet5plus/  <-- leave empty until you upload model weights
+- models/tokenizer/   <-- leave empty until you upload tokenizer files

notebooks/01_data_exploration.ipynb ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

notebooks/02_preprocessing.ipynb ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

notebooks/03_model_finetuning.ipynb ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

notebooks/04_evaluation.ipynb ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

notebooks/05_inference_demo.ipynb ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+fastapi
+uvicorn
+gradio
+transformers
+torch
+datasets
+pandas
+numpy
+pytest
+diff-match-patch
+pyyaml
+peft

setup.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""Packaging script for the AutoCodeFix scaffold."""
+from setuptools import find_namespace_packages, setup
+
+# The code base imports its modules as ``src.<subpackage>`` (for example
+# ``src.model.inference``), so ``src`` itself must be the installed top-level
+# package. The sub-folders have no ``__init__.py``, hence namespace discovery;
+# the ``include`` filter keeps tests/, deployment/ etc. out of the distribution.
+setup(
+    name='autocodefix',
+    version='0.1.0',
+    packages=find_namespace_packages(include=['src', 'src.*']),
+)

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # AutoCodeFix package

src/app/gradio_demo.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import gradio as gr
+from src.model.inference import AutoCodeFixer
+# Single fixer instance shared by every request handled by this demo.
+fixer = AutoCodeFixer()
+
+
+def run_fix(buggy, error_log):
+    """Run the shared fixer on ``buggy`` and return ``(fixed_code, explanation)``."""
+    repaired_code, note = fixer.repair(buggy, error_log)
+    return repaired_code, note
+
+
+# Two text inputs (code, optional error log) mapped to two labelled text outputs.
+iface = gr.Interface(
+    fn=run_fix,
+    inputs=[
+        gr.Textbox(lines=15, placeholder='Paste buggy code here'),
+        gr.Textbox(lines=4, placeholder='Error log (optional)'),
+    ],
+    outputs=[
+        gr.Textbox(lines=15, label='Fixed Code'),
+        gr.Textbox(lines=6, label='Explanation'),
+    ],
+    title='AutoCodeFix (Lightweight)',
+    description='Paste buggy code and optional error log — model returns a repaired version and a brief explanation.',
+)
+
+if __name__ == '__main__':
+    iface.launch()

src/app/main.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from typing import Optional
+
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+from src.model.inference import AutoCodeFixer
+# ASGI application object; referenced elsewhere in this commit as 'src.app.main:app'.
+app = FastAPI(title='AutoCodeFix API')
+# One fixer instance created at import time and reused by every request.
+fixer = AutoCodeFixer()
+class RepairRequest(BaseModel):
+    """Request body for ``POST /repair``."""
+
+    # Source code to repair (required).
+    buggy: str
+    # Optional error output; typed Optional so that both an omitted field and
+    # an explicit JSON ``null`` validate instead of being rejected as "not a string".
+    error_log: Optional[str] = None
+@app.post('/repair')
+async def repair_code(req: RepairRequest):
+    """Repair the submitted code and return it together with an explanation."""
+    repaired_code, note = fixer.repair(req.buggy, req.error_log)
+    response = {'fixed': repaired_code, 'explanation': note}
+    return response
+@app.get('/')
+async def root():
+    """Return a static status message."""
+    status = {'message': 'AutoCodeFix is running (lightweight demo)'}
+    return status

src/config/model_config.yaml ADDED Viewed

	@@ -0,0 +1,11 @@

+# Model and fine-tuning hyper-parameters; loaded by src/model/train_model.py via yaml.safe_load.
+model_name: "Salesforce/codet5p-small"
+task: "code-repair"
+max_input_length: 1024
+max_output_length: 512
+# Written with a decimal point on purpose: PyYAML only resolves scientific
+# notation to a float when the mantissa contains a ".", so a bare 3e-5 would
+# be loaded as the string "3e-5".
+learning_rate: 3.0e-5
+batch_size: 8
+num_train_epochs: 1
+lora:
+  r: 8
+  alpha: 16
+  dropout: 0.05

src/config/training_args.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+output_dir: experiments/checkpoints
+logging_dir: experiments/logs
+save_steps: 500
+eval_steps: 500
+logging_steps: 100
+per_device_train_batch_size: 4
+per_device_eval_batch_size: 4
+warmup_steps: 100
+weight_decay: 0.01

src/data/data_loader.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import os, json
+from datasets import Dataset
+def load_jsonl_dataset(path: str):
+    """Load JSON Lines records from a file or a directory into a ``Dataset``.
+
+    Args:
+        path: A single data file, or a directory whose ``*.jsonl`` / ``*.json``
+            files are all loaded. Every file, including ``.json`` ones, is
+            parsed as JSON Lines: one JSON value per non-blank line.
+
+    Returns:
+        The result of ``Dataset.from_list`` over all parsed records.
+
+    Raises:
+        FileNotFoundError: If ``path`` does not exist or holds no data files.
+        json.JSONDecodeError: If a line is not valid JSON; the message names
+            the offending file and line number.
+    """
+    files = []
+    if os.path.isdir(path):
+        # Sorted so the record order does not depend on the OS listing order.
+        for fname in sorted(os.listdir(path)):
+            candidate = os.path.join(path, fname)
+            # isfile() skips sub-directories that merely end in .json/.jsonl.
+            if fname.endswith(('.jsonl', '.json')) and os.path.isfile(candidate):
+                files.append(candidate)
+    elif os.path.isfile(path):
+        files = [path]
+    if not files:
+        raise FileNotFoundError(f'No data files found in {path}')
+    records = []
+    for data_file in files:
+        with open(data_file, 'r', encoding='utf-8') as fh:
+            for line_no, line in enumerate(fh, start=1):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    records.append(json.loads(line))
+                except json.JSONDecodeError as exc:
+                    # Same exception type as before, with the location added.
+                    raise json.JSONDecodeError(
+                        f'{data_file}:{line_no}: {exc.msg}', exc.doc, exc.pos
+                    ) from exc
+    return Dataset.from_list(records)

src/data/preprocessing.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from transformers import AutoTokenizer
+def preprocess_examples(examples, tokenizer: "AutoTokenizer", max_input_length: int, max_output_length: int):
+    """Turn a batch of repair examples into tokenized model inputs and labels.
+
+    Args:
+        examples: Mapping of column name to list of values. Reads ``buggy`` and
+            ``fixed``; ``error_log`` is optional.
+        tokenizer: Callable tokenizer; called with ``truncation``, ``padding``
+            and ``max_length`` and expected to return an object exposing
+            ``input_ids`` that also supports item assignment.
+        max_input_length: Padded/truncated length of the prompt.
+        max_output_length: Padded/truncated length of the target.
+
+    Returns:
+        The tokenizer output for the prompts with a ``labels`` entry added.
+    """
+    buggy_col = examples.get('buggy', [])
+    fixed_col = examples.get('fixed', [])
+    # A batch without an error_log column used to make zip() yield nothing and
+    # silently drop every example; treat the column as "no log" instead.
+    error_col = examples.get('error_log') or [None] * len(buggy_col)
+    inputs = []
+    targets = []
+    for buggy, fixed, err in zip(buggy_col, fixed_col, error_col):
+        prompt = "### Buggy code:\n" + buggy
+        if err:
+            prompt += "\n### Error Log:\n" + err
+        inputs.append(prompt)
+        targets.append(fixed)
+    model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=max_input_length)
+    labels = tokenizer(targets, truncation=True, padding='max_length', max_length=max_output_length).input_ids
+    # Replace padding positions with -100 so the training loss ignores them.
+    pad_id = getattr(tokenizer, 'pad_token_id', None)
+    if pad_id is not None:
+        labels = [[-100 if tok == pad_id else tok for tok in seq] for seq in labels]
+    model_inputs['labels'] = labels
+    return model_inputs

src/model/evaluate_model.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from src.model.inference import AutoCodeFixer
+def main():
+    """Run the fixer on one hard-coded buggy snippet and print the result."""
+    snippet = 'def add(a,b)\n return a + b'
+    repaired_code, note = AutoCodeFixer().repair(snippet)
+    print('Sample fixed code:\n', repaired_code)
+    print('Explanation:\n', note)
+
+
+if __name__ == '__main__':
+    main()

src/model/inference.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import re
+class AutoCodeFixer:
+    """Heuristic stand-in for a code-repair model (lightweight demo).
+
+    No model is loaded. ``repair`` applies regex fixes that add the missing
+    colon after a Python ``def`` header followed by ``return`` and after a
+    ``class`` header.
+    """
+
+    # Line breaks are matched both as a real newline and as the two-character
+    # sequence backslash + "n". The previous patterns matched only the latter,
+    # so ordinary multi-line source was never repaired.
+    _DEF_RETURN_RE = re.compile(r"def (\w+\(.*\))(?:\\n|\n)\s+return")
+    _CLASS_RE = re.compile(r"class (\w+(?:\(.*\))?)(?:\\n|\n)")
+
+    _APPLIED_MSG = 'Applied heuristic fixes: added missing colon(s) and normalized simple indentation.'
+    _NO_CHANGE_MSG = 'No heuristic fixes were applicable; the code was returned unchanged.'
+
+    def __init__(self, model_dir: str = None):
+        # lightweight demo: no large model loaded; the path is only stored.
+        self.model_dir = model_dir
+
+    def _simple_fix(self, buggy: str):
+        """Return ``buggy`` with the regex fixes applied and a trailing newline."""
+        # "def f(x)<newline>  return" -> "def f(x):<newline>    return"
+        fixed = self._DEF_RETURN_RE.sub(r"def \1:\n    return", buggy)
+        # "class Foo<newline>" / "class Foo(Base)<newline>" -> header + ":"
+        fixed = self._CLASS_RE.sub(r"class \1:\n", fixed)
+        # ensure trailing newline
+        if not fixed.endswith('\n'):
+            fixed += '\n'
+        return fixed
+
+    def repair(self, buggy: str, error_log: str = None, max_new_tokens: int = 256):
+        """Repair ``buggy`` heuristically.
+
+        Args:
+            buggy: Source code to repair.
+            error_log: Accepted for interface compatibility; not used by the heuristics.
+            max_new_tokens: Accepted for interface compatibility; not used by the heuristics.
+
+        Returns:
+            ``(fixed_code, explanation)``; the explanation states whether any
+            heuristic actually changed the code.
+        """
+        fixed = self._simple_fix(buggy)
+        # Appending the trailing newline alone does not count as a fix.
+        if fixed in (buggy, buggy + '\n'):
+            return fixed, self._NO_CHANGE_MSG
+        return fixed, self._APPLIED_MSG

src/model/model_builder.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+def build_model_and_tokenizer(model_name: str = 'Salesforce/codet5p-small'):
+    """Load a seq2seq model and its tokenizer, falling back to a tiny checkpoint.
+
+    Args:
+        model_name: Identifier passed to ``from_pretrained``.
+
+    Returns:
+        ``(model, tokenizer)``. If loading ``model_name`` fails, the pair for
+        ``'sshleifer/tiny-mbart'`` is returned instead and a warning is logged.
+
+    Raises:
+        Exception: Whatever ``from_pretrained`` raises when the fallback
+            checkpoint cannot be loaded either.
+    """
+    import logging  # local import: this block adds logging without touching file-level imports
+
+    fallback_name = 'sshleifer/tiny-mbart'
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+    except Exception as exc:
+        # Deliberate best-effort boundary for offline/demo use, but the swap to
+        # a different model must be visible rather than silent.
+        if model_name == fallback_name:
+            # Retrying the very same checkpoint cannot succeed.
+            raise
+        logging.getLogger(__name__).warning(
+            'Could not load %r (%s); falling back to %r', model_name, exc, fallback_name
+        )
+        tokenizer = AutoTokenizer.from_pretrained(fallback_name, use_fast=True)
+        model = AutoModelForSeq2SeqLM.from_pretrained(fallback_name)
+    return model, tokenizer

src/model/train_model.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import argparse, yaml
+from src.model.model_builder import build_model_and_tokenizer
+def main():
+    """Parse ``--config``, read the YAML file and build the model and tokenizer.
+
+    This is a scaffold: nothing is trained; the function only confirms that the
+    configured model can be loaded.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--config', type=str, default='src/config/model_config.yaml')
+    args = parser.parse_args()
+    with open(args.config, 'r', encoding='utf-8') as fh:
+        # safe_load returns None for an empty file; fall back to an empty mapping.
+        cfg = yaml.safe_load(fh) or {}
+    model_name = cfg.get('model_name')
+    if model_name:
+        model, tokenizer = build_model_and_tokenizer(model_name)
+    else:
+        # No model_name configured: let the builder use its own default rather
+        # than overriding it with None.
+        model, tokenizer = build_model_and_tokenizer()
+    print('Loaded model and tokenizer (demo). This script is a scaffold for real training.')
+
+
+if __name__ == '__main__':
+    main()

src/utils/helper.py ADDED Viewed

	@@ -0,0 +1,3 @@

+import difflib
+def unified_diff(a, b):
+    """Return the unified diff between strings ``a`` and ``b`` as one string."""
+    before = a.splitlines()
+    after = b.splitlines()
+    # lineterm='' keeps difflib from adding its own newline to the header lines.
+    delta_lines = difflib.unified_diff(before, after, lineterm='')
+    return '\n'.join(delta_lines)

src/utils/logger.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import logging
+def get_logger(name=__name__):
+    """Configure root logging (INFO, timestamped format) and return the named logger."""
+    log_format = '%(asctime)s - %(levelname)s - %(message)s'
+    logging.basicConfig(format=log_format, level=logging.INFO)
+    named_logger = logging.getLogger(name)
+    return named_logger

src/utils/metrics.py ADDED Viewed

	@@ -0,0 +1,6 @@

+def exact_match_score(preds, refs):
+    """Return the fraction of predictions equal to their reference after stripping.
+
+    Pairs are formed with ``zip``; the denominator is ``len(preds)``, floored
+    at 1 so an empty prediction list scores 0.0.
+    """
+    hits = sum(1 for guess, gold in zip(preds, refs) if guess.strip() == gold.strip())
+    total = max(1, len(preds))
+    return hits / total

tests/test_api.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from fastapi.testclient import TestClient
+from src.app.main import app
+client = TestClient(app)
+def test_root():
+    """GET / answers with HTTP 200."""
+    response = client.get('/')
+    assert response.status_code == 200
+def test_repair_endpoint():
+    """POST /repair answers with HTTP 200 and both result fields."""
+    response = client.post('/repair', json={"buggy": "def add(a, b)\n    return a + b"})
+    assert response.status_code == 200
+    body = response.json()
+    assert 'fixed' in body and 'explanation' in body

tests/test_data.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import pytest
+from src.data.data_loader import load_jsonl_dataset
+import os
+def test_load_no_files_raises(tmp_path):
+    """Loading from an empty directory raises FileNotFoundError."""
+    empty_dir = str(tmp_path)
+    with pytest.raises(FileNotFoundError):
+        load_jsonl_dataset(empty_dir)

tests/test_inference.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from src.model.inference import AutoCodeFixer
+def test_repair_returns_tuple():
+    """repair() yields a (str, str) pair."""
+    snippet = 'def add(a, b)\n    return a + b'
+    repaired_code, note = AutoCodeFixer().repair(snippet)
+    assert isinstance(repaired_code, str)
+    assert isinstance(note, str)

tests/test_model.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from src.model.model_builder import build_model_and_tokenizer
+def test_build_model_tokenizer():
+    """The builder returns a model and a tokenizer for the tiny checkpoint."""
+    built = build_model_and_tokenizer('sshleifer/tiny-mbart')
+    model, tokenizer = built
+    assert model is not None
+    assert tokenizer is not None