hmnshudhmn24 committed on
Commit
67f450b
·
verified ·
1 Parent(s): c40f5d3

Upload 35 files

Browse files
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ .venv/
3
+ .env
4
+ *.pyc
5
+ *.pkl
6
+ *.pt
7
+ experiments/
8
+ models/
9
+ .DS_Store
LICENSE ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 AutoCodeFix
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
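+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.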
README.md CHANGED
@@ -1,3 +1,17 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AutoCodeFix (Lightweight Functional Version)
2
+
3
+ This is a lightweight, fully-runnable scaffold of the AutoCodeFix project. It contains example code,
4
+ dummy model logic (simulated), minimal FastAPI and Gradio interfaces, and tests so you can run locally
5
+ and upload to Hugging Face Spaces. The model folders are intentionally left empty for you to add real
6
+ model weights later.
7
+
8
+ ## Quickstart (local)
9
+ ```bash
10
+ python -m venv .venv
11
+ source .venv/bin/activate # on Windows use: .venv\Scripts\activate
12
+ pip install -r requirements.txt
13
+ # Run API
14
+ uvicorn src.app.main:app --reload
15
+ # Or run Gradio demo
16
+ python -m src.app.gradio_demo # run from the repo root so the `src` package is importable
17
+ ```
data/samples/sample.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"id": "1", "language": "python", "buggy": "def add(a, b)\n    return a + b", "fixed": "def add(a, b):\n    return a + b", "error_log": "SyntaxError: invalid syntax"}
deployment/Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+ WORKDIR /app
3
+ COPY . /app
4
+ RUN pip install --upgrade pip
5
+ RUN pip install -r requirements.txt
6
+ EXPOSE 8000
7
+ CMD ["uvicorn", "src.app.main:app", "--host", "0.0.0.0", "--port", "8000"]
deployment/api_service.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
"""Launch the AutoCodeFix FastAPI app with uvicorn.

Intended to be started directly, e.g. ``python deployment/api_service.py``.
"""
import sys
from pathlib import Path

from uvicorn import run

# When this file is executed as a script, Python puts ``deployment/`` (the
# script's own directory) at the front of ``sys.path``, not the repository
# root, so the ``src`` package named in the import string below would not be
# importable. Add the repository root explicitly.
REPO_ROOT = str(Path(__file__).resolve().parent.parent)
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

if __name__ == '__main__':
    run('src.app.main:app', host='0.0.0.0', port=8000)
deployment/gradio_service.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
"""Serve the Gradio demo on all interfaces (port 7860).

Intended to be started directly, e.g. ``python deployment/gradio_service.py``.
"""
import sys
from pathlib import Path

# Executing this file as a script puts ``deployment/`` at the front of
# ``sys.path``; the ``src`` package lives one level up, so make the repository
# root importable before importing from it.
REPO_ROOT = str(Path(__file__).resolve().parent.parent)
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

from src.app.gradio_demo import iface  # noqa: E402  (import must follow the path fix)

if __name__ == '__main__':
    iface.launch(server_name='0.0.0.0', server_port=7860)
deployment/huggingface_spaces/app.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio entry point for the Hugging Face Spaces deployment (lightweight demo)."""
import gradio as gr
from src.model.inference import AutoCodeFixer

# The lightweight demo loads no model weights, hence no model directory.
fixer = AutoCodeFixer(model_dir=None)


def predict(buggy, error_log):
    """Repair a code snippet.

    Args:
        buggy: Source code pasted by the user.
        error_log: Optional error output accompanying the code.

    Returns:
        A ``(fixed_code, explanation)`` pair, one value per output component.
    """
    fixed, explanation = fixer.repair(buggy, error_log)
    return fixed, explanation


iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=15, placeholder='Buggy code'),
        gr.Textbox(lines=4, placeholder='Error log (optional)'),
    ],
    # Explicit labels (same wording as src/app/gradio_demo.py) so the two
    # result boxes are not shown with auto-generated names.
    outputs=[
        gr.Textbox(lines=15, label='Fixed Code'),
        gr.Textbox(lines=6, label='Explanation'),
    ],
    title='AutoCodeFix (Spaces - Lightweight)',
)

if __name__ == '__main__':
    iface.launch()
deployment/requirements_docker.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ -r ../requirements.txt
2
+ uvicorn[standard]
models/README.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ This folder contains placeholders for real model artifacts.
2
+ - models/starcoder2/ <-- leave empty until you upload model weights
3
+ - models/codet5plus/ <-- leave empty until you upload model weights
4
+ - models/tokenizer/ <-- leave empty until you upload tokenizer files
notebooks/01_data_exploration.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
notebooks/02_preprocessing.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
notebooks/03_model_finetuning.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
notebooks/04_evaluation.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
notebooks/05_inference_demo.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Notebook placeholder\nThis is a placeholder notebook. Replace with full notebooks for EDA/training."]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ gradio
4
+ transformers
5
+ torch
6
+ datasets
7
+ pandas
8
+ numpy
9
+ pytest
10
+ diff-match-patch
11
+ pyyaml
12
+ peft
setup.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
"""Packaging script for AutoCodeFix."""
from setuptools import find_namespace_packages, setup

setup(
    name='autocodefix',
    version='0.1.0',
    # The code base imports itself as ``src.<subpackage>`` (for example
    # ``from src.model.inference import AutoCodeFixer``), so ``src`` itself
    # must be the installed top-level package; mapping ``src/`` to the package
    # root would instead expose ``model``, ``app``, ... as top-level names.
    # Namespace discovery is used because the sub-directories appear to ship
    # without ``__init__.py`` files, which plain ``find_packages`` would skip.
    packages=find_namespace_packages(include=['src', 'src.*']),
)
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # AutoCodeFix package
src/app/gradio_demo.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Local Gradio front end for the lightweight AutoCodeFix demo."""
import gradio as gr
from src.model.inference import AutoCodeFixer

fixer = AutoCodeFixer()


def run_fix(buggy, error_log):
    """Run the fixer on ``buggy`` and hand back ``(fixed_code, explanation)``."""
    repaired_code, note = fixer.repair(buggy, error_log)
    return repaired_code, note


# Components are created up front (inputs first, then outputs) and wired below.
_code_input = gr.Textbox(lines=15, placeholder='Paste buggy code here')
_log_input = gr.Textbox(lines=4, placeholder='Error log (optional)')
_code_output = gr.Textbox(lines=15, label='Fixed Code')
_note_output = gr.Textbox(lines=6, label='Explanation')

iface = gr.Interface(
    fn=run_fix,
    inputs=[_code_input, _log_input],
    outputs=[_code_output, _note_output],
    title='AutoCodeFix (Lightweight)',
    description='Paste buggy code and optional error log — model returns a repaired version and a brief explanation.',
)

if __name__ == '__main__':
    iface.launch()
src/app/main.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""FastAPI service exposing the AutoCodeFix heuristic repairer."""
from typing import Optional

from fastapi import FastAPI
from pydantic import BaseModel
from src.model.inference import AutoCodeFixer

app = FastAPI(title='AutoCodeFix API')
fixer = AutoCodeFixer()


class RepairRequest(BaseModel):
    """Request body accepted by ``POST /repair``."""

    # Source code to repair (required).
    buggy: str
    # Optional error output for the code. Declared ``Optional`` so that a
    # client may omit the field *or* send an explicit JSON ``null``; a plain
    # ``str = None`` annotation rejects ``null`` under Pydantic v2.
    error_log: Optional[str] = None


@app.post('/repair')
async def repair_code(req: RepairRequest):
    """Repair the submitted code and return the result with an explanation."""
    fixed, explanation = fixer.repair(req.buggy, req.error_log)
    return {'fixed': fixed, 'explanation': explanation}


@app.get('/')
async def root():
    """Liveness endpoint: returns a static status message."""
    return {'message': 'AutoCodeFix is running (lightweight demo)'}
src/config/model_config.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Model and fine-tuning settings; this is the default --config of
# src/model/train_model.py.
# NOTE(review): confirm this checkpoint id exists on the Hugging Face Hub --
# if it does not, model_builder falls back to a tiny test model.
model_name: "Salesforce/codet5p-small"
task: "code-repair"
max_input_length: 1024
max_output_length: 512
# Written with a decimal point on purpose: PyYAML follows YAML 1.1, where a
# bare `3e-5` is loaded as the string "3e-5" rather than a float.
learning_rate: 3.0e-5
batch_size: 8
num_train_epochs: 1
# LoRA adapter hyper-parameters (nested under `lora`).
lora:
  r: 8
  alpha: 16
  dropout: 0.05
src/config/training_args.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ output_dir: experiments/checkpoints
2
+ logging_dir: experiments/logs
3
+ save_steps: 500
4
+ eval_steps: 500
5
+ logging_steps: 100
6
+ per_device_train_batch_size: 4
7
+ per_device_eval_batch_size: 4
8
+ warmup_steps: 100
9
+ weight_decay: 0.01
src/data/data_loader.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+ from datasets import Dataset
3
+
4
def load_jsonl_dataset(path: str):
    """Load JSON-lines records from a file or directory into a ``Dataset``.

    Args:
        path: Either a single data file, or a directory whose ``*.jsonl`` /
            ``*.json`` files are all read. Every non-blank line of every file
            must be one JSON document.

    Returns:
        ``Dataset.from_list`` applied to all parsed records, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist or yields no data files.
        json.JSONDecodeError: If a line is not valid JSON; the message names
            the offending file and line.
    """
    if os.path.isdir(path):
        # Sorted so the record order is reproducible: os.listdir returns
        # entries in arbitrary order. Sub-directories that merely carry a
        # matching suffix are skipped because they cannot be opened as files.
        candidates = (
            os.path.join(path, fname)
            for fname in os.listdir(path)
            if fname.endswith(('.jsonl', '.json'))
        )
        files = sorted(p for p in candidates if os.path.isfile(p))
    elif os.path.isfile(path):
        files = [path]
    else:
        files = []
    if not files:
        raise FileNotFoundError(f'No data files found in {path}')

    records = []
    for fpath in files:
        with open(fpath, 'r', encoding='utf-8') as fh:
            for lineno, raw in enumerate(fh, start=1):
                line = raw.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as err:
                    # Same exception type as before, but say where it happened.
                    raise json.JSONDecodeError(
                        f'{err.msg} [{fpath}, file line {lineno}]', err.doc, err.pos
                    ) from err
    return Dataset.from_list(records)
src/data/preprocessing.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer
2
+
3
def preprocess_examples(
    examples,
    tokenizer: "AutoTokenizer",
    max_input_length: int,
    max_output_length: int,
    ignore_pad_token_for_loss: bool = True,
):
    """Turn a batch of repair examples into tokenized model inputs and labels.

    Args:
        examples: Mapping of column name to list of values. Reads ``'buggy'``
            and ``'fixed'``, plus the optional ``'error_log'`` column.
        tokenizer: Callable tokenizer; its result must support item assignment
            and expose ``input_ids``.
        max_input_length: Pad/truncate length for the prompts.
        max_output_length: Pad/truncate length for the targets.
        ignore_pad_token_for_loss: When true (default) and the tokenizer has a
            ``pad_token_id``, padding positions in the labels are replaced by
            ``-100`` so that they are ignored by the training loss.

    Returns:
        The tokenizer output for the prompts with an added ``'labels'`` entry.
    """
    buggy_col = examples.get('buggy', [])
    fixed_col = examples.get('fixed', [])
    # zip() stops at the shortest column, so a missing (or empty) error_log
    # column must be padded with None instead of silently dropping the batch.
    error_col = examples.get('error_log') or [None] * len(buggy_col)

    inputs = []
    targets = []
    for buggy, fixed, err in zip(buggy_col, fixed_col, error_col):
        prompt = "### Buggy code:\n" + buggy
        if err:
            prompt += "\n### Error Log:\n" + err
        inputs.append(prompt)
        targets.append(fixed)

    model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=max_input_length)
    labels = tokenizer(targets, truncation=True, padding='max_length', max_length=max_output_length).input_ids

    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if ignore_pad_token_for_loss and pad_id is not None:
        # Labels are padded to max_length above; mask the padding out of the loss.
        labels = [[-100 if tok == pad_id else tok for tok in seq] for seq in labels]

    model_inputs['labels'] = labels
    return model_inputs
src/model/evaluate_model.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.model.inference import AutoCodeFixer
2
+
3
def main():
    """Run the fixer on one hard-coded sample and print both results."""
    snippet = 'def add(a,b)\n return a + b'
    repaired, note = AutoCodeFixer().repair(snippet)
    # print() with two arguments keeps the original "heading<space>text" output.
    for heading, text in (('Sample fixed code:\n', repaired), ('Explanation:\n', note)):
        print(heading, text)


if __name__ == '__main__':
    main()
src/model/inference.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
class AutoCodeFixer:
    """Heuristic stand-in for a learned code-repair model.

    The only repair implemented is adding a missing ``:`` to single-line
    Python ``def`` / ``class`` headers. No model weights are loaded.
    """

    # ``def name`` / ``class Name`` at the start of a line (optionally ``async def``).
    _HEADER_RE = re.compile(r"^[ \t]*(?:async[ \t]+)?(def|class)[ \t]+\w+")

    def __init__(self, model_dir: str = None):
        # lightweight demo: no large model loaded
        self.model_dir = model_dir

    @classmethod
    def _needs_colon(cls, line: str) -> bool:
        """Return True if ``line`` is a complete def/class header lacking ``:``.

        Deliberately conservative: lines with a comment, signatures that
        continue on the next line, and headers followed by anything other
        than an optional ``-> annotation`` are left untouched.
        """
        match = cls._HEADER_RE.match(line)
        if match is None or '#' in line:
            return False
        keyword = match.group(1)
        tail = line[match.end():].strip()
        if keyword == 'class' and not tail:
            return True  # bare ``class Name``
        if not tail.startswith('('):
            return False
        depth = 0
        for pos, ch in enumerate(tail):
            if ch in '([{':
                depth += 1
            elif ch in ')]}':
                depth -= 1
                if depth == 0:
                    rest = tail[pos + 1:].strip()
                    break
        else:
            # Parenthesis never closed: the signature spans several lines.
            return False
        if not rest:
            return True
        return keyword == 'def' and rest.startswith('->') and ':' not in rest

    def _simple_fix(self, buggy: str):
        """Add missing header colons line by line; ensure a trailing newline.

        Works on real newlines (the previous regex only matched a literal
        backslash followed by ``n``) and keeps every line's own indentation
        and line ending.
        """
        repaired = []
        for raw in buggy.splitlines(keepends=True):
            body = raw.rstrip('\r\n')
            ending = raw[len(body):]
            if self._needs_colon(body):
                body = body.rstrip() + ':'
            repaired.append(body + ending)
        fixed = ''.join(repaired)
        # ensure trailing newline
        if not fixed.endswith('\n'):
            fixed += '\n'
        return fixed

    def repair(self, buggy: str, error_log: str = None, max_new_tokens: int = 256):
        """Repair ``buggy`` and describe what was done.

        Args:
            buggy: Source code to repair.
            error_log: Accepted for interface compatibility; unused by the heuristic.
            max_new_tokens: Accepted for interface compatibility; unused by the heuristic.

        Returns:
            ``(fixed_code, explanation)``; the explanation states whether a
            fix was actually applied.
        """
        fixed = self._simple_fix(buggy)
        # A newly appended trailing newline alone does not count as a repair.
        changed = fixed != buggy and fixed != buggy + '\n'
        if changed:
            explanation = 'Applied heuristic fixes: added missing colon(s) to def/class header(s).'
        else:
            explanation = 'No heuristic fix applied: no def/class header with a missing colon was found.'
        return fixed, explanation
src/model/model_builder.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
+
3
def build_model_and_tokenizer(model_name: str = 'Salesforce/codet5p-small'):
    """Load a seq2seq model and its tokenizer, with a tiny fallback model.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.

    Returns:
        ``(model, tokenizer)``. If ``model_name`` cannot be loaded, the pair
        for ``'sshleifer/tiny-mbart'`` is returned instead and a warning with
        the original error is logged.

    Raises:
        Whatever ``from_pretrained`` raises if the fallback cannot be loaded either.
    """
    import logging

    fallback_name = 'sshleifer/tiny-mbart'
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    except Exception:
        # Deliberately broad (offline/demo fallback), but never silent: the
        # caller would otherwise receive a different model without notice.
        logging.getLogger(__name__).warning(
            'Could not load %r; falling back to %r',
            model_name,
            fallback_name,
            exc_info=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(fallback_name, use_fast=True)
        model = AutoModelForSeq2SeqLM.from_pretrained(fallback_name)
    return model, tokenizer
src/model/train_model.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse, yaml
2
+ from src.model.model_builder import build_model_and_tokenizer
3
+
4
def main():
    """Load the model named in a YAML config (scaffold for real training).

    Command line:
        --config PATH   YAML file with a ``model_name`` key
                        (default: ``src/config/model_config.yaml``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='src/config/model_config.yaml')
    args = parser.parse_args()
    with open(args.config, 'r', encoding='utf-8') as fh:
        # safe_load returns None for an empty document; treat that as "no settings".
        cfg = yaml.safe_load(fh) or {}
    model_name = cfg.get('model_name')
    if model_name:
        model, tokenizer = build_model_and_tokenizer(model_name)
    else:
        # No name configured: use the builder's own default instead of passing None.
        model, tokenizer = build_model_and_tokenizer()
    print('Loaded model and tokenizer (demo). This script is a scaffold for real training.')


if __name__ == '__main__':
    main()
src/utils/helper.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ import difflib
2
def unified_diff(a, b, *, fromfile='', tofile='', n=3):
    """Return a unified diff of two multi-line strings as a single string.

    Args:
        a: Original text.
        b: Modified text.
        fromfile: Label shown on the ``---`` header line (default: empty).
        tofile: Label shown on the ``+++`` header line (default: empty).
        n: Number of context lines around each change (default: 3).

    Returns:
        The diff lines joined with ``'\\n'``; an empty string when the texts
        have identical lines.
    """
    diff_lines = difflib.unified_diff(
        a.splitlines(),
        b.splitlines(),
        fromfile=fromfile,
        tofile=tofile,
        n=n,
        lineterm='',
    )
    return '\n'.join(diff_lines)
src/utils/logger.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import logging
2
def get_logger(name=__name__, level=None):
    """Return a named logger, configuring root logging on first use.

    Args:
        name: Logger name. The default is evaluated where this function is
            defined, so it is this module's name, not the caller's; pass
            ``__name__`` explicitly to get a per-module logger.
        level: Optional level to set on the returned logger. When omitted the
            logger's level is left untouched.

    Returns:
        The ``logging.Logger`` registered under ``name``.
    """
    # basicConfig only has an effect while the root logger has no handlers,
    # so repeated calls are harmless.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(name)
    if level is not None:
        logger.setLevel(level)
    return logger
src/utils/metrics.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
def exact_match_score(preds, refs):
    """Return the fraction of predictions equal to their reference.

    Strings are compared after stripping leading/trailing whitespace.

    Args:
        preds: Iterable of predicted strings.
        refs: Iterable of reference strings, aligned with ``preds``.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when there are no predictions.

    Raises:
        ValueError: If ``preds`` and ``refs`` differ in length (pairing them
            with ``zip`` would silently ignore the surplus items).
    """
    # Materialize so that generators/iterators are accepted and can be measured.
    preds = list(preds)
    refs = list(refs)
    if len(preds) != len(refs):
        raise ValueError(
            f'preds and refs must have the same length (got {len(preds)} and {len(refs)})'
        )
    if not preds:
        return 0.0
    matches = sum(p.strip() == r.strip() for p, r in zip(preds, refs))
    return matches / len(preds)
tests/test_api.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi.testclient import TestClient
2
+ from src.app.main import app
3
+
4
+ client = TestClient(app)
5
+
6
def test_root():
    """GET / answers with HTTP 200."""
    response = client.get('/')
    assert response.status_code == 200
9
+
10
def test_repair_endpoint():
    """POST /repair answers 200 and returns both result fields."""
    response = client.post('/repair', json={"buggy": "def add(a, b)\n return a + b"})
    assert response.status_code == 200
    body = response.json()
    assert 'fixed' in body
    assert 'explanation' in body
tests/test_data.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.data.data_loader import load_jsonl_dataset
3
+ import os
4
+
5
def test_load_no_files_raises(tmp_path):
    """An empty directory must be reported as 'no data files', not loaded as empty."""
    # match= pins the loader's own error, so an unrelated FileNotFoundError
    # (e.g. from a bad path inside the loader) cannot satisfy the test.
    with pytest.raises(FileNotFoundError, match='No data files found'):
        load_jsonl_dataset(str(tmp_path))
tests/test_inference.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from src.model.inference import AutoCodeFixer
2
+
3
def test_repair_returns_tuple():
    """repair() yields a (code, explanation) pair of strings."""
    result = AutoCodeFixer().repair('def add(a, b)\n return a + b')
    repaired, note = result
    for value in (repaired, note):
        assert isinstance(value, str)
tests/test_model.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from src.model.model_builder import build_model_and_tokenizer
2
+
3
def test_build_model_tokenizer():
    """The builder returns a usable (model, tokenizer) pair for the tiny test model."""
    import pytest

    try:
        model, tokenizer = build_model_and_tokenizer('sshleifer/tiny-mbart')
    except OSError as err:
        # Loading needs the checkpoint from the Hub (or a local cache); an
        # unreachable Hub is an environment problem, not a code failure.
        pytest.skip(f'model checkpoint unavailable: {err}')
    assert model is not None
    assert tokenizer is not None