hmnshudhmn24 committed
Commit 60a6370 · verified · 1 Parent(s): 30b5c37

Upload 13 files

.gitattributes CHANGED
@@ -1,35 +1 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,3 @@
+ Apache License 2.0
+
+ Copyright 2025 hmnshudhmn24
README.md CHANGED
@@ -1,3 +1,32 @@
- ---
- license: apache-2.0
- ---
+ # AutoDocGen 🧠 — AI Code Documentation & Test Generator
+
+ AutoDocGen is a fine-tuned **CodeT5** model that automatically generates
+ documentation, comments, and unit tests for source code files.
+
+ ---
+ ### 🚀 Features
+ - Auto-generate docstrings for Python functions
+ - Create unit tests from given functions
+ - Trainable and extensible on custom datasets
+
+ ---
+ ### 📦 Files
+ - `train_codet5_docgen.py` — training script
+ - `inference.py` — runs doc generation on your code
+ - `cli.py` — command-line interface
+ - `configs/train_config.json` — training parameters
+ - `data_examples/` — example dataset and code
+ - `tests/` — unit tests
+
+ ---
+ ### 🧠 Model Description
+ This model fine-tunes **CodeT5-small** from Hugging Face Transformers
+ on a dataset of Python code paired with natural-language descriptions.
+
+ ---
+ ### ⚖️ License
+ Licensed under the **Apache License 2.0**.
+
+ ---
+ ### ✍️ Author
+ Developed by **hmnshudhmn24** — 2025.
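For a quick end-to-end check, a minimal usage sketch, assuming the fine-tuned weights have already been saved to `trained_model/` by `train_codet5_docgen.py` (the CLI equivalent is `python cli.py data_examples/example_code.py`):

```python
# Minimal sketch: document a snippet with the fine-tuned model.
# Assumes train_codet5_docgen.py has already written "trained_model/".
from inference import generate_doc

print(generate_doc("def add(a, b): return a + b"))
# expected to print something like: "Add two numbers."
```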
cli.py ADDED
@@ -0,0 +1,11 @@
+ import argparse
+ from inference import generate_doc
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("file", help="Path to Python file to document")
+     args = parser.parse_args()
+
+     with open(args.file, "r") as f:
+         code = f.read()
+     print(generate_doc(code))
configs/train_config.json ADDED
@@ -0,0 +1 @@
+ { "epochs": 3, "batch_size": 4, "learning_rate": 5e-5 }
data_examples/example_code.py ADDED
@@ -0,0 +1,4 @@
+ def factorial(n):
+     if n == 0:
+         return 1
+     return n * factorial(n - 1)
data_examples/example_expected.md ADDED
@@ -0,0 +1,2 @@
+ ### factorial(n)
+ Computes the factorial of a number recursively.
data_examples/sample_dataset.jsonl ADDED
@@ -0,0 +1,2 @@
+ {"code": "def add(a, b): return a + b", "doc": "Add two numbers."}
+ {"code": "def square(x): return x*x", "doc": "Return square of x."}
inference.py ADDED
@@ -0,0 +1,13 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ def generate_doc(code_snippet):
+     model_name = "trained_model"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+     inputs = tokenizer(code_snippet, return_tensors="pt")
+     outputs = model.generate(**inputs, max_length=128)
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ if __name__ == "__main__":
+     print(generate_doc("def multiply(a, b): return a * b"))
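As written, `generate_doc` reloads the tokenizer and model from disk on every call. One possible variant (a sketch, not what the commit ships) caches them after the first load:

```python
# Sketch: load the fine-tuned model once and reuse it across calls.
# Assumes the "trained_model/" directory saved by train_codet5_docgen.py.
from functools import lru_cache
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

@lru_cache(maxsize=1)
def _load(model_name="trained_model"):
    return (AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name))

def generate_doc(code_snippet):
    tokenizer, model = _load()
    inputs = tokenizer(code_snippet, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
```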
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers
+ torch
+ datasets
+ numpy
tests/test_inference.py ADDED
@@ -0,0 +1 @@
+ def test_example(): assert True
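The shipped test is a placeholder. A slightly more substantive pytest sketch, which skips unless the trained model directory exists:

```python
# Sketch: exercise generate_doc end to end; skip if no trained model yet.
import os
import pytest

@pytest.mark.skipif(not os.path.isdir("trained_model"),
                    reason="trained_model/ not found; run training first")
def test_generate_doc_returns_text():
    from inference import generate_doc
    doc = generate_doc("def add(a, b): return a + b")
    assert isinstance(doc, str) and doc.strip()
```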
train_codet5_docgen.py ADDED
@@ -0,0 +1,32 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
+ from datasets import load_dataset
+ import torch, json
+
+ def train_model():
+     model_name = "Salesforce/codet5-small"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+     dataset = load_dataset("json", data_files={"train": "data_examples/sample_dataset.jsonl"})
+     def preprocess(batch):
+         inputs = tokenizer(batch["code"], truncation=True, padding="max_length", max_length=128)
+         labels = tokenizer(batch["doc"], truncation=True, padding="max_length", max_length=128)
+         inputs["labels"] = labels["input_ids"]
+         return inputs
+     tokenized = dataset["train"].map(preprocess, batched=True)
+
+     args = TrainingArguments(
+         output_dir="results",
+         num_train_epochs=3,
+         per_device_train_batch_size=2,
+         save_strategy="epoch",
+         logging_dir="logs",
+     )
+
+     trainer = Trainer(model=model, args=args, train_dataset=tokenized)
+     trainer.train()
+     model.save_pretrained("trained_model")
+     tokenizer.save_pretrained("trained_model")
+
+ if __name__ == "__main__":
+     train_model()
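One caveat in `preprocess` above: padded label positions keep real pad-token ids, so the loss is also computed on padding. A common fix (a sketch, not part of this commit) replaces pad ids in the labels with -100, which the Trainer's cross-entropy loss ignores:

```python
# Sketch: mask label padding with -100 so the loss skips those positions.
def preprocess(batch):
    inputs = tokenizer(batch["code"], truncation=True,
                       padding="max_length", max_length=128)
    labels = tokenizer(batch["doc"], truncation=True,
                       padding="max_length", max_length=128)
    inputs["labels"] = [
        [tok if tok != tokenizer.pad_token_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    return inputs
```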
utils.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ def count_lines(file_path):
2
+ with open(file_path) as f:
3
+ return len(f.readlines())