hmnshudhmn24 committed on
Commit
bcb577e
·
verified ·
1 Parent(s): e8b170b

Upload 11 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
1
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  *.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
LICENSE ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Apache License 2.0
2
+
3
+ Copyright 2025 hmnshudhmn24
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
README.md CHANGED
@@ -1,3 +1,58 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ datasets: daily_dialog
5
+ pipeline_tag: text-generation
6
+ library_name: transformers
7
+ tags:
8
+ - gpt2
9
+ - conversational
10
+ - chatbot
11
+ - nlp
12
+ base_model: gpt2
13
+ ---
14
+
15
+ # GPT-2 Personal Assistant
16
+
17
+ **Model repo:** `hmnshudhmn24/gpt2-personal-assistant`
18
+ A lightweight conversational assistant based on **GPT-2**, fine-tuned on the **DailyDialog** dataset for chat and casual Q&A.
19
+
20
+ ## Model details
21
+ - **Base model:** gpt2
22
+ - **Task:** Conversational text generation / Chatbot
23
+ - **Dataset used for demo:** daily_dialog (small subset used in training script for quick demo)
24
+ - **Language:** English
25
+ - **License:** Apache-2.0
26
+
27
+ ## How to use (inference)
28
+
29
+ ```python
30
+ from transformers import pipeline
31
+
32
+ generator = pipeline("text-generation", model="hmnshudhmn24/gpt2-personal-assistant")
33
+ prompt = "User: Hello\nAssistant: Hi! How can I help you?\nUser: What's the weather like today?\nAssistant:"
34
+ print(generator(prompt, max_length=100, num_return_sequences=1)[0]["generated_text"])
35
+ ```
36
+
37
+ ## Train locally (quick demo)
38
+ Run:
39
+ ```bash
40
+ python train_chatbot.py
41
+ ```
42
+ This script fine-tunes `gpt2` on a subset of the DailyDialog dataset and saves the model and tokenizer to the `./gpt2-personal-assistant` folder.
43
+
44
+ ## Files in this repo
45
+ - `config.json`, `tokenizer_config.json`, `special_tokens_map.json` — model/tokenizer configs
46
+ - `train_chatbot.py` — training script (demo)
47
+ - `inference.py` — simple inference example
48
+ - `utils.py` — helper to build conversation prompts
49
+ - `example_conversations.txt` — small sample dialogues
50
+ - `requirements.txt` — Python dependencies
51
+
52
+ ## Notes & limitations
53
+ - GPT-2 is a general-purpose LM; it can generate incorrect or unsafe outputs. Do not rely on it for critical advice.
54
+ - For production, use larger datasets, more epochs, and safety filtering.
55
+ - If uploading to Hugging Face, include the trained weights after training (`model.safetensors`, or `pytorch_model.bin` when saved by older Transformers versions).
56
+
57
+ ## License
58
+ Apache-2.0
config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GPT2LMHeadModel"
4
+ ],
5
+ "model_type": "gpt2",
6
+ "n_ctx": 1024,
7
+ "n_embd": 768,
8
+ "n_layer": 12,
9
+ "n_head": 12
10
+ }
example_conversations.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ User: Hi, how are you?
2
+ Assistant: I'm good — thanks! How can I assist you today?
3
+
4
+ User: Tell me a short joke.
5
+ Assistant: Why did the scarecrow win an award? Because he was outstanding in his field!
6
+
7
+ User: How can I improve my focus while studying?
8
+ Assistant: Create a distraction-free environment, use short focused sessions (25–50 minutes), take regular breaks, and set clear goals.
inference.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # inference.py
2
+ from transformers import pipeline
3
+ from utils import build_conversation_prompt
4
+
5
+ MODEL_ID = "hmnshudhmn24/gpt2-personal-assistant"
6
+
7
def chat_once(model_id=MODEL_ID):
    """Run one hard-coded demo exchange through the model and print the result.

    Args:
        model_id: Model identifier handed to the text-generation pipeline as
            both ``model`` and ``tokenizer``. Defaults to ``MODEL_ID``.

    Returns:
        The ``generated_text`` string of the first (only) returned sequence,
        which is also printed to stdout.
    """
    # Function-scope import: this module only imports `pipeline` at the top.
    import torch

    # Device index 0 when CUDA is available, otherwise -1.
    device = 0 if torch.cuda.is_available() else -1
    generator = pipeline(
        "text-generation",
        model=model_id,
        tokenizer=model_id,
        device=device,
    )

    history = [
        "User: Hello!",
        "Assistant: Hi there! How can I help you today?",
    ]
    user_input = "Can you summarize the benefits of exercise?"
    prompt = build_conversation_prompt(
        history,
        user_input,
        system_prompt="You are a helpful assistant.",
    )

    # do_sample=False requests deterministic decoding.
    # NOTE(review): 50256 is presumably GPT-2's <|endoftext|> id reused as the
    # pad id -- confirm against the tokenizer actually shipped with the model.
    outputs = generator(
        prompt,
        max_length=300,
        num_return_sequences=1,
        do_sample=False,
        pad_token_id=50256,
    )
    generated = outputs[0]["generated_text"]
    print(generated)
    return generated
17
+
18
+ if __name__ == "__main__":
19
+ chat_once()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers>=4.44.0
2
+ datasets>=2.21.0
3
+ torch>=1.12.0
4
+ accelerate>=0.20.3
5
+ sentencepiece
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "<|endoftext|>",
3
+ "bos_token": "<|endoftext|>",
4
+ "unk_token": "<|unk|>",
5
+ "pad_token": "<|pad|>"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model_max_length": 1024,
3
+ "padding_side": "left",
4
+ "truncation_side": "right"
5
+ }
train_chatbot.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # train_chatbot.py
2
+ import os
3
+ from datasets import load_dataset
4
+ from transformers import (
5
+ GPT2TokenizerFast,
6
+ GPT2LMHeadModel,
7
+ DataCollatorForLanguageModeling,
8
+ Trainer,
9
+ TrainingArguments
10
+ )
11
+ import torch
12
+
13
+ # === EDITABLE SETTINGS ===
14
+ HF_USERNAME = "hmnshudhmn24"
15
+ REPO_ID = f"{HF_USERNAME}/gpt2-personal-assistant"
16
+ BASE_MODEL = "gpt2"
17
+ OUTPUT_DIR = "./results"
18
+ MAX_TRAIN_SAMPLES = 4000
19
+ MAX_VAL_SAMPLES = 500
20
+ EPOCHS = 1
21
+ BATCH_SIZE = 4
22
+ LEARNING_RATE = 5e-5
23
+ # =========================
24
+
25
def prepare_dataset():
    """Load DailyDialog and flatten each dialogue into a single ``text`` string.

    The train and validation splits are first cut down to at most
    ``MAX_TRAIN_SAMPLES`` / ``MAX_VAL_SAMPLES`` rows, so only the rows that are
    kept get converted. Each dialogue is then rendered one turn per line, with
    turns labelled alternately ``User:`` and ``Assistant:``. This is the same
    layout the inference prompts in this repo use, so the fine-tuned model
    sees the format it will later be prompted with.

    Returns:
        The dataset dict returned by ``load_dataset`` with its original columns
        replaced by a single ``text`` column, and with the ``train`` and
        ``validation`` splits truncated as described above.
    """
    ds = load_dataset("daily_dialog")

    # Truncate before mapping: converting rows that are about to be dropped
    # is wasted work.
    ds["train"] = ds["train"].select(
        range(min(MAX_TRAIN_SAMPLES, len(ds["train"])))
    )
    ds["validation"] = ds["validation"].select(
        range(min(MAX_VAL_SAMPLES, len(ds["validation"])))
    )

    speakers = ("User", "Assistant")

    def to_text(ex):
        # Even-indexed turns are labelled User, odd-indexed ones Assistant.
        # NOTE(review): assumes dialogues are two-party and strictly
        # alternating -- confirm against the dataset card.
        lines = []
        for idx, utterance in enumerate(ex["dialog"]):
            cleaned = utterance.strip()
            if cleaned:
                lines.append(f"{speakers[idx % 2]}: {cleaned}")
        return {"text": "\n".join(lines)}

    return ds.map(to_text, remove_columns=ds["train"].column_names)
35
+
36
def main():
    """Fine-tune ``BASE_MODEL`` on the prepared dialogues and save the result.

    Steps:
      1. Load the GPT-2 tokenizer and model; add a ``<|pad|>`` token when the
         tokenizer has none and resize the embedding matrix to match.
      2. Tokenize the ``text`` column (truncated to 512 tokens).
      3. Train with ``Trainer`` using a causal-LM collator (``mlm=False``),
         evaluating and checkpointing once per epoch into ``OUTPUT_DIR``.
      4. Save the final model and tokenizer to ``./gpt2-personal-assistant``.

    Returns:
        None. Artifacts are written to disk and a confirmation is printed.
    """
    # Function-scope import: only needed for the Trainer keyword check below.
    import inspect

    tokenizer = GPT2TokenizerFast.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
    model = GPT2LMHeadModel.from_pretrained(BASE_MODEL)
    # The vocabulary may have grown by the pad token, so the embedding table
    # has to be resized to match.
    model.resize_token_embeddings(len(tokenizer))
    # Record the pad id in the saved config so later generation does not have
    # to be told it explicitly.
    model.config.pad_token_id = tokenizer.pad_token_id

    ds = prepare_dataset()

    def tokenize_batch(examples):
        return tokenizer(examples["text"], truncation=True, max_length=512)

    tokenized = ds.map(tokenize_batch, batched=True, remove_columns=["text"])

    # mlm=False selects causal (next-token) language modelling.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        # `evaluation_strategy` was renamed to `eval_strategy`; the old
        # spelling is rejected by newer releases allowed by requirements.txt.
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        # Mixed precision only when a CUDA device is present.
        fp16=torch.cuda.is_available(),
        push_to_hub=False,
        logging_steps=100,
    )

    # Newer Trainer versions take the tokenizer as `processing_class` and
    # deprecate `tokenizer`; pick whichever keyword this install accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = (
        "processing_class" if "processing_class" in trainer_params else "tokenizer"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        data_collator=data_collator,
        **{tokenizer_kwarg: tokenizer},
    )

    trainer.train()

    save_path = "./gpt2-personal-assistant"
    os.makedirs(save_path, exist_ok=True)
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model and tokenizer saved to {save_path}")
83
+
84
+ if __name__ == "__main__":
85
+ main()
utils.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils.py
2
+ from typing import List
3
+
4
def build_conversation_prompt(history: List[str], user_input: str, system_prompt: str = None) -> str:
    """Build a single prompt string for a causal LM from a conversation.

    The prompt is laid out one entry per line: the optional system prompt,
    every history entry in order, the new user message prefixed with
    ``"User: "``, and a trailing ``"Assistant:"`` cue for the model to
    complete. Each piece has surrounding whitespace stripped.

    Args:
        history: Earlier conversation lines, inserted as given (they are
            expected to carry their own speaker prefix). ``None`` is treated
            the same as an empty list.
        user_input: The current user message, without a speaker prefix.
        system_prompt: Optional introductory line placed first. Omitted when
            ``None`` or empty.

    Returns:
        The assembled prompt, joined with newlines and ending in
        ``"Assistant:"``.
    """
    parts = []
    if system_prompt:
        parts.append(system_prompt.strip())
    # `history or ()` lets callers pass None for "no prior turns".
    parts.extend(entry.strip() for entry in (history or ()))
    parts.append("User: " + user_input.strip())
    parts.append("Assistant:")
    return "\n".join(parts)