diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..671ff1d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,35 @@ +name: Python CI + +on: + pull_request: + push: + branches: + - main + - codex/chatbot-original-10b + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Compile Python files + run: python -m compileall chatbot.py chat_llm.py train_llm.py train_tokenizer.py train_10b.py src scripts + + - name: Run tests + run: pytest -q + + - name: Estimate 10B parameters + run: python scripts/estimate_params.py --config configs/chatbot-10b.yaml diff --git a/.gitignore b/.gitignore index 8e258b9..94be039 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,9 @@ venv/ # Training outputs can be large and should not be committed by accident. checkpoints/ +adapters/ +tokenizers/ +runs/ +wandb/ +*.safetensors +*.ckpt diff --git a/README.md b/README.md index d763f5c..dee96b4 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,28 @@ # ChatBot -ChatBot is a small LLM-style chatbot project built with PyTorch. It uses a -decoder-only Transformer trained with next-token prediction, which is the same -basic idea used by larger language models. +ChatBot is an educational LLM project built with PyTorch. The repository now +contains two related paths: + +- a tiny local model path for learning, tests, and CPU smoke runs +- an original untrained `ChatBot-10B` architecture for real large-scale training + +The 10B model is not a fine-tune of Qwen, Gemma, DeepSeek, gpt-oss, or any other +external model. 
Those projects are used only as architectural inspiration for +modern decoder design choices such as RoPE, RMSNorm, SwiGLU, grouped-query +attention, KV caching, tied embeddings, and bf16-friendly training. ## What changed -- The model is a compact GPT-like Transformer. -- Cornell Movie Dialogues still works as the default offline dataset. -- DailyDialog is available as an optional cleaner conversational dataset when - the `datasets` package can download it from Hugging Face. -- The project is modular: data loading, tokenization, modeling, training, and - chatting live in separate files. -- Previous experimental artifacts were removed so the repository now points - clearly at the upgraded architecture. +- Added an original dense `ChatBot-10B` config at about `9.999B` parameters. +- Replaced the previous simple Transformer internals with modular decoder + blocks: RMSNorm, RoPE, grouped-query attention, SwiGLU, and tied LM head. +- Added BPE tokenizer training through Hugging Face `tokenizers`. +- Added validation perplexity, top-p sampling, greedy decoding, beam search, and + repetition penalty controls. +- Added dataset recipes for Cornell, DailyDialog, UltraChat, OpenAssistant + OASST1, and Dolly 15k. +- Added tests and GitHub Actions CI for the tiny model path and parameter-count + checks. ## Repository structure @@ -22,183 +31,134 @@ basic idea used by larger language models. 
|-- chatbot.py |-- chat_llm.py |-- train_llm.py -|-- requirements.txt -|-- README.md +|-- train_tokenizer.py +|-- train_10b.py +|-- configs/ +| |-- chatbot-10b.yaml +| `-- chatbot-tiny.yaml +|-- scripts/ +| `-- estimate_params.py |-- data/ -| |-- cornell movie-dialogs corpus/ -| | |-- movie_lines.txt -| | |-- movie_conversations.txt -| | |-- movie_titles_metadata.txt -| | |-- movie_characters_metadata.txt -| | |-- raw_script_urls.txt -| | |-- chameleons.pdf -| | `-- README.txt +| |-- dataset_manifest.json +| `-- cornell movie-dialogs corpus/ +|-- tests/ +| |-- fixtures/ +| `-- test_*.py `-- src/ `-- chatbot/ - |-- __init__.py |-- chat.py |-- config.py |-- data.py |-- model.py |-- tokenizer.py - `-- train.py + |-- tokenizer_train.py + |-- train.py + `-- train_10b.py ``` -### Top-level scripts - -`train_llm.py` starts training. It is intentionally tiny because the real -training logic lives in `src/chatbot/train.py`. - -`chatbot.py` and `chat_llm.py` both start an interactive chat session from a -trained checkpoint. `chatbot.py` is kept as the familiar main entrypoint. - -### `src/chatbot/config.py` - -Contains the `ModelConfig` dataclass. This stores model size settings such as -embedding size, number of Transformer layers, number of attention heads, and -maximum context length. - -### `src/chatbot/data.py` - -Loads training examples. - -- `load_cornell_pairs()` reads Cornell files already stored under `data/`. -- `load_dailydialog_pairs()` optionally downloads DailyDialog through Hugging - Face datasets. -- `ConversationDataset` converts text into `(input_tokens, target_tokens)` pairs - for next-token prediction. - -### `src/chatbot/tokenizer.py` - -Implements a small word-level tokenizer. It lowercases text, separates words and -punctuation, keeps special tokens like `` and ``, and maps uncommon -words to ``. - -This is simpler than a production BPE tokenizer, but it keeps the project easy -to understand and removes the need for extra tokenizer files. 
- -### `src/chatbot/model.py` - -Defines `TransformerChatModel`, the new small LLM. It uses: - -- token embeddings -- positional embeddings -- causal self-attention through Transformer encoder layers -- a language-model head that predicts the next token - -The causal mask is what makes it LLM-like: each token can only look backward at -earlier tokens, never forward at the answer it is supposed to predict. - -### `src/chatbot/train.py` - -Handles the full training loop: +## Model notes -- reads the selected dataset -- builds the tokenizer -- creates train/validation splits -- trains the Transformer -- saves a checkpoint containing model weights, tokenizer vocabulary, config, and - basic metrics +`configs/chatbot-10b.yaml` defines the original 10B blueprint: -### `src/chatbot/chat.py` +```text +vocab_size: 128000 +n_layer: 36 +n_embd: 5120 +n_head: 40 +n_kv_head: 8 +ffn_hidden_size: 12800 +block_size: 4096 +``` -Loads a checkpoint and runs terminal inference. User messages are formatted as: +With tied input/output embeddings, this reports about `9.999B` parameters: -```text - your message +```powershell +python scripts/estimate_params.py --config configs/chatbot-10b.yaml ``` -The model then generates the bot side until it reaches a stop token or the -maximum response length. +Do not commit 10B checkpoints, randomly initialized weights, adapters, tokenizer +outputs, or training runs. They are intentionally ignored by `.gitignore`. ## Setup -Create and activate a virtual environment if you want one, then install the -dependencies: - ```powershell pip install -r requirements.txt ``` -`torch` is required. `datasets` is only required for DailyDialog. +For full 10B training, use a Linux multi-GPU environment with recent PyTorch, +bf16-capable GPUs, and a distributed training framework such as FSDP or DeepSpeed. +This repository provides the model and data pipeline, but ordinary laptops are +not expected to train the 10B config.
-## Training +## Tiny local training -Train on the bundled Cornell Movie Dialogues data: +Use the tiny config to verify the code path on CPU: ```powershell -python train_llm.py --dataset cornell --steps 2000 +python train_llm.py --dataset cornell --max-pairs 64 --steps 5 --batch-size 4 --config configs/chatbot-tiny.yaml --cpu ``` -For a quick CPU smoke run: +The checkpoint stores model config, tokenizer metadata, train args, metrics, and +model weights. -```powershell -python train_llm.py --dataset cornell --max-pairs 256 --steps 5 --batch-size 16 --cpu -``` +## BPE tokenizer -Train with the optional DailyDialog dataset: +Train a BPE tokenizer from the configured data mix: ```powershell -python train_llm.py --dataset dailydialog --steps 2000 +python train_tokenizer.py --dataset mixed --max-pairs 50000 --vocab-size 128000 --output tokenizers/chatbot-bpe.json ``` -Useful knobs: +For tiny experiments, lower `--vocab-size` and `--max-pairs`. -- `--max-pairs`: limit examples for experiments -- `--block-size`: maximum token context length -- `--n-layer`: number of Transformer layers -- `--n-head`: number of attention heads -- `--n-embd`: embedding size -- `--steps`: number of optimizer steps -- `--batch-size`: training batch size +## ChatBot-10B training -The default checkpoint path is: +After creating a BPE tokenizer, launch training with the 10B config: -```text -checkpoints/chatbot-small-llm.pt +```powershell +python train_10b.py --config configs/chatbot-10b.yaml --tokenizer bpe --tokenizer-path tokenizers/chatbot-bpe.json --dataset mixed ``` -## Chatting - -After training, start a chat session: +For real training, run the same entrypoint through your distributed launcher and +set batch size, gradient accumulation, precision, checkpointing, and output +locations for the target cluster. The full datasets are downloaded during +training; they are not stored in this repository. 
-```powershell -python chatbot.py --checkpoint checkpoints/chatbot-small-llm.pt -``` +## Dataset notes -You can also use: +The repo keeps Cornell Movie Dialogues bundled for offline experiments. The +other recipes are downloaded through Hugging Face `datasets` when requested: -```powershell -python chat_llm.py --checkpoint checkpoints/chatbot-small-llm.pt -``` +- `OpenRL/daily_dialog` +- `HuggingFaceH4/ultrachat_200k` +- `OpenAssistant/oasst1` +- `databricks/databricks-dolly-15k` -Type `quit`, `q`, or `exit` to stop. +Always review each dataset license and terms before training or publishing +weights. The test suite uses tiny synthetic fixtures, not full external data. -## Dataset notes +## Chatting -Cornell Movie Dialogues is large and already present in this repository, so it -is the most reliable default. It contains movie dialogue, which can be dramatic, -noisy, or old-fashioned. +After training a checkpoint: -DailyDialog is usually better for simple everyday conversation because it is -made of short multi-turn daily-life dialogues. It is optional because it needs -an internet download and the Hugging Face `datasets` package. +```powershell +python chatbot.py --checkpoint checkpoints/chatbot-small-llm.pt --temperature 0.8 --top-p 0.9 +``` -## Model notes +Useful inference controls: -This is a small educational LLM, not a production assistant. It learns from the -dataset you train it on and does not include instruction tuning, safety tuning, -retrieval, or external knowledge. 
Better responses usually require: +- `--greedy`: always choose the highest-scoring token +- `--top-k`: keep only the top k tokens before sampling +- `--top-p`: nucleus sampling threshold +- `--num-beams`: beam search width +- `--repetition-penalty`: reduce repeated tokens -- more training steps -- a cleaner dataset -- a larger model -- a subword tokenizer -- validation examples that match the chat style you want +## Tests -## Suggested next upgrades +```powershell +python -m compileall chatbot.py chat_llm.py train_llm.py train_tokenizer.py train_10b.py src scripts +pytest +python scripts/estimate_params.py --config configs/chatbot-10b.yaml +``` -- Add a byte-pair encoding tokenizer for better handling of rare words. -- Add perplexity tracking over a held-out validation file. -- Add beam search or nucleus sampling controls for inference. -- Add a small test suite for data loading, tokenization, and checkpoint loading. +CI runs these checks on Python 3.11 and verifies the tiny CPU training path. diff --git a/configs/chatbot-10b.yaml b/configs/chatbot-10b.yaml new file mode 100644 index 0000000..41dd612 --- /dev/null +++ b/configs/chatbot-10b.yaml @@ -0,0 +1,46 @@ +# This preset is an original dense decoder-only model blueprint. It describes +# the architecture only; it does not include trained weights. +model_name: chatbot-10b + +# The tokenizer can assign ids to up to 128k different subword pieces. +vocab_size: 128000 + +# block_size is the context window: the most tokens the model can read at once. +block_size: 4096 + +# n_embd is the hidden size, or the width of each token vector inside the model. +n_embd: 5120 + +# n_head is the number of query attention heads. +n_head: 40 + +# n_kv_head is smaller than n_head for grouped-query attention, saving memory. +n_kv_head: 8 + +# n_layer is the number of repeated Transformer decoder blocks. +n_layer: 36 + +# SwiGLU expands hidden vectors to this size before projecting them back down. 
+ffn_hidden_size: 12800 + +# Dropout is disabled for the large blueprint; production training can tune it. +dropout: 0.0 +attention_dropout: 0.0 + +# RoPE theta controls the scale of rotary position embeddings. +rope_theta: 1000000.0 + +# norm_eps prevents division by zero inside RMSNorm. +norm_eps: 0.00001 + +# New weights start as small random values with this standard deviation. +initializer_range: 0.02 + +# The padding token is id 0 in both SimpleTokenizer and BPETokenizer. +pad_token_id: 0 + +# Reuse the token embedding table as the output classifier to save parameters. +tie_embeddings: true + +# Modern decoder-only LLMs commonly omit linear-layer bias terms. +use_bias: false diff --git a/configs/chatbot-tiny.yaml b/configs/chatbot-tiny.yaml new file mode 100644 index 0000000..c8b16b8 --- /dev/null +++ b/configs/chatbot-tiny.yaml @@ -0,0 +1,30 @@ +# Tiny preset for tests and laptop smoke runs. It uses the same architecture as +# ChatBot-10B, but with very small dimensions so it can run on CPU. +model_name: chatbot-tiny + +# Keep the vocabulary small for fixtures and quick tests. +vocab_size: 128 + +# A short context window keeps training examples and tests fast. +block_size: 32 + +# Small hidden width, head count, and layer count make this model lightweight. +n_embd: 64 +n_head: 4 +n_kv_head: 2 +n_layer: 2 + +# The feed-forward layer still uses SwiGLU, just at tiny scale. +ffn_hidden_size: 128 + +# Dropout is disabled so tests are deterministic. +dropout: 0.0 +attention_dropout: 0.0 + +# These values mirror the main config style but stay cheap to execute. +rope_theta: 10000.0 +norm_eps: 0.00001 +initializer_range: 0.02 +pad_token_id: 0 +tie_embeddings: true +use_bias: false diff --git a/data/dataset_manifest.json b/data/dataset_manifest.json new file mode 100644 index 0000000..542c4ce --- /dev/null +++ b/data/dataset_manifest.json @@ -0,0 +1,26 @@ +{ + "cornell": { + "type": "local", + "description": "Bundled Cornell Movie Dialogues adjacent turns."
+ }, + "dailydialog": { + "type": "huggingface", + "dataset": "OpenRL/daily_dialog", + "description": "Short daily-life conversations." + }, + "ultrachat": { + "type": "huggingface", + "dataset": "HuggingFaceH4/ultrachat_200k", + "description": "Instruction-style user/assistant chat data." + }, + "oasst1": { + "type": "huggingface", + "dataset": "OpenAssistant/oasst1", + "description": "OpenAssistant conversation tree prompt/assistant pairs." + }, + "dolly": { + "type": "huggingface", + "dataset": "databricks/databricks-dolly-15k", + "description": "Instruction, optional context, and response examples." + } +} diff --git a/requirements.txt b/requirements.txt index 5e970d1..64b9864 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,7 @@ torch>=2.1 +tokenizers>=0.15 +PyYAML>=6.0 +pytest>=8.0 -# Optional: needed only when using --dataset dailydialog. +# Optional: needed when using Hugging Face dataset recipes. datasets>=2.18 diff --git a/scripts/estimate_params.py b/scripts/estimate_params.py new file mode 100644 index 0000000..0ec9952 --- /dev/null +++ b/scripts/estimate_params.py @@ -0,0 +1,38 @@ +"""Estimate model parameters without allocating weights.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + # Running scripts/estimate_params.py directly puts scripts/ first on sys.path. + # Adding the repo root lets Python import src.chatbot without installation. + sys.path.insert(0, str(ROOT)) + +from src.chatbot.config import ModelConfig +from src.chatbot.params import estimate_parameter_count + + +def main() -> None: + parser = argparse.ArgumentParser(description="Estimate ChatBot model parameter counts.") + parser.add_argument("--config", required=True, help="Path to a YAML model config.") + args = parser.parse_args() + + # The estimator reads the blueprint only.
It never creates a 10B model in + # memory, so it is safe to run on a laptop or in CI. + config = ModelConfig.from_yaml_file(args.config) + report = estimate_parameter_count(config) + print(f"model: {config.model_name}") + print(f"total_parameters: {report.total}") + print(f"total_billions: {report.total / 1_000_000_000:.3f}B") + print(f"embeddings: {report.embeddings}") + print(f"layers: {report.layers}") + print(f"final_norm: {report.final_norm}") + print(f"lm_head: {report.lm_head}") + + +if __name__ == "__main__": + main() diff --git a/src/chatbot/__init__.py b/src/chatbot/__init__.py index 4567c40..2e315ed 100644 --- a/src/chatbot/__init__.py +++ b/src/chatbot/__init__.py @@ -1,7 +1,25 @@ """Small LLM chatbot package.""" -from .config import ModelConfig -from .model import TransformerChatModel -from .tokenizer import SimpleTokenizer +from .config import ModelConfig, chatbot_10b_config, tiny_config +from .params import estimate_parameter_count +from .tokenizer import BPETokenizer, SimpleTokenizer -__all__ = ["ModelConfig", "SimpleTokenizer", "TransformerChatModel"] +__all__ = [ + "BPETokenizer", + "ModelConfig", + "SimpleTokenizer", + "TransformerChatModel", + "chatbot_10b_config", + "estimate_parameter_count", + "tiny_config", +] + + +def __getattr__(name): + """Load PyTorch model objects only when they are requested.""" + + if name == "TransformerChatModel": + from .model import TransformerChatModel + + return TransformerChatModel + raise AttributeError(name) diff --git a/src/chatbot/chat.py b/src/chatbot/chat.py index 9f0412f..2a640d4 100644 --- a/src/chatbot/chat.py +++ b/src/chatbot/chat.py @@ -9,7 +9,7 @@ from .config import ModelConfig from .model import TransformerChatModel -from .tokenizer import BOT_TOKEN, BOS_TOKEN, EOS_TOKEN, USER_TOKEN, SimpleTokenizer +from .tokenizer import BOT_TOKEN, BOS_TOKEN, EOS_TOKEN, USER_TOKEN, tokenizer_from_dict History = List[Tuple[str, str]] @@ -26,7 +26,9 @@ def load_chatbot(checkpoint_path: str, cpu: bool = False): 
device = torch.device("cuda" if torch.cuda.is_available() and not cpu else "cpu") checkpoint = torch.load(checkpoint_path, map_location=device) - tokenizer = SimpleTokenizer.from_dict(checkpoint["tokenizer"]) + # The checkpoint contains both the learned weights and the "recipe" needed + # to rebuild the model class before loading those weights. + tokenizer = tokenizer_from_dict(checkpoint["tokenizer"]) config = ModelConfig.from_dict(checkpoint["model_config"]) model = TransformerChatModel(config).to(device) model.load_state_dict(checkpoint["model_state"]) @@ -53,13 +55,17 @@ def build_prompt(message: str, history: History, max_history: int = 3) -> str: @torch.no_grad() def generate_reply( model: TransformerChatModel, - tokenizer: SimpleTokenizer, + tokenizer, message: str, device: torch.device, history: History | None = None, max_new_tokens: int = 48, temperature: float = 0.8, top_k: int | None = 50, + top_p: float | None = None, + do_sample: bool = True, + num_beams: int = 1, + repetition_penalty: float = 1.0, ) -> str: """Generate one response for a user message.""" @@ -69,6 +75,8 @@ def generate_reply( # If the prompt is longer than the model's context window, keep the most # recent tokens. Recent conversation is usually the most relevant context. prompt_ids = tokenizer.encode(prompt)[-model.config.block_size :] + + # PyTorch models expect batches, so wrap the one prompt in a batch of size 1. 
input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device) output_ids = model.generate( @@ -76,6 +84,10 @@ def generate_reply( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, + top_p=top_p, + do_sample=do_sample, + num_beams=num_beams, + repetition_penalty=repetition_penalty, )[0].tolist() new_ids = output_ids[len(prompt_ids) :] @@ -104,11 +116,15 @@ def generate_reply( def chat_loop( model: TransformerChatModel, - tokenizer: SimpleTokenizer, + tokenizer, device: torch.device, max_new_tokens: int, temperature: float, top_k: int | None, + top_p: float | None, + do_sample: bool, + num_beams: int, + repetition_penalty: float, ) -> None: """Run a terminal chat loop until the user quits.""" @@ -119,6 +135,8 @@ def chat_loop( if message.lower() in {"q", "quit", "exit"}: break + # Keep the conversation history in memory so each next reply can see a + # few recent turns. This is not long-term memory; it is prompt context. reply = generate_reply( model=model, tokenizer=tokenizer, @@ -128,6 +146,10 @@ def chat_loop( max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k, + top_p=top_p, + do_sample=do_sample, + num_beams=num_beams, + repetition_penalty=repetition_penalty, ) history.append((message, reply)) print(f"Bot: {reply}") @@ -142,6 +164,10 @@ def build_arg_parser() -> argparse.ArgumentParser: parser.add_argument("--max-new-tokens", type=int, default=48) parser.add_argument("--temperature", type=float, default=0.8) parser.add_argument("--top-k", type=int, default=50) + parser.add_argument("--top-p", type=float, default=None) + parser.add_argument("--num-beams", type=int, default=1) + parser.add_argument("--repetition-penalty", type=float, default=1.0) + parser.add_argument("--greedy", action="store_true", help="Disable sampling and always choose the highest-scoring token.") return parser @@ -157,4 +183,8 @@ def main() -> None: max_new_tokens=args.max_new_tokens, temperature=args.temperature, top_k=args.top_k, + 
top_p=args.top_p, + do_sample=not args.greedy, + num_beams=args.num_beams, + repetition_penalty=args.repetition_penalty, ) diff --git a/src/chatbot/config.py b/src/chatbot/config.py index c7dc282..8d6e1f9 100644 --- a/src/chatbot/config.py +++ b/src/chatbot/config.py @@ -1,24 +1,77 @@ """Configuration objects used by training and inference. -Dataclasses keep the settings in one readable place. They are also easy to -save into a checkpoint because they can be converted to plain dictionaries. +The project now has two model scales: + +* tiny configs that can run in tests or on a laptop, and +* the original untrained ChatBot-10B config used for real training plans. + +Keeping both in one dataclass makes the large model easy to describe without +ever allocating its weights during normal tests. """ +from __future__ import annotations + from dataclasses import asdict, dataclass +from pathlib import Path from typing import Any, Dict @dataclass class ModelConfig: - """Size settings for the small decoder-only Transformer.""" + """Settings for the original decoder-only ChatBot architecture. + + The names intentionally stay close to common LLM papers and configs: + ``n_embd`` is the hidden size, ``n_head`` is the number of query heads, and + ``n_kv_head`` is the smaller number of key/value heads used by grouped-query + attention. Beginners can read this file as the model's blueprint. 
+ """ vocab_size: int block_size: int = 96 n_embd: int = 128 n_head: int = 4 n_layer: int = 4 + n_kv_head: int | None = None + ffn_hidden_size: int | None = None dropout: float = 0.1 + attention_dropout: float = 0.1 + rope_theta: float = 10000.0 + norm_eps: float = 1e-5 + initializer_range: float = 0.02 pad_token_id: int = 0 + tie_embeddings: bool = True + use_bias: bool = False + model_name: str = "chatbot-small" + + def __post_init__(self) -> None: + """Fill derived defaults and validate shape choices.""" + + if self.n_kv_head is None: + # If no grouped-query setting is provided, fall back to classic + # attention where every query head has its own key/value head. + self.n_kv_head = self.n_head + if self.ffn_hidden_size is None: + # SwiGLU uses three projections, so configs may choose a narrower + # hidden size; when none is given, default to the classic 4x width. + self.ffn_hidden_size = self.n_embd * 4 + if self.n_embd % self.n_head != 0: + # Each attention head receives an equal slice of the hidden vector, + # so the hidden size must divide cleanly by the head count. + raise ValueError("n_embd must be divisible by n_head.") + if self.n_head % self.n_kv_head != 0: + # GQA repeats key/value heads across query heads, which only works + # when the repeat count is a whole number. + raise ValueError("n_head must be divisible by n_kv_head for GQA.") + if (self.n_embd // self.n_head) % 2 != 0: + # RoPE rotates hidden dimensions in pairs, so one head must contain + # an even number of dimensions.
+ raise ValueError("The attention head dimension must be even for RoPE.") + + @property + def head_dim(self) -> int: + """Number of hidden units handled by one attention head.""" + + return self.n_embd // self.n_head def to_dict(self) -> Dict[str, Any]: """Return a JSON-friendly copy of the configuration.""" @@ -29,4 +82,68 @@ def to_dict(self) -> Dict[str, Any]: def from_dict(cls, data: Dict[str, Any]) -> "ModelConfig": """Build a config from the dictionary stored in a checkpoint.""" + # Older checkpoints do not have the newer architecture fields. The + # dataclass defaults fill them in, so their saved configs still parse. return cls(**data) + + @classmethod + def from_yaml_file(cls, path: str | Path) -> "ModelConfig": + """Load a model config from a YAML file.""" + + try: + import yaml + except ImportError as exc: # pragma: no cover - exercised by users + raise ImportError("Reading YAML configs requires PyYAML.") from exc + + with open(path, "r", encoding="utf-8") as file: + data = yaml.safe_load(file) or {} + + # The YAML file maps directly onto ModelConfig field names, keeping the + # config readable for people who do not want to edit Python code. + return cls.from_dict(data) + + +def chatbot_10b_config() -> ModelConfig: + """Return the original untrained ChatBot-10B blueprint. + + This config is intentionally a dense model, not a mixture-of-experts (MoE), + so the 10B design stays original to this project. The parameter count is + approximately 9.999B when the input embedding and output head are tied.
+ """ + + return ModelConfig( + model_name="chatbot-10b", + vocab_size=128000, + block_size=4096, + n_embd=5120, + n_head=40, + n_kv_head=8, + n_layer=36, + ffn_hidden_size=12800, + dropout=0.0, + attention_dropout=0.0, + rope_theta=1000000.0, + norm_eps=1e-5, + initializer_range=0.02, + tie_embeddings=True, + use_bias=False, + ) + + +def tiny_config(vocab_size: int = 128) -> ModelConfig: + """Return a tiny config used by tests and CPU smoke runs.""" + + # This is intentionally much smaller than a useful LLM. Its job is to prove + # the code path works on a normal laptop or in CI. + return ModelConfig( + model_name="chatbot-tiny", + vocab_size=vocab_size, + block_size=32, + n_embd=64, + n_head=4, + n_kv_head=2, + n_layer=2, + ffn_hidden_size=128, + dropout=0.0, + attention_dropout=0.0, + ) diff --git a/src/chatbot/data.py b/src/chatbot/data.py index 908bee1..5aa134b 100644 --- a/src/chatbot/data.py +++ b/src/chatbot/data.py @@ -9,10 +9,12 @@ from __future__ import annotations import ast +import json import os import random from dataclasses import dataclass -from typing import List, Sequence, Tuple +from pathlib import Path +from typing import Dict, Iterable, List, Sequence, Tuple import torch from torch.utils.data import Dataset, random_split @@ -24,6 +26,34 @@ DialogPair = Tuple[str, str] +DATASET_RECIPES: Dict[str, Dict[str, str]] = { + "cornell": { + "type": "local", + "description": "Bundled Cornell Movie Dialogues adjacent turns.", + }, + "dailydialog": { + "type": "huggingface", + "dataset": "OpenRL/daily_dialog", + "description": "Short daily-life conversations.", + }, + "ultrachat": { + "type": "huggingface", + "dataset": "HuggingFaceH4/ultrachat_200k", + "description": "Instruction-style user/assistant chat data.", + }, + "oasst1": { + "type": "huggingface", + "dataset": "OpenAssistant/oasst1", + "description": "OpenAssistant conversation tree prompt/assistant pairs.", + }, + "dolly": { + "type": "huggingface", + "dataset": 
"databricks/databricks-dolly-15k", + "description": "Instruction, optional context, and response examples.", + }, +} + + @dataclass(frozen=True) class DatasetBundle: """Container for train and validation datasets.""" @@ -119,6 +149,111 @@ def load_dailydialog_pairs( return pairs +def _load_hf_dataset(dataset_name: str, split: str): + """Import and load a Hugging Face dataset only when it is requested.""" + + try: + from datasets import load_dataset + except ImportError as exc: + raise ImportError( + "Hugging Face dataset loading needs the optional 'datasets' package. " + "Install requirements.txt or use --dataset cornell." + ) from exc + + # Keeping this in a tiny helper means importing src.chatbot.data does not + # require downloading or even installing Hugging Face datasets. + return load_dataset(dataset_name, split=split) + + +def pairs_from_messages(messages: Sequence[dict]) -> List[DialogPair]: + """Convert chat message dictionaries into user/assistant pairs.""" + + pairs: List[DialogPair] = [] + for left, right in zip(messages, messages[1:]): + left_role = str(left.get("role", "")).lower() + right_role = str(right.get("role", "")).lower() + + # Most chat datasets store a list like: + # [{"role": "user", "content": "..."}, {"role": "assistant", ...}] + # We only keep pairs where the direction is user -> assistant. 
+ if left_role in {"user", "prompter"} and right_role in {"assistant", "bot"}: + prompt = str(left.get("content") or left.get("text") or "").strip() + response = str(right.get("content") or right.get("text") or "").strip() + if prompt and response: + pairs.append((prompt, response)) + return pairs + + +def load_ultrachat_pairs( + dataset_name: str = DATASET_RECIPES["ultrachat"]["dataset"], + split: str = "train_sft", + max_pairs: int | None = None, +) -> List[DialogPair]: + """Load UltraChat user/assistant pairs from its messages column.""" + + dataset = _load_hf_dataset(dataset_name, split=split) + pairs: List[DialogPair] = [] + for row in dataset: + for pair in pairs_from_messages(row.get("messages", [])): + pairs.append(pair) + if max_pairs is not None and len(pairs) >= max_pairs: + return pairs + return pairs + + +def load_oasst1_pairs( + dataset_name: str = DATASET_RECIPES["oasst1"]["dataset"], + split: str = "train", + max_pairs: int | None = None, +) -> List[DialogPair]: + """Load English OpenAssistant prompt/assistant pairs. + + OASST1 is a conversation tree. Each assistant row points at its parent + prompt, so we build a lookup and pair assistant messages with their parent. 
+ """ + + dataset = _load_hf_dataset(dataset_name, split=split) + rows = [row for row in dataset if row.get("lang") in {None, "en"}] + by_id = {row.get("message_id"): row for row in rows} + pairs: List[DialogPair] = [] + + for row in rows: + if row.get("role") != "assistant": + continue + parent = by_id.get(row.get("parent_id")) + if not parent or parent.get("role") not in {"prompter", "user"}: + continue + prompt = str(parent.get("text", "")).strip() + response = str(row.get("text", "")).strip() + if prompt and response: + pairs.append((prompt, response)) + if max_pairs is not None and len(pairs) >= max_pairs: + return pairs + return pairs + + +def load_dolly_pairs( + dataset_name: str = DATASET_RECIPES["dolly"]["dataset"], + split: str = "train", + max_pairs: int | None = None, +) -> List[DialogPair]: + """Load Databricks Dolly instruction/response rows as dialog pairs.""" + + dataset = _load_hf_dataset(dataset_name, split=split) + pairs: List[DialogPair] = [] + for row in dataset: + instruction = str(row.get("instruction", "")).strip() + context = str(row.get("context", "")).strip() + response = str(row.get("response", "")).strip() + if context: + instruction = f"{instruction}\n\nContext:\n{context}" + if instruction and response: + pairs.append((instruction, response)) + if max_pairs is not None and len(pairs) >= max_pairs: + return pairs + return pairs + + def pair_to_training_text(pair: DialogPair) -> str: """Format one dialog pair as a single autoregressive training sequence. 
def parse_dataset_names(dataset: str) -> List[str]:
    """Parse one dataset name, a comma-separated list, or ``mixed``.

    Every entry is stripped and lowercased before it is interpreted, so
    ``" Mixed "`` behaves like ``"mixed"``. The ``mixed`` alias may also appear
    inside a comma-separated list, where it expands in place to all five
    built-in datasets. Names are not de-duplicated: repeating a name repeats
    it in the result.

    Args:
        dataset: Raw value of the dataset option.

    Returns:
        Normalized dataset names, or ``["cornell"]`` when nothing usable was
        supplied.
    """

    mixed_recipe = ("cornell", "dailydialog", "ultrachat", "oasst1", "dolly")

    names: List[str] = []
    for raw_name in dataset.split(","):
        name = raw_name.strip().lower()
        if not name:
            # Tolerate stray commas and surrounding whitespace.
            continue
        if name == "mixed":
            names.extend(mixed_recipe)
        else:
            names.append(name)
    return names or ["cornell"]
def load_jsonl_pairs(path: str | Path) -> List[DialogPair]:
    """Load local JSONL fixtures with ``prompt`` and ``response`` fields.

    Blank lines (including a trailing empty line) are skipped. Rows whose
    prompt or response is missing, null, or whitespace-only are dropped.

    Args:
        path: Location of a UTF-8 encoded JSON Lines file.

    Returns:
        ``(prompt, response)`` tuples in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    pairs: List[DialogPair] = []
    with open(path, "r", encoding="utf-8") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # json.loads("") raises, and editors commonly leave an empty
                # last line in JSONL files.
                continue
            row = json.loads(line)

            # ``or ""`` keeps a JSON null from becoming the text "None".
            prompt = str(row.get("prompt") or "").strip()
            response = str(row.get("response") or "").strip()
            if prompt and response:
                pairs.append((prompt, response))
    return pairs
class RMSNorm(nn.Module):
    """Root-mean-square normalization used by many modern LLMs.

    LayerNorm recenters and rescales activations. RMSNorm only rescales them,
    which is simpler and common in large decoder-only models.

    The statistics are accumulated in at least float32. Squaring half-precision
    activations directly overflows for fairly small magnitudes, which would
    silently zero the output.
    """

    def __init__(self, hidden_size: int, eps: float):
        super().__init__()
        # One learned scale per hidden dimension, starting at identity.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Rescale every vector along the last dimension to unit RMS."""

        input_dtype = x.dtype

        # Upcast fp16/bf16 to float32 but leave float32/float64 untouched.
        compute_dtype = torch.promote_types(input_dtype, torch.float32)
        work = x.to(compute_dtype)

        # Compute the average squared size of each hidden vector. This is the
        # "RMS" part: root mean square.
        variance = work.pow(2).mean(dim=-1, keepdim=True)

        # rsqrt means "1 / sqrt". Multiplying by it rescales large activations
        # down and small activations up, which keeps training numerically calm.
        work = work * torch.rsqrt(variance + self.eps)

        # Cast back before applying the learned scale so the result dtype
        # follows the same promotion rules as before.
        return self.weight * work.to(input_dtype)
def repeat_kv(x: torch.Tensor, repeats: int) -> torch.Tensor:
    """Duplicate each key/value head ``repeats`` times along the head axis.

    ``x`` is unpacked as ``[batch, kv_heads, seq_len, head_dim]``. The result
    has ``kv_heads * repeats`` heads, with the copies of one source head
    placed next to each other. With ``repeats == 1`` the input tensor itself
    is returned.
    """

    if repeats == 1:
        return x

    n_batch, n_kv, n_tokens, width = x.shape

    # Insert a length-1 axis after the head axis and broadcast it. ``expand``
    # creates a view; the data is only materialized by the final reshape.
    widened = x.unsqueeze(2).expand(n_batch, n_kv, repeats, n_tokens, width)
    return widened.reshape(n_batch, n_kv * repeats, n_tokens, width)
+ q = self.q_proj(x).view(batch, seq_len, self.n_head, self.head_dim).transpose(1, 2) + k = self.k_proj(x).view(batch, seq_len, self.n_kv_head, self.head_dim).transpose(1, 2) + v = self.v_proj(x).view(batch, seq_len, self.n_kv_head, self.head_dim).transpose(1, 2) + + # RoPE is applied to q and k because attention scores come from q*k. + # Values carry content, so they are not position-rotated. + q, k = self.rope(q, k, position_ids) + + if past_key_value is not None: + past_k, past_v = past_key_value + + # During generation, append the new token's k/v to the cached past. + # This avoids recalculating old keys and values every step. + k = torch.cat([past_k, k], dim=2) + v = torch.cat([past_v, v], dim=2) + + present = (k, v) if use_cache else None - # A Transformer sees all tokens in a sequence at once, so we add a - # second learned vector that tells it where each token appears. - self.position_embedding = nn.Embedding(config.block_size, config.n_embd) + # After caching, repeat the smaller key/value head set so the tensor + # has one key/value head for every query head. + k = repeat_kv(k, self.kv_repeats) + v = repeat_kv(v, self.kv_repeats) + + # Dot-product attention: high score means this query token should pay + # more attention to that key token. Dividing by sqrt(head_dim) keeps + # scores from getting too large as the head gets wider. + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim) + + # Causal masking stops a token from looking at future tokens. That is + # what makes next-token prediction honest during training. + total_len = k.size(2) + key_positions = torch.arange(total_len, device=x.device) + mask = key_positions[None, :] > position_ids[:, None] + scores = scores.masked_fill(mask[None, None, :, :], torch.finfo(scores.dtype).min) + + # Softmax turns raw scores into probabilities that add up to 1. 
class SwiGLU(nn.Module):
    """Gated feed-forward network (SwiGLU).

    Two parallel projections widen the hidden state to ``ffn_hidden_size``.
    One of them passes through SiLU and acts as a gate on the other. A third
    projection maps the gated result back to ``n_embd``.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        model_width = config.n_embd
        inner_width = config.ffn_hidden_size
        with_bias = config.use_bias

        # Creation order is kept (gate, up, down) so seeded initialization
        # draws random numbers in the same sequence.
        self.gate_proj = nn.Linear(model_width, inner_width, bias=with_bias)
        self.up_proj = nn.Linear(model_width, inner_width, bias=with_bias)
        self.down_proj = nn.Linear(inner_width, model_width, bias=with_bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The SiLU-activated branch scales the linear branch elementwise.
        gate = F.silu(self.gate_proj(x))
        gated = gate * self.up_proj(x)

        # Compress back to the model width, then apply dropout.
        return self.dropout(self.down_proj(gated))
class DecoderBlock(nn.Module):
    """Pre-norm Transformer block: attention sub-layer, then feed-forward.

    Each sub-layer sees an RMS-normalized copy of the hidden state and its
    output is added back onto the un-normalized stream.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        width, eps = config.n_embd, config.norm_eps
        self.attn_norm = RMSNorm(width, eps)
        self.attn = CausalSelfAttention(config)
        self.mlp_norm = RMSNorm(width, eps)
        self.mlp = SwiGLU(config)

    def forward(
        self,
        x: torch.Tensor,
        position_ids: torch.Tensor,
        past_key_value: Optional[PastKeyValue] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[PastKeyValue]]:
        """Return the updated hidden state and this layer's cache entry.

        The second element is whatever the attention module returns as its
        present key/value pair.
        """

        normed_for_attention = self.attn_norm(x)
        attention_delta, present = self.attn(
            normed_for_attention,
            position_ids=position_ids,
            past_key_value=past_key_value,
            use_cache=use_cache,
        )

        # First residual: add what attention contributed.
        hidden = x + attention_delta

        # Second residual: add what the feed-forward network contributed.
        hidden = hidden + self.mlp(self.mlp_norm(hidden))
        return hidden, present
+ self.token_embedding = nn.Embedding(config.vocab_size, config.n_embd) + self.dropout = nn.Dropout(config.dropout) + + # Stacking many decoder blocks is what gives a Transformer depth. + self.blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config.n_layer)]) + self.final_norm = RMSNorm(config.n_embd, config.norm_eps) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + if config.tie_embeddings: + # Tying means the input embedding table is reused as the output + # classifier. This saves parameters and is common in LLMs. + self.lm_head.weight = self.token_embedding.weight self.apply(self._init_weights) def _init_weights(self, module: nn.Module) -> None: - """Initialize weights with stable defaults for small Transformers.""" - + # New weights start as small random numbers. Training gradually nudges + # them toward values that predict the next token well. if isinstance(module, nn.Linear): - nn.init.normal_(module.weight, mean=0.0, std=0.02) + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) if module.bias is not None: nn.init.zeros_(module.bias) elif isinstance(module, nn.Embedding): - nn.init.normal_(module.weight, mean=0.0, std=0.02) - - def forward(self, idx: torch.Tensor, targets: torch.Tensor | None = None): - """Run the model and optionally compute next-token loss. + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) - idx has shape [batch, time]. Each row is one training example, and each - column is one token position in that example. - """ + def forward( + self, + idx: torch.Tensor, + targets: torch.Tensor | None = None, + past_key_values: Optional[List[PastKeyValue]] = None, + use_cache: bool = False, + ): + """Run the model and optionally compute next-token loss.""" _, seq_len = idx.shape - if seq_len > self.config.block_size: + past_len = 0 if past_key_values is None else past_key_values[0][0].size(2) + + # block_size is the context window. 
The model cannot attend to more + # tokens than this without changing its configuration. + if past_len + seq_len > self.config.block_size: raise ValueError( - f"Sequence length {seq_len} is larger than block_size " + f"Sequence length {past_len + seq_len} is larger than block_size " f"{self.config.block_size}." ) - # Create position ids [0, 1, 2, ...] so every token gets both a token - # meaning and a location meaning. - positions = torch.arange(seq_len, device=idx.device).unsqueeze(0) - token_embeddings = self.token_embedding(idx) - position_embeddings = self.position_embedding(positions) - x = self.dropout(token_embeddings + position_embeddings) + # Position ids tell RoPE where each token lives in the sequence. + position_ids = torch.arange(past_len, past_len + seq_len, device=idx.device) + x = self.dropout(self.token_embedding(idx)) + next_cache: List[PastKeyValue] = [] + + for layer_index, block in enumerate(self.blocks): + layer_past = None if past_key_values is None else past_key_values[layer_index] + x, present = block( + x, + position_ids=position_ids, + past_key_value=layer_past, + use_cache=use_cache, + ) + if use_cache and present is not None: + next_cache.append(present) - causal_mask = self.causal_mask[:seq_len, :seq_len] - x = self.blocks(x, mask=causal_mask) x = self.final_norm(x) + + # logits are raw scores for every vocabulary token at every position. + # The highest logit is the model's strongest next-token guess. logits = self.lm_head(x) loss = None if targets is not None: - # Cross entropy compares the model's vocabulary scores with the - # correct next token. Padding is ignored because it is only filler, - # not real conversation text. + # Cross-entropy compares the logits with the correct next tokens. + # Padding tokens are ignored so short examples do not affect loss. 
loss = F.cross_entropy( logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=self.config.pad_token_id, ) + + if use_cache: + return logits, loss, next_cache return logits, loss @torch.no_grad() @@ -123,39 +333,116 @@ def generate( max_new_tokens: int, temperature: float = 0.8, top_k: int | None = 50, + top_p: float | None = None, + do_sample: bool = True, + num_beams: int = 1, + repetition_penalty: float = 1.0, ) -> torch.Tensor: - """Generate new token ids after the prompt tokens in idx. + """Generate new token ids after the prompt tokens in ``idx``.""" - Generation is a loop: predict one next token, append it to the prompt, - then use the longer prompt to predict the following token. - """ + if num_beams > 1: + return self._beam_search(idx, max_new_tokens=max_new_tokens, num_beams=num_beams) for _ in range(max_new_tokens): - # Keep only the latest block_size tokens so the prompt fits inside - # the context window the model was trained for. + # Keep only the latest context-window tokens if the conversation is + # longer than block_size. idx_cond = idx[:, -self.config.block_size :] logits, _ = self(idx_cond) - # We only need the final position because that is the model's - # prediction for the next token after the whole prompt. + # We only sample from the final position because it predicts the + # next token after the entire prompt. logits = logits[:, -1, :] + logits = apply_repetition_penalty(logits, idx, repetition_penalty) + next_id = sample_next_token( + logits, + temperature=temperature, + top_k=top_k, + top_p=top_p, + do_sample=do_sample, + ) + idx = torch.cat((idx, next_id), dim=1) + return idx - if temperature <= 0: - # Greedy decoding: always pick the highest-scoring token. - next_id = torch.argmax(logits, dim=-1, keepdim=True) - else: - # Temperature controls randomness. Lower values are safer and - # more repetitive; higher values are more varied but less stable. 
def apply_repetition_penalty(
    logits: torch.Tensor,
    previous_tokens: torch.Tensor,
    repetition_penalty: float,
) -> torch.Tensor:
    """Lower the score of tokens that already appeared in ``previous_tokens``.

    The penalty is sign-aware: a positive logit is divided by the penalty and
    a negative logit is multiplied by it. Dividing a negative logit would move
    it toward zero and make the repeated token more likely.

    Args:
        logits: Next-token scores, indexed as ``[batch, vocab]``.
        previous_tokens: Token ids already present, indexed as
            ``[batch, time]``. Row ``i`` penalizes row ``i`` of ``logits``.
        repetition_penalty: Values ``<= 1.0`` disable the penalty.

    Returns:
        ``logits`` itself when the penalty is disabled, otherwise a new tensor
        with the penalized scores. The input is never modified in place.
    """

    if repetition_penalty <= 1.0:
        return logits

    # gather/scatter need int64 indices.
    token_ids = previous_tokens.long()

    # Pull out the current score of every previously seen token. Duplicate
    # ids just read, and later write, the same value more than once.
    seen_scores = logits.gather(1, token_ids)
    penalized = torch.where(
        seen_scores < 0,
        seen_scores * repetition_penalty,
        seen_scores / repetition_penalty,
    )

    # Out-of-place scatter returns a fresh tensor with only the seen token
    # columns replaced.
    return logits.scatter(1, token_ids, penalized)
@dataclass(frozen=True)
class ParameterReport:
    """Human-readable parameter count summary."""

    total: int
    embeddings: int
    layers: int
    final_norm: int
    lm_head: int


def estimate_parameter_count(config: ModelConfig) -> ParameterReport:
    """Estimate parameter count without importing PyTorch.

    Counts the token embedding, every decoder layer (attention projections,
    SwiGLU matrices, two RMSNorm scales), the final norm, and the output head
    when it is not tied to the embedding. Bias vectors of the attention and
    MLP projections are included when ``config.use_bias`` is true.

    Args:
        config: Object exposing ``vocab_size``, ``n_embd``, ``n_head``,
            ``n_kv_head``, ``head_dim``, ``ffn_hidden_size``, ``n_layer`` and
            ``tie_embeddings``. ``use_bias`` is optional and treated as false
            when absent.

    Returns:
        A ``ParameterReport`` with the total and its components.
    """

    head_dim = config.head_dim
    q_dim = config.n_head * head_dim
    kv_dim = config.n_kv_head * head_dim

    # The embedding table has one vector per vocabulary token.
    embeddings = config.vocab_size * config.n_embd

    # Attention projection matrices convert hidden states into q, k, v, and
    # then mix the attended heads back through o_proj.
    q_proj = config.n_embd * q_dim
    k_proj = config.n_embd * kv_dim
    v_proj = config.n_embd * kv_dim
    o_proj = config.n_embd * config.n_embd
    attention = q_proj + k_proj + v_proj + o_proj

    # SwiGLU has three large matrices: gate, up, and down. RMSNorm adds one
    # learned scale vector before attention and one before the MLP.
    mlp = 3 * config.n_embd * config.ffn_hidden_size
    norms = 2 * config.n_embd

    # Each projection carries one bias element per output feature when biases
    # are enabled. The LM head is not included here; it is counted separately.
    biases = 0
    if getattr(config, "use_bias", False):
        attention_bias = q_dim + 2 * kv_dim + config.n_embd
        mlp_bias = 2 * config.ffn_hidden_size + config.n_embd
        biases = attention_bias + mlp_bias

    # Every decoder block repeats the same attention, MLP, and norm structure.
    layers = config.n_layer * (attention + mlp + norms + biases)
    final_norm = config.n_embd

    # If embeddings are tied, the output head reuses token_embedding.weight and
    # does not add another vocab_size * hidden_size parameters.
    lm_head = 0 if config.tie_embeddings else config.vocab_size * config.n_embd

    return ParameterReport(
        total=embeddings + layers + final_norm + lm_head,
        embeddings=embeddings,
        layers=layers,
        final_norm=final_norm,
        lm_head=lm_head,
    )
class BPETokenizer:
    """Byte-pair encoding tokenizer backed by Hugging Face ``tokenizers``.

    BPE learns common chunks of text instead of whole words. That matters for a
    large model because rare words, names, and typos can be represented as
    smaller pieces instead of becoming one useless unknown token
    (``UNK_TOKEN``).
    """

    kind = "bpe"

    def __init__(self, tokenizer):
        # ``tokenizer`` is a ``tokenizers.Tokenizer`` instance.
        self.tokenizer = tokenizer
        self._refresh_special_ids()

    def _refresh_special_ids(self) -> None:
        # Store special token ids as attributes so the chat code can stop on
        # EOS_TOKEN, ignore PAD_TOKEN, and recognize speaker markers quickly.
        # ``token_to_id`` returns None for a token missing from the vocabulary.
        self.pad_id = self.tokenizer.token_to_id(PAD_TOKEN)
        self.unk_id = self.tokenizer.token_to_id(UNK_TOKEN)
        self.bos_id = self.tokenizer.token_to_id(BOS_TOKEN)
        self.eos_id = self.tokenizer.token_to_id(EOS_TOKEN)
        self.user_id = self.tokenizer.token_to_id(USER_TOKEN)
        self.bot_id = self.tokenizer.token_to_id(BOT_TOKEN)

    @property
    def token_to_id(self) -> Dict[str, int]:
        """Expose a simple mapping for code shared with ``SimpleTokenizer``."""

        return self.tokenizer.get_vocab()

    @property
    def vocab_size(self) -> int:
        """Number of BPE tokens available to the model."""

        return self.tokenizer.get_vocab_size()

    @classmethod
    def train(
        cls,
        texts: Iterable[str],
        vocab_size: int = 32000,
        min_frequency: int = 2,
    ) -> "BPETokenizer":
        """Train a BPE tokenizer from an iterator of text strings.

        Args:
            texts: Training strings.
            vocab_size: Target vocabulary size, special tokens included.
            min_frequency: Minimum pair frequency for a merge to be learned.

        Raises:
            ImportError: If the ``tokenizers`` package is not installed.
        """

        try:
            from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers
        except ImportError as exc:  # pragma: no cover - exercised by users
            raise ImportError("BPE tokenization requires the 'tokenizers' package.") from exc

        # The BPE model starts with bytes and repeatedly learns useful merges
        # like "ing" or common word pieces from the training corpus.
        tokenizer = Tokenizer(models.BPE(unk_token=UNK_TOKEN))

        # ByteLevel maps every input byte to a printable stand-in character
        # before BPE runs.
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()

        # The trainer decides vocabulary size and guarantees our special tokens
        # are present before normal learned tokens. Seeding the vocabulary with
        # the full byte-level alphabet is what makes any text representable.
        # Without it, a byte never seen during training has no token at all.
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=SPECIAL_TOKENS,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        tokenizer.train_from_iterator(texts, trainer=trainer)
        return cls(tokenizer)

    def encode(self, text: str) -> List[int]:
        """Turn text into BPE token ids."""

        return self.tokenizer.encode(text).ids

    def decode(self, ids: Sequence[int], skip_special: bool = True) -> str:
        """Turn BPE token ids back into text, stripped of outer whitespace."""

        # Coerce to plain ints so tensor or numpy scalars are accepted.
        plain_ids = [int(token_id) for token_id in ids]
        return self.tokenizer.decode(plain_ids, skip_special_tokens=skip_special).strip()

    def save(self, path: str | Path) -> None:
        """Save the tokenizer JSON that future model runs must reuse."""

        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        self.tokenizer.save(str(path))

    @classmethod
    def load(cls, path: str | Path) -> "BPETokenizer":
        """Load a tokenizer previously written by ``save``."""

        try:
            from tokenizers import Tokenizer
        except ImportError as exc:  # pragma: no cover - exercised by users
            raise ImportError("Loading a BPE tokenizer requires the 'tokenizers' package.") from exc

        return cls(Tokenizer.from_file(str(path)))

    def to_dict(self) -> Dict[str, str]:
        """Store the tokenizer JSON inside a checkpoint if needed."""

        return {"kind": "bpe", "tokenizer_json": self.tokenizer.to_str()}

    @classmethod
    def from_dict(cls, data: Dict[str, str]) -> "BPETokenizer":
        """Rebuild a tokenizer from the payload produced by ``to_dict``."""

        try:
            from tokenizers import Tokenizer
        except ImportError as exc:  # pragma: no cover - exercised by users
            raise ImportError("Loading a BPE tokenizer requires the 'tokenizers' package.") from exc

        return cls(Tokenizer.from_str(data["tokenizer_json"]))
+ if kind == "bpe": + return BPETokenizer.from_dict(data) + if kind == "simple": + return SimpleTokenizer.from_dict(data) + raise ValueError(f"Unknown tokenizer kind: {kind}") + + +def tokenizer_metadata(tokenizer, tokenizer_path: str | None = None) -> Dict[str, str]: + """Return checkpoint metadata for a tokenizer. + + When a tokenizer file path is supplied, we store the path for clarity and + also embed the tokenizer payload so the checkpoint remains self-contained. + """ + + data = tokenizer.to_dict() + if tokenizer_path: + data["path"] = tokenizer_path + return data + + +def write_tokenizer_manifest(path: str | Path, tokenizer_path: str, vocab_size: int) -> None: + """Write a tiny JSON manifest describing a trained tokenizer.""" + + manifest = { + "kind": "bpe", + "tokenizer_path": tokenizer_path, + "vocab_size": vocab_size, + "special_tokens": SPECIAL_TOKENS, + } + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(manifest, indent=2), encoding="utf-8") diff --git a/src/chatbot/tokenizer_train.py b/src/chatbot/tokenizer_train.py new file mode 100644 index 0000000..4f9b8c9 --- /dev/null +++ b/src/chatbot/tokenizer_train.py @@ -0,0 +1,56 @@ +"""Command-line helpers for training the BPE tokenizer.""" + +from __future__ import annotations + +import argparse +import os + +from .data import load_training_texts +from .tokenizer import BPETokenizer, write_tokenizer_manifest + + +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Train a BPE tokenizer for ChatBot.") + + # Dataset flags match train.py so the tokenizer can be trained on the same + # text distribution the model will later learn from. 
+ parser.add_argument("--dataset", default="mixed") + parser.add_argument("--corpus-dir", default=os.path.join("data", "cornell movie-dialogs corpus")) + parser.add_argument("--hf-dataset", default="OpenRL/daily_dialog") + parser.add_argument("--hf-split", default="train") + parser.add_argument("--max-pairs", type=int, default=50000) + parser.add_argument("--vocab-size", type=int, default=128000) + parser.add_argument("--min-frequency", type=int, default=2) + parser.add_argument("--output", default=os.path.join("tokenizers", "chatbot-bpe.json")) + parser.add_argument("--manifest", default=os.path.join("tokenizers", "chatbot-bpe-manifest.json")) + return parser + + +def main() -> None: + args = build_arg_parser().parse_args() + + # First collect formatted conversation strings such as: + # hello hi + texts = load_training_texts( + dataset=args.dataset, + corpus_dir=args.corpus_dir, + max_pairs=args.max_pairs, + hf_dataset_name=args.hf_dataset, + hf_split=args.hf_split, + ) + + # Then learn subword pieces from those strings and save the tokenizer for + # training/inference. The model and tokenizer must stay paired. 
+ tokenizer = BPETokenizer.train( + texts, + vocab_size=args.vocab_size, + min_frequency=args.min_frequency, + ) + tokenizer.save(args.output) + write_tokenizer_manifest(args.manifest, tokenizer_path=args.output, vocab_size=tokenizer.vocab_size) + print(f"Saved BPE tokenizer to {args.output}") + print(f"Vocabulary size: {tokenizer.vocab_size:,}") + + +if __name__ == "__main__": + main() diff --git a/src/chatbot/train.py b/src/chatbot/train.py index 31ed087..d10d88a 100644 --- a/src/chatbot/train.py +++ b/src/chatbot/train.py @@ -1,10 +1,12 @@ -"""Training loop for the small Transformer chatbot.""" +"""Training loop for the original Transformer chatbot family.""" from __future__ import annotations import argparse +import math import os import random +from pathlib import Path from typing import Dict import torch @@ -14,7 +16,7 @@ from .config import ModelConfig from .data import build_datasets, load_training_texts from .model import TransformerChatModel -from .tokenizer import SimpleTokenizer +from .tokenizer import BPETokenizer, SimpleTokenizer, tokenizer_metadata def set_seed(seed: int) -> None: @@ -60,10 +62,20 @@ def estimate_loss( return sum(losses) / max(len(losses), 1) +def loss_to_perplexity(loss: float) -> float: + """Convert cross-entropy loss into perplexity. + + Perplexity is roughly "how many choices the model feels confused between." + Lower is better. We cap the exponent to avoid overflow on broken runs. 
+ """ + + return math.exp(min(loss, 20.0)) + + def save_checkpoint( checkpoint_path: str, model: TransformerChatModel, - tokenizer: SimpleTokenizer, + tokenizer, args: argparse.Namespace, metrics: Dict[str, float], ) -> None: @@ -79,7 +91,7 @@ def save_checkpoint( { "model_config": model.config.to_dict(), "model_state": model.state_dict(), - "tokenizer": tokenizer.to_dict(), + "tokenizer": tokenizer_metadata(tokenizer, getattr(args, "tokenizer_path", None)), "train_args": vars(args), "metrics": metrics, }, @@ -87,6 +99,61 @@ def save_checkpoint( ) +def build_tokenizer(args: argparse.Namespace, texts): + """Create or load the tokenizer selected by CLI args.""" + + if args.tokenizer == "bpe": + if args.tokenizer_path and os.path.exists(args.tokenizer_path): + # Reusing an existing tokenizer keeps token ids stable across runs. + # Stable ids matter because saved model weights line up with ids. + return BPETokenizer.load(args.tokenizer_path) + + # If no tokenizer file exists yet, learn one from the training text. + tokenizer = BPETokenizer.train( + texts, + vocab_size=args.max_vocab_size, + min_frequency=args.min_freq, + ) + if args.tokenizer_path: + # Saving lets a later 10B training run use the exact same vocabulary. + tokenizer.save(args.tokenizer_path) + return tokenizer + + # The simple tokenizer is intentionally easy to inspect, so it remains the + # default for beginner-friendly CPU experiments. + return SimpleTokenizer.build( + texts, + max_vocab_size=args.max_vocab_size, + min_freq=args.min_freq, + ) + + +def build_model_config(args: argparse.Namespace, tokenizer) -> ModelConfig: + """Build the model config from a YAML file or classic CLI knobs.""" + + if args.config: + config = ModelConfig.from_yaml_file(args.config) + + # The tokenizer may produce a smaller vocabulary than the YAML target + # during tests, so the model must match the tokenizer actually in use. 
+ config.vocab_size = tokenizer.vocab_size + config.pad_token_id = tokenizer.pad_id + return config + + return ModelConfig( + vocab_size=tokenizer.vocab_size, + block_size=args.block_size, + n_embd=args.n_embd, + n_head=args.n_head, + n_kv_head=args.n_kv_head, + n_layer=args.n_layer, + ffn_hidden_size=args.ffn_hidden_size, + dropout=args.dropout, + attention_dropout=args.attention_dropout, + pad_token_id=tokenizer.pad_id, + ) + + def train(args: argparse.Namespace) -> str: """Train the model and return the checkpoint path. @@ -99,6 +166,9 @@ def train(args: argparse.Namespace) -> str: """ set_seed(args.seed) + + # CUDA means an NVIDIA GPU is available. CPU mode is useful for tiny smoke + # tests, but the 10B config requires serious multi-GPU hardware. device = torch.device("cuda" if torch.cuda.is_available() and not args.cpu else "cpu") print(f"Using device: {device}") @@ -115,33 +185,28 @@ def train(args: argparse.Namespace) -> str: raise ValueError("No training texts were loaded.") print(f"Loaded {len(texts):,} conversation pairs from {args.dataset}.") - # The tokenizer decides which words/punctuation become known vocabulary. - # Tokens outside the vocabulary become , which keeps the model small. - tokenizer = SimpleTokenizer.build( - texts, - max_vocab_size=args.max_vocab_size, - min_freq=args.min_freq, - ) + if args.config and args.tokenizer == "bpe" and not args.tokenizer_path: + config_vocab_size = ModelConfig.from_yaml_file(args.config).vocab_size + + # For a large model config, train the tokenizer to the same vocabulary + # size unless the user chose an explicit tokenizer file. + args.max_vocab_size = config_vocab_size + + # The tokenizer decides which text chunks become known vocabulary. The + # simple tokenizer is readable; BPE is the serious path for larger models. + tokenizer = build_tokenizer(args, texts) print(f"Vocabulary size: {tokenizer.vocab_size:,}") - # These settings define the model size. 
Increasing them usually improves - # capacity but also makes training slower and more memory-heavy. - model_config = ModelConfig( - vocab_size=tokenizer.vocab_size, - block_size=args.block_size, - n_embd=args.n_embd, - n_head=args.n_head, - n_layer=args.n_layer, - dropout=args.dropout, - pad_token_id=tokenizer.pad_id, - ) + # These settings define the model size. For 10B training they usually come + # from configs/chatbot-10b.yaml instead of individual CLI flags. + model_config = build_model_config(args, tokenizer) # build_datasets creates x/y token pairs. x is the prompt tokens and y is # the same sequence shifted left, so y contains the next-token answers. datasets = build_datasets( texts, tokenizer, - block_size=args.block_size, + block_size=model_config.block_size, valid_fraction=args.valid_fraction, seed=args.seed, ) @@ -158,6 +223,8 @@ def train(args: argparse.Namespace) -> str: num_workers=0, ) + # At this point the model has random weights. It only becomes useful after + # the training loop below updates those weights many times. model = TransformerChatModel(model_config).to(device) # AdamW is a common optimizer for Transformers. It updates model weights @@ -168,6 +235,8 @@ def train(args: argparse.Namespace) -> str: step = 0 last_train_loss = 0.0 while step < args.steps: + # One epoch means one full pass over train_loader. This loop keeps + # cycling through epochs until the requested number of steps is reached. for x, y in train_loader: step += 1 x = x.to(device) @@ -189,19 +258,35 @@ def train(args: argparse.Namespace) -> str: print(f"step {step:>5}/{args.steps} | train loss {last_train_loss:.4f}") if step % args.eval_every == 0 or step == args.steps: + # Validation is slower than logging training loss, so it runs + # less often. It is the better signal for model quality. 
valid_loss = estimate_loss(model, valid_loader, device) - print(f"step {step:>5}/{args.steps} | valid loss {valid_loss:.4f}") + valid_ppl = loss_to_perplexity(valid_loss) + print( + f"step {step:>5}/{args.steps} | valid loss {valid_loss:.4f} " + f"| valid ppl {valid_ppl:.2f}" + ) if step >= args.steps: break checkpoint_path = os.path.join(args.output_dir, args.checkpoint_name) + final_valid_loss = estimate_loss(model, valid_loader, device) + + # The checkpoint is deliberately a single portable file for small runs. + # For real 10B training, large external checkpoint formats should be used + # outside git instead of committing weights. save_checkpoint( checkpoint_path, model, tokenizer, args, - metrics={"train_loss": last_train_loss, "valid_loss": estimate_loss(model, valid_loader, device)}, + metrics={ + "train_loss": last_train_loss, + "train_perplexity": loss_to_perplexity(last_train_loss), + "valid_loss": final_valid_loss, + "valid_perplexity": loss_to_perplexity(final_valid_loss), + }, ) print(f"Saved checkpoint to {checkpoint_path}") return checkpoint_path @@ -210,23 +295,33 @@ def train(args: argparse.Namespace) -> str: def build_arg_parser() -> argparse.ArgumentParser: """Create the training CLI.""" - parser = argparse.ArgumentParser(description="Train a small Transformer chatbot.") - parser.add_argument("--dataset", choices=["cornell", "dailydialog"], default="cornell") + parser = argparse.ArgumentParser(description="Train the original ChatBot decoder model.") + parser.add_argument( + "--dataset", + default="cornell", + help="Dataset name, comma-separated names, or 'mixed'. 
Options include cornell, dailydialog, ultrachat, oasst1, dolly.", + ) parser.add_argument("--corpus-dir", default=os.path.join("data", "cornell movie-dialogs corpus")) parser.add_argument("--hf-dataset", default="OpenRL/daily_dialog") parser.add_argument("--hf-split", default="train") parser.add_argument("--max-pairs", type=int, default=None) + parser.add_argument("--config", default=None, help="Optional YAML model config.") parser.add_argument("--output-dir", default="checkpoints") parser.add_argument("--checkpoint-name", default="chatbot-small-llm.pt") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--cpu", action="store_true", help="Force CPU training even if CUDA is available.") + parser.add_argument("--tokenizer", choices=["simple", "bpe"], default="simple") + parser.add_argument("--tokenizer-path", default=None) parser.add_argument("--block-size", type=int, default=96) parser.add_argument("--n-embd", type=int, default=128) parser.add_argument("--n-head", type=int, default=4) + parser.add_argument("--n-kv-head", type=int, default=None) parser.add_argument("--n-layer", type=int, default=4) + parser.add_argument("--ffn-hidden-size", type=int, default=None) parser.add_argument("--dropout", type=float, default=0.1) + parser.add_argument("--attention-dropout", type=float, default=0.1) parser.add_argument("--max-vocab-size", type=int, default=12000) parser.add_argument("--min-freq", type=int, default=2) diff --git a/src/chatbot/train_10b.py b/src/chatbot/train_10b.py new file mode 100644 index 0000000..35c4489 --- /dev/null +++ b/src/chatbot/train_10b.py @@ -0,0 +1,32 @@ +"""ChatBot-10B training wrapper.""" + +from __future__ import annotations + +import argparse + +from .train import build_arg_parser, train + + +def main() -> None: + parser = build_arg_parser() + parser.description = "Train the original untrained ChatBot-10B model." + + # This wrapper is just the normal trainer with safer 10B defaults. 
Keeping + # it thin means improvements to train.py automatically apply here too. + parser.set_defaults( + config="configs/chatbot-10b.yaml", + tokenizer="bpe", + tokenizer_path="tokenizers/chatbot-bpe.json", + dataset="mixed", + checkpoint_name="chatbot-10b-untrained-init.pt", + ) + args = parser.parse_args() + if args.cpu: + # A CPU run is useful for checking argument parsing, but not for actual + # 10B training. The README explains multi-GPU/cloud expectations. + print("Warning: ChatBot-10B is not practical on CPU; use configs/chatbot-tiny.yaml for smoke tests.") + train(args) + + +if __name__ == "__main__": + main() diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..17cdbe1 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,6 @@ +from pathlib import Path +import sys + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) diff --git a/tests/fixtures/sample_pairs.jsonl b/tests/fixtures/sample_pairs.jsonl new file mode 100644 index 0000000..2491b05 --- /dev/null +++ b/tests/fixtures/sample_pairs.jsonl @@ -0,0 +1,3 @@ +{"prompt": "Hello there!", "response": "Hi, nice to meet you."} +{"prompt": "What can you do?", "response": "I can learn conversation patterns from training data."} +{"prompt": "Tell me a tiny fact.", "response": "A tokenizer turns text into model-friendly numbers."} diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..3240299 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,53 @@ +import subprocess +import sys + +import pytest + +pytest.importorskip("torch") + + +def run_help(script): + result = subprocess.run( + [sys.executable, script, "--help"], + check=True, + capture_output=True, + text=True, + ) + assert "usage:" in result.stdout.lower() + + +def test_cli_help_entrypoints(): + for script in ["train_llm.py", "train_tokenizer.py", "train_10b.py", "chatbot.py"]: + run_help(script) + + +def test_tiny_cpu_smoke_train(tmp_path): + 
result = subprocess.run( + [ + sys.executable, + "train_llm.py", + "--dataset", + "cornell", + "--max-pairs", + "8", + "--steps", + "1", + "--batch-size", + "2", + "--config", + "configs/chatbot-tiny.yaml", + "--max-vocab-size", + "128", + "--cpu", + "--output-dir", + str(tmp_path), + "--checkpoint-name", + "tiny.pt", + ], + check=True, + capture_output=True, + text=True, + ) + + assert "valid ppl" in result.stdout + assert (tmp_path / "tiny.pt").exists() diff --git a/tests/test_config_and_params.py b/tests/test_config_and_params.py new file mode 100644 index 0000000..27725b5 --- /dev/null +++ b/tests/test_config_and_params.py @@ -0,0 +1,20 @@ +from src.chatbot.config import ModelConfig, chatbot_10b_config, tiny_config +from src.chatbot.params import estimate_parameter_count + + +def test_chatbot_10b_config_is_about_ten_billion_parameters(): + config = chatbot_10b_config() + report = estimate_parameter_count(config) + + assert config.model_name == "chatbot-10b" + assert config.n_layer == 36 + assert config.n_embd == 5120 + assert config.n_head == 40 + assert config.n_kv_head == 8 + assert 9.99e9 <= report.total <= 10.01e9 + + +def test_yaml_config_round_trip_for_tiny_config(): + config = ModelConfig.from_yaml_file("configs/chatbot-tiny.yaml") + + assert config.to_dict() == tiny_config(vocab_size=128).to_dict() diff --git a/tests/test_model_generation.py b/tests/test_model_generation.py new file mode 100644 index 0000000..cdf8da9 --- /dev/null +++ b/tests/test_model_generation.py @@ -0,0 +1,44 @@ +import pytest + +torch = pytest.importorskip("torch") + +from src.chatbot.config import tiny_config +from src.chatbot.model import TransformerChatModel + + +def test_tiny_model_forward_returns_logits_and_loss(): + config = tiny_config(vocab_size=32) + model = TransformerChatModel(config) + x = torch.randint(0, config.vocab_size, (2, 8)) + y = torch.randint(0, config.vocab_size, (2, 8)) + + logits, loss = model(x, y) + + assert logits.shape == (2, 8, config.vocab_size) + 
assert loss is not None + assert torch.isfinite(loss) + + +def test_kv_cache_path_returns_one_cache_per_layer(): + config = tiny_config(vocab_size=32) + model = TransformerChatModel(config) + x = torch.randint(0, config.vocab_size, (1, 4)) + + logits, _, cache = model(x, use_cache=True) + + assert logits.shape == (1, 4, config.vocab_size) + assert len(cache) == config.n_layer + assert cache[0][0].shape[2] == 4 + + +def test_generation_supports_top_p_and_beam_search(): + torch.manual_seed(0) + config = tiny_config(vocab_size=32) + model = TransformerChatModel(config) + prompt = torch.tensor([[1, 2, 3]], dtype=torch.long) + + sampled = model.generate(prompt, max_new_tokens=2, top_k=8, top_p=0.9, temperature=0.8) + beams = model.generate(prompt, max_new_tokens=2, num_beams=2, do_sample=False) + + assert sampled.shape == (1, 5) + assert beams.shape == (1, 5) diff --git a/tests/test_tokenizer_and_data.py b/tests/test_tokenizer_and_data.py new file mode 100644 index 0000000..960f2a4 --- /dev/null +++ b/tests/test_tokenizer_and_data.py @@ -0,0 +1,55 @@ +from pathlib import Path + +import pytest + +pytest.importorskip("tokenizers") + +from src.chatbot.data import ( + load_jsonl_pairs, + pair_to_training_text, + pairs_from_messages, + parse_dataset_names, +) +from src.chatbot.tokenizer import BOT_TOKEN, BPETokenizer, EOS_TOKEN, USER_TOKEN + + +def test_bpe_tokenizer_round_trips_save_and_load(tmp_path): + texts = [ + " hello there hi there ", + " tokenizer test tokenizers split text ", + ] + tokenizer = BPETokenizer.train(texts, vocab_size=64, min_frequency=1) + path = tmp_path / "chatbot-bpe.json" + tokenizer.save(path) + + loaded = BPETokenizer.load(path) + ids = loaded.encode("hello tokenizer") + + assert ids + assert loaded.decode(ids) + assert loaded.vocab_size == tokenizer.vocab_size + + +def test_dataset_helpers_format_common_chat_text(): + pairs = load_jsonl_pairs(Path("tests/fixtures/sample_pairs.jsonl")) + text = pair_to_training_text(pairs[0]) + + assert 
len(pairs) == 3 + assert USER_TOKEN in text + assert BOT_TOKEN in text + assert EOS_TOKEN in text + + +def test_message_rows_convert_to_user_assistant_pairs(): + messages = [ + {"role": "system", "content": "be helpful"}, + {"role": "user", "content": "Hi"}, + {"role": "assistant", "content": "Hello"}, + ] + + assert pairs_from_messages(messages) == [("Hi", "Hello")] + + +def test_dataset_name_parser_supports_mixed_and_lists(): + assert parse_dataset_names("cornell,dolly") == ["cornell", "dolly"] + assert "ultrachat" in parse_dataset_names("mixed") diff --git a/train_10b.py b/train_10b.py new file mode 100644 index 0000000..f1921cd --- /dev/null +++ b/train_10b.py @@ -0,0 +1,12 @@ +"""Training entrypoint for the original untrained ChatBot-10B config. + +This script intentionally does not ship weights. It wires the normal training +loop to ``configs/chatbot-10b.yaml`` so a user with suitable multi-GPU hardware +can initialize and train the model outside GitHub. +""" + +from src.chatbot.train_10b import main + + +if __name__ == "__main__": + main() diff --git a/train_tokenizer.py b/train_tokenizer.py new file mode 100644 index 0000000..906d9e2 --- /dev/null +++ b/train_tokenizer.py @@ -0,0 +1,7 @@ +"""Train a BPE tokenizer for the original ChatBot model.""" + +from src.chatbot.tokenizer_train import main + + +if __name__ == "__main__": + main()